| Message ID | 20190627080045.8814-10-lionel.g.landwerlin@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | drm/i915: Vulkan performance query support |
Quoting Lionel Landwerlin (2019-06-27 09:00:44)
> We haven't run into issues with writing the global OA/NOA register
> configuration from the CPU so far, but HW engineers actually
> recommend doing this from the command streamer.
>
> Since we have a command buffer prepared for the execbuffer side of
> things, we can reuse that approach here too.
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
>  1 file changed, 100 insertions(+), 103 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index bf4f5fee6764..7e636463e1f5 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
>  	kfree(oa_config);
>  }
>
> +static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
> +{
> +	struct i915_oa_config *oa_config, *next;
> +
> +	mutex_lock(&i915->perf.metrics_lock);
> +	list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
> +		list_del(&oa_config->vma_link);
> +		i915_gem_object_put(oa_config->obj);
> +		oa_config->obj = NULL;
> +	}
> +	mutex_unlock(&i915->perf.metrics_lock);
> +}
> +
>  static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
>  {
>  	u32 i;
> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
>  	return ret;
>  }
>
> -static void config_oa_regs(struct drm_i915_private *dev_priv,
> -			   const struct i915_oa_reg *regs,
> -			   u32 n_regs)
> +static int config_oa_regs(struct drm_i915_private *i915,
> +			  struct i915_oa_config *oa_config)
>  {
> -	u32 i;
> +	struct i915_request *rq;
> +	struct i915_vma *vma;
> +	long timeout;
> +	u32 *cs;
> +	int err;
>
> -	for (i = 0; i < n_regs; i++) {
> -		const struct i915_oa_reg *reg = regs + i;
> +	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
>
> -		I915_WRITE(reg->addr, reg->value);
> +	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
> +				      rq);

This will need an explicit mutex as it is not serialised by any single
timeline.

i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.

> +	if (err) {
> +		i915_request_add(rq);
> +		return err;
> +	}
> +
> +	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
> +	if (unlikely(IS_ERR(vma))) {
> +		i915_request_add(rq);
> +		return PTR_ERR(vma);
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> +	if (err) {
> +		i915_request_add(rq);
> +		return err;
>  	}

This time there is no excuse for not pinning outside of the
timeline->mutex.

> +
> +	err = i915_vma_move_to_active(vma, rq, 0);
> +	if (err) {
> +		i915_vma_unpin(vma);
> +		i915_request_add(rq);
> +		return err;
> +	}
> +
> +	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
> +	if (IS_ERR(cs)) {
> +		i915_vma_unpin(vma);
> +		i915_request_add(rq);
> +		return PTR_ERR(cs);
> +	}
> +
> +	if (INTEL_GEN(i915) > 8) {
> +		*cs++ = MI_BATCH_BUFFER_START_GEN8;
> +		*cs++ = lower_32_bits(vma->node.start);
> +		*cs++ = upper_32_bits(vma->node.start);
> +		*cs++ = MI_NOOP;
> +	} else {
> +		*cs++ = MI_BATCH_BUFFER_START;
> +		*cs++ = vma->node.start;
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +
> +	i915_vma_unpin(vma);
> +
> +	i915_request_add(rq);
> +
> +	i915_request_get(rq);

Strictly, before the add, as the add is the xfer of the ref.

> +	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> +				    MAX_SCHEDULE_TIMEOUT);
> +	i915_request_put(rq);
> +
> +	return timeout < 0 ? err : 0;

Report err, but timeline carries the errno?
-Chris
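Put together, the two remarks above suggest the following tail for
config_oa_regs(). This is a minimal sketch, not the posted patch: it
reuses only the request API already present in the series, and it drops
I915_WAIT_LOCKED, which Chris notes further down in the thread is no
longer used.

	/*
	 * Sketch: i915_request_add() transfers the creation reference to
	 * the timeline, so take the reference we wait on *before* adding,
	 * and report the errno carried by the wait result, not 'err'.
	 */
	i915_request_get(rq);
	i915_request_add(rq);

	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
				    MAX_SCHEDULE_TIMEOUT);
	i915_request_put(rq);

	return timeout < 0 ? timeout : 0;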
On 27/06/2019 13:02, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-06-27 09:00:44)
>> We haven't run into issues with writing the global OA/NOA register
>> configuration from the CPU so far, but HW engineers actually
>> recommend doing this from the command streamer.
>>
>> Since we have a command buffer prepared for the execbuffer side of
>> things, we can reuse that approach here too.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
>>  1 file changed, 100 insertions(+), 103 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index bf4f5fee6764..7e636463e1f5 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
>>  	kfree(oa_config);
>>  }
>>
>> +static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
>> +{
>> +	struct i915_oa_config *oa_config, *next;
>> +
>> +	mutex_lock(&i915->perf.metrics_lock);
>> +	list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
>> +		list_del(&oa_config->vma_link);
>> +		i915_gem_object_put(oa_config->obj);
>> +		oa_config->obj = NULL;
>> +	}
>> +	mutex_unlock(&i915->perf.metrics_lock);
>> +}
>> +
>>  static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
>>  {
>>  	u32 i;
>> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
>>  	return ret;
>>  }
>>
>> -static void config_oa_regs(struct drm_i915_private *dev_priv,
>> -			   const struct i915_oa_reg *regs,
>> -			   u32 n_regs)
>> +static int config_oa_regs(struct drm_i915_private *i915,
>> +			  struct i915_oa_config *oa_config)
>>  {
>> -	u32 i;
>> +	struct i915_request *rq;
>> +	struct i915_vma *vma;
>> +	long timeout;
>> +	u32 *cs;
>> +	int err;
>>
>> -	for (i = 0; i < n_regs; i++) {
>> -		const struct i915_oa_reg *reg = regs + i;
>> +	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
>> +	if (IS_ERR(rq))
>> +		return PTR_ERR(rq);
>>
>> -		I915_WRITE(reg->addr, reg->value);
>> +	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
>> +				      rq);
> This will need an explicit mutex as it is not serialised by any single
> timeline.
>
> i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.

Not sure that I understand. This function is called under struct_mutex lock.

>
>> +	if (err) {
>> +		i915_request_add(rq);
>> +		return err;
>> +	}
>> +
>> +	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
>> +	if (unlikely(IS_ERR(vma))) {
>> +		i915_request_add(rq);
>> +		return PTR_ERR(vma);
>> +	}
>> +
>> +	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
>> +	if (err) {
>> +		i915_request_add(rq);
>> +		return err;
>>  	}
> This time there is no excuse for not pinning outside of the
> timeline->mutex.

Right, okay, but there is so much going on when turning on i915-perf,
draining + idling + context image edition.

Do we really need to optimize this in this patch?

After all this removes the cpu wait that was done under struct_mutex lock.

>
>> +
>> +	err = i915_vma_move_to_active(vma, rq, 0);
>> +	if (err) {
>> +		i915_vma_unpin(vma);
>> +		i915_request_add(rq);
>> +		return err;
>> +	}
>> +
>> +	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
>> +	if (IS_ERR(cs)) {
>> +		i915_vma_unpin(vma);
>> +		i915_request_add(rq);
>> +		return PTR_ERR(cs);
>> +	}
>> +
>> +	if (INTEL_GEN(i915) > 8) {
>> +		*cs++ = MI_BATCH_BUFFER_START_GEN8;
>> +		*cs++ = lower_32_bits(vma->node.start);
>> +		*cs++ = upper_32_bits(vma->node.start);
>> +		*cs++ = MI_NOOP;
>> +	} else {
>> +		*cs++ = MI_BATCH_BUFFER_START;
>> +		*cs++ = vma->node.start;
>> +	}
>> +
>> +	intel_ring_advance(rq, cs);
>> +
>> +	i915_vma_unpin(vma);
>> +
>> +	i915_request_add(rq);
>> +
>> +	i915_request_get(rq);
> Strictly, before the add, as the add is the xfer of the ref.

Oops.

>
>> +	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
>> +				    MAX_SCHEDULE_TIMEOUT);
>> +	i915_request_put(rq);
>> +
>> +	return timeout < 0 ? err : 0;
> Report err, but timeline carries the errno?

I meant to return timeout.

> -Chris
>
Quoting Lionel Landwerlin (2019-06-27 15:50:38)
> On 27/06/2019 13:02, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2019-06-27 09:00:44)
> >> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
> >>  	return ret;
> >>  }
> >>
> >> -static void config_oa_regs(struct drm_i915_private *dev_priv,
> >> -			   const struct i915_oa_reg *regs,
> >> -			   u32 n_regs)
> >> +static int config_oa_regs(struct drm_i915_private *i915,
> >> +			  struct i915_oa_config *oa_config)
> >>  {
> >> -	u32 i;
> >> +	struct i915_request *rq;
> >> +	struct i915_vma *vma;
> >> +	long timeout;
> >> +	u32 *cs;
> >> +	int err;
> >>
> >> -	for (i = 0; i < n_regs; i++) {
> >> -		const struct i915_oa_reg *reg = regs + i;
> >> +	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
> >> +	if (IS_ERR(rq))
> >> +		return PTR_ERR(rq);
> >>
> >> -		I915_WRITE(reg->addr, reg->value);
> >> +	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
> >> +				      rq);
> > This will need an explicit mutex as it is not serialised by any single
> > timeline.
> >
> > i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.
>
> Not sure that I understand. This function is called under struct_mutex lock.

The active_request is serialised by the timeline lock, which for about
the next 10 minutes struct_mutex remains in lieu of.

I want you to nominate an explicit lock for serialisation of the
last_oa_config barrier

> >
> >> +	if (err) {
> >> +		i915_request_add(rq);
> >> +		return err;
> >> +	}
> >> +
> >> +	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
> >> +	if (unlikely(IS_ERR(vma))) {
> >> +		i915_request_add(rq);
> >> +		return PTR_ERR(vma);
> >> +	}
> >> +
> >> +	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> >> +	if (err) {
> >> +		i915_request_add(rq);
> >> +		return err;
> >>  	}
> > This time there is no excuse for not pinning outside of the
> > timeline->mutex.
>
> Right, okay, but there is so much going on when turning on i915-perf,
> draining + idling + context image edition.
>
> Do we really need to optimize this in this patch?

It's not optimizing, it's about the lock nesting. To pin a vma, we may
ultimately need to emit requests, at which point trying to pin from
inside a request critical section is going to make lockdep rightfully
complain.

> After all this removes the cpu wait that was done under struct_mutex lock.

Puzzling, as all of this is under struct_mutex and may require waits :(

Fear not, removing struct_mutex here is easier than execbuf, so only 50
patches! So long as you haven't used struct_mutex for serialising perf :)

> >> +	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> >> +				    MAX_SCHEDULE_TIMEOUT);

One other thing, I915_WAIT_LOCKED is no longer used.
-Chris
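To make the lock-nesting point concrete: one possible reordering of
config_oa_regs() along the lines Chris asks for, as a hypothetical
sketch reusing only calls that appear in the patch (this is not the
code that was eventually merged):

static int config_oa_regs(struct drm_i915_private *i915,
			  struct i915_oa_config *oa_config)
{
	struct i915_request *rq;
	struct i915_vma *vma;
	int err;

	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/*
	 * Pin first: binding may evict or even emit requests, so it must
	 * not happen inside the request's timeline->mutex critical
	 * section below, or lockdep will rightfully complain.
	 */
	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
	if (err)
		return err;

	/* Only now open the request critical section. */
	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unpin;
	}

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err == 0) {
		/* ... emit MI_BATCH_BUFFER_START as in the patch ... */
	}

	i915_request_add(rq);
out_unpin:
	i915_vma_unpin(vma);
	return err;
}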
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index bf4f5fee6764..7e636463e1f5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
 	kfree(oa_config);
 }
 
+static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
+{
+	struct i915_oa_config *oa_config, *next;
+
+	mutex_lock(&i915->perf.metrics_lock);
+	list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
+		list_del(&oa_config->vma_link);
+		i915_gem_object_put(oa_config->obj);
+		oa_config->obj = NULL;
+	}
+	mutex_unlock(&i915->perf.metrics_lock);
+}
+
 static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
 {
 	u32 i;
@@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
 	return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-			   const struct i915_oa_reg *regs,
-			   u32 n_regs)
+static int config_oa_regs(struct drm_i915_private *i915,
+			  struct i915_oa_config *oa_config)
 {
-	u32 i;
+	struct i915_request *rq;
+	struct i915_vma *vma;
+	long timeout;
+	u32 *cs;
+	int err;
 
-	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 
-		I915_WRITE(reg->addr, reg->value);
+	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
+				      rq);
+	if (err) {
+		i915_request_add(rq);
+		return err;
+	}
+
+	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
+	if (unlikely(IS_ERR(vma))) {
+		i915_request_add(rq);
+		return PTR_ERR(vma);
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err) {
+		i915_request_add(rq);
+		return err;
 	}
+
+	err = i915_vma_move_to_active(vma, rq, 0);
+	if (err) {
+		i915_vma_unpin(vma);
+		i915_request_add(rq);
+		return err;
+	}
+
+	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
+	if (IS_ERR(cs)) {
+		i915_vma_unpin(vma);
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	if (INTEL_GEN(i915) > 8) {
+		*cs++ = MI_BATCH_BUFFER_START_GEN8;
+		*cs++ = lower_32_bits(vma->node.start);
+		*cs++ = upper_32_bits(vma->node.start);
+		*cs++ = MI_NOOP;
+	} else {
+		*cs++ = MI_BATCH_BUFFER_START;
+		*cs++ = vma->node.start;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	i915_vma_unpin(vma);
+
+	i915_request_add(rq);
+
+	i915_request_get(rq);
+	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
+				    MAX_SCHEDULE_TIMEOUT);
+	i915_request_put(rq);
+
+	return timeout < 0 ? err : 0;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 
-	/* PRM:
-	 *
-	 * OA unit is using “crclk” for its functionality. When trunk
-	 * level clock gating takes place, OA clock would be gated,
-	 * unable to count the events from non-render clock domain.
-	 * Render clock gating must be disabled when OA is enabled to
-	 * count the events from non-render domain. Unit level clock
-	 * gating for RCS should also be disabled.
-	 */
 	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
 				    ~GEN7_DOP_CLOCK_GATE_ENABLE));
 	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
 				  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -1978,7 +2010,6 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 {
 	unsigned int map_type = i915_coherent_map_type(dev_priv);
 	struct i915_gem_context *ctx;
-	struct i915_request *rq;
 	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
@@ -2037,14 +2068,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	}
 
 	/*
-	 * Apply the configuration by doing one context restore of the edited
-	 * context image.
+	 * The above configuration will be applied when called
+	 * config_oa_regs().
 	 */
-	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
-
-	i915_request_add(rq);
 
 	return 0;
 }
@@ -2093,35 +2119,7 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -2292,6 +2290,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct drm_i915_gem_object *obj;
 	int format_size;
 	int ret;
 
@@ -2376,13 +2375,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		}
 	}
 
-	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
-				      &stream->oa_config, NULL);
-	if (ret) {
-		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
-		goto err_config;
-	}
-
 	ret = alloc_noa_wait(dev_priv);
 	if (ret) {
 		DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
@@ -2412,6 +2404,19 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_lock;
 
+	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
+				      &stream->oa_config, &obj);
+	if (ret) {
+		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
+		goto err_config;
+	}
+
+	/*
+	 * We just need the buffer to be created, but not our own reference on
+	 * it as the oa_config already has one.
+	 */
+	i915_gem_object_put(obj);
+
 	stream->ops = &i915_oa_stream_ops;
 
 	dev_priv->perf.oa.exclusive_stream = stream;
@@ -2430,14 +2435,16 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 err_enable:
 	dev_priv->perf.oa.exclusive_stream = NULL;
 	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
+err_config:
+	i915_oa_config_put(stream->oa_config);
+	i915_oa_config_dispose_buffers(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_lock:
 	free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc:
-	i915_oa_config_put(stream->oa_config);
-
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
@@ -2446,9 +2453,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_noa_wait_alloc:
-	i915_oa_config_put(stream->oa_config);
-
-err_config:
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
@@ -2810,20 +2814,13 @@ static int i915_perf_release(struct inode *inode, struct file *file)
 {
 	struct i915_perf_stream *stream = file->private_data;
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct i915_oa_config *oa_config, *next;
 
 	mutex_lock(&dev_priv->perf.lock);
 	i915_perf_destroy_locked(stream);
 
 	/* Dispose of all oa config batch buffers. */
-	mutex_lock(&dev_priv->perf.metrics_lock);
-	list_for_each_entry_safe(oa_config, next, &dev_priv->perf.metrics_buffers, vma_link) {
-		list_del(&oa_config->vma_link);
-		i915_gem_object_put(oa_config->obj);
-		oa_config->obj = NULL;
-	}
-	mutex_unlock(&dev_priv->perf.metrics_lock);
+	i915_oa_config_dispose_buffers(dev_priv);
 
 	mutex_unlock(&dev_priv->perf.lock);
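The buffer that MI_BATCH_BUFFER_START jumps to is filled with
MI_LOAD_REGISTER_IMM packets by write_cs_mi_lri(), which only shows up
as a context line in this diff. A plausible shape of that helper, for
readers following along — MI_LOAD_REGISTER_IMM() and
i915_mmio_reg_offset() are existing i915 helpers, but the
126-registers-per-packet cap is an assumption based on the command's
length field, not something taken from this patch:

static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data,
			    u32 n_regs)
{
	u32 i;

	for (i = 0; i < n_regs; i++) {
		/* Open a new LRI packet every 126 register writes. */
		if ((i % 126) == 0) {
			u32 n_lri = min_t(u32, n_regs - i, 126);

			*cs++ = MI_LOAD_REGISTER_IMM(n_lri);
		}
		*cs++ = i915_mmio_reg_offset(reg_data[i].addr);
		*cs++ = reg_data[i].value;
	}

	return cs;
}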
We haven't run into issues with writing the global OA/NOA register
configuration from the CPU so far, but HW engineers actually
recommend doing this from the command streamer.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
 1 file changed, 100 insertions(+), 103 deletions(-)