Message ID | 20190521140855.3957-5-lionel.g.landwerlin@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915: Vulkan performance query support | expand |
Quoting Lionel Landwerlin (2019-05-21 15:08:54) > + if (eb->oa_config && > + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { But the oa_config is not ordered with respect to requests, and the registers changed here are not context saved and so may be changed by a third party before execution. The first party must presumably dropped the perf_fd before then, so maybe you don't care? Hmm, doesn't even take a third party as the perf_fd owner may change their mind for different contexts and be surprised when the wrong set is used. > + struct i915_vma *oa_vma; > + > + oa_vma = i915_vma_instance(eb->oa_bo, > + &eb->engine->i915->ggtt.vm, NULL); > + if (unlikely(IS_ERR(oa_vma))) { > + err = PTR_ERR(oa_vma); > + return err; > + } > + > + err = i915_vma_pin(oa_vma, 0, 0, PIN_GLOBAL); > + if (err) > + return err; > + > + err = eb->engine->emit_bb_start(eb->request, > + oa_vma->node.start, > + 0, I915_DISPATCH_SECURE); > + if (err) { > + i915_vma_unpin(oa_vma); > + return err; > + } > + > + err = i915_vma_move_to_active(oa_vma, eb->request, 0); Move to active first, so that the vma is not in use if the move fails. > + if (err) { > + i915_vma_unpin(oa_vma); > + return err; > + } > + > + > + i915_vma_unpin(oa_vma); > + > + > + swap(eb->oa_config, eb->i915->perf.oa.exclusive_stream->oa_config); > + } > + > err = eb->engine->emit_bb_start(eb->request, > eb->batch->node.start + > eb->batch_start_offset, > @@ -2341,6 +2410,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, > eb.buffer_count = args->buffer_count; > eb.batch_start_offset = args->batch_start_offset; > eb.batch_len = args->batch_len; > + eb.oa_config = NULL; > > eb.batch_flags = 0; > if (args->flags & I915_EXEC_SECURE) { > @@ -2385,17 +2455,29 @@ i915_gem_do_execbuffer(struct drm_device *dev, > */ > intel_gt_pm_get(eb.i915); > > - err = i915_mutex_lock_interruptible(dev); > - if (err) > - goto err_rpm; > - > err = eb_select_engine(&eb, file, args); Lost the lock. 
> if (unlikely(err)) > - goto err_unlock; > + goto err_rpm; > + > + if (args->flags & I915_EXEC_PERF_CONFIG) { > + if (!intel_engine_has_oa(eb.engine)) { > + err = -ENODEV; > + goto err_engine; > + } > + > + err = get_execbuf_oa_config(eb.i915, args->DR1, args->DR4, > + &eb.oa_config, &eb.oa_bo); > + if (err) > + goto err_engine; > + } > + > + err = i915_mutex_lock_interruptible(dev); > + if (err) > + goto err_oa; > > err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */ > if (unlikely(err)) > - goto err_engine; > + goto err_unlock; > > err = eb_relocate(&eb); > if (err) { > @@ -2541,10 +2623,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, > err_vma: > if (eb.exec) > eb_release_vmas(&eb); > -err_engine: > - eb_unpin_context(&eb); > err_unlock: > mutex_unlock(&dev->struct_mutex); > +err_oa: > + if (eb.oa_config) { > + i915_gem_object_put(eb.oa_bo); > + i915_oa_config_put(eb.oa_config); > + } > +err_engine: > + eb_unpin_context(&eb); Lost the lock. > err_rpm: > intel_gt_pm_put(eb.i915); > i915_gem_context_put(eb.gem_context);
On 21/05/2019 18:07, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-05-21 15:08:54) >> + if (eb->oa_config && >> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { > But the oa_config is not ordered with respect to requests, and the > registers changed here are not context saved and so may be changed by a > third party before execution. The first party must presumably dropped > the perf_fd before then, so maybe you don't care? Hmm, doesn't even take > a third party as the perf_fd owner may change their mind for different > contexts and be surprised when the wrong set is used. The OA config batch should be ordered with regard to the MI_BBS we're doing just below right? It's written before in the ring buffer. That essentially all we need so that as the perf queries run in the batch supplied by the application runs with the configuration needed. If the application shares the perf FD and someone else runs another configuration, it's the application fault. It needs to hold the perf FD for as long as it's doing perf queries and not allow anybody else to interact with the OA configuration. This mechanism is unfortunately what we have resolve to because we don't have per context performance counters. The alternative is post processing the OA buffer (which we do in GL) from the CPU which is not really compatible with Vulkan queries. -Lionel > >> + struct i915_vma *oa_vma; >> + >> + oa_vma = i915_vma_instance(eb->oa_bo, >> + &eb->engine->i915->ggtt.vm, NULL); >> + if (unlikely(IS_ERR(oa_vma))) { >> + err = PTR_ERR(oa_vma); >> + return err; >> + } >> + >> + err = i915_vma_pin(oa_vma, 0, 0, PIN_GLOBAL); >> + if (err) >> + return err; >> + >> + err = eb->engine->emit_bb_start(eb->request, >> + oa_vma->node.start, >> + 0, I915_DISPATCH_SECURE); >> + if (err) { >> + i915_vma_unpin(oa_vma); >> + return err; >> + } >> + >> + err = i915_vma_move_to_active(oa_vma, eb->request, 0); > Move to active first, so that the vma is not in use if the move fails. 
> >> + if (err) { >> + i915_vma_unpin(oa_vma); >> + return err; >> + } >> + >> + >> + i915_vma_unpin(oa_vma); >> + >> + >> + swap(eb->oa_config, eb->i915->perf.oa.exclusive_stream->oa_config); >> + } >> + >> err = eb->engine->emit_bb_start(eb->request, >> eb->batch->node.start + >> eb->batch_start_offset, >> @@ -2341,6 +2410,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> eb.buffer_count = args->buffer_count; >> eb.batch_start_offset = args->batch_start_offset; >> eb.batch_len = args->batch_len; >> + eb.oa_config = NULL; >> >> eb.batch_flags = 0; >> if (args->flags & I915_EXEC_SECURE) { >> @@ -2385,17 +2455,29 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> */ >> intel_gt_pm_get(eb.i915); >> >> - err = i915_mutex_lock_interruptible(dev); >> - if (err) >> - goto err_rpm; >> - >> err = eb_select_engine(&eb, file, args); > Lost the lock. Whoops... I'll split the engine_has_oa() check away. Thanks, -Lionel > >> if (unlikely(err)) >> - goto err_unlock; >> + goto err_rpm; >> + >> + if (args->flags & I915_EXEC_PERF_CONFIG) { >> + if (!intel_engine_has_oa(eb.engine)) { >> + err = -ENODEV; >> + goto err_engine; >> + } >> + >> + err = get_execbuf_oa_config(eb.i915, args->DR1, args->DR4, >> + &eb.oa_config, &eb.oa_bo); >> + if (err) >> + goto err_engine; >> + } >> + >> + err = i915_mutex_lock_interruptible(dev); >> + if (err) >> + goto err_oa; >> >> err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */ >> if (unlikely(err)) >> - goto err_engine; >> + goto err_unlock; >> >> err = eb_relocate(&eb); >> if (err) { >> @@ -2541,10 +2623,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> err_vma: >> if (eb.exec) >> eb_release_vmas(&eb); >> -err_engine: >> - eb_unpin_context(&eb); >> err_unlock: >> mutex_unlock(&dev->struct_mutex); >> +err_oa: >> + if (eb.oa_config) { >> + i915_gem_object_put(eb.oa_bo); >> + i915_oa_config_put(eb.oa_config); >> + } >> +err_engine: >> + eb_unpin_context(&eb); > Lost the lock. 
> >> err_rpm: >> intel_gt_pm_put(eb.i915); >> i915_gem_context_put(eb.gem_context);
Quoting Lionel Landwerlin (2019-05-21 18:19:50) > On 21/05/2019 18:07, Chris Wilson wrote: > > Quoting Lionel Landwerlin (2019-05-21 15:08:54) > >> + if (eb->oa_config && > >> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { > > But the oa_config is not ordered with respect to requests, and the > > registers changed here are not context saved and so may be changed by a > > third party before execution. The first party must presumably dropped > > the perf_fd before then, so maybe you don't care? Hmm, doesn't even take > > a third party as the perf_fd owner may change their mind for different > > contexts and be surprised when the wrong set is used. > > > The OA config batch should be ordered with regard to the MI_BBS we're > doing just below right? But you only emit if the oa_config has changed. Ergo, it may have changed between submission and execution. > It's written before in the ring buffer. > > > That essentially all we need so that as the perf queries run in the > batch supplied by the application runs with the configuration needed. > > If the application shares the perf FD and someone else runs another > configuration, it's the application fault. > > It needs to hold the perf FD for as long as it's doing perf queries and > not allow anybody else to interact with the OA configuration. If one app is trying to use different configs on different contexts (which seems reasonable if it is trying to sample different stats?) then it can be caught out. The order of execution is not the same as the order of submission (unless we enforce it by e.g. defining the perf.oa_config as a barrier). Another way would be to unconditionally emit the BB_START for the oa_vma, and instead do the early exit with a MI_CONDITIONAL_BB_END by comparing against a value stashed in the engine hwsp. You could do a predicated BB_START instead, but that looks to be more of a nuisance. -Chris
On 21/05/2019 18:48, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-05-21 18:19:50) >> On 21/05/2019 18:07, Chris Wilson wrote: >>> Quoting Lionel Landwerlin (2019-05-21 15:08:54) >>>> + if (eb->oa_config && >>>> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { >>> But the oa_config is not ordered with respect to requests, and the >>> registers changed here are not context saved and so may be changed by a >>> third party before execution. The first party must presumably dropped >>> the perf_fd before then, so maybe you don't care? Hmm, doesn't even take >>> a third party as the perf_fd owner may change their mind for different >>> contexts and be surprised when the wrong set is used. >> >> The OA config batch should be ordered with regard to the MI_BBS we're >> doing just below right? > But you only emit if the oa_config has changed. Ergo, it may have > changed between submission and execution. > >> It's written before in the ring buffer. >> >> >> That essentially all we need so that as the perf queries run in the >> batch supplied by the application runs with the configuration needed. >> >> If the application shares the perf FD and someone else runs another >> configuration, it's the application fault. >> >> It needs to hold the perf FD for as long as it's doing perf queries and >> not allow anybody else to interact with the OA configuration. > If one app is trying to use different configs on different contexts > (which seems reasonable if it is trying to sample different stats?) then > it can be caught out. The order of execution is not the same as the > order of submission (unless we enforce it by e.g. defining the > perf.oa_config as a barrier). Thanks, I think I see the problem. It's pretty much the same as the sseu reconfiguration. 
Looking at the code it seems that the barrier is gone for sseu and I'm afraid that sounds like what's needed here :( -Lionel > > > Another way would be to unconditionally emit the BB_START for the > oa_vma, and instead do the early exit with a MI_CONDITIONAL_BB_END by > comparing against a value stashed in the engine hwsp. You could do a > predicated BB_START instead, but that looks to be more of a nuisance. > -Chris >
On 21/05/2019 18:48, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-05-21 18:19:50) >> On 21/05/2019 18:07, Chris Wilson wrote: >>> Quoting Lionel Landwerlin (2019-05-21 15:08:54) >>>> + if (eb->oa_config && >>>> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { >>> But the oa_config is not ordered with respect to requests, and the >>> registers changed here are not context saved and so may be changed by a >>> third party before execution. The first party must presumably dropped >>> the perf_fd before then, so maybe you don't care? Hmm, doesn't even take >>> a third party as the perf_fd owner may change their mind for different >>> contexts and be surprised when the wrong set is used. >> >> The OA config batch should be ordered with regard to the MI_BBS we're >> doing just below right? > But you only emit if the oa_config has changed. Ergo, it may have > changed between submission and execution. > >> It's written before in the ring buffer. >> >> >> That essentially all we need so that as the perf queries run in the >> batch supplied by the application runs with the configuration needed. >> >> If the application shares the perf FD and someone else runs another >> configuration, it's the application fault. >> >> It needs to hold the perf FD for as long as it's doing perf queries and >> not allow anybody else to interact with the OA configuration. > If one app is trying to use different configs on different contexts > (which seems reasonable if it is trying to sample different stats?) then > it can be caught out. The order of execution is not the same as the > order of submission (unless we enforce it by e.g. defining the > perf.oa_config as a barrier). Thinking about this a bit more, the use case here was always that the app is the single user of the OA unit. In this scenario, the app is doing queries and should not share the configuration of the OA HW with anybody else. 
So all the sampling should be ordered with regard to the context's timeline. -Lionel > > > Another way would be to unconditionally emit the BB_START for the > oa_vma, and instead do the early exit with a MI_CONDITIONAL_BB_END by > comparing against a value stashed in the engine hwsp. You could do a > predicated BB_START instead, but that looks to be more of a nuisance. > -Chris >
Quoting Lionel Landwerlin (2019-05-22 10:19:46) > On 21/05/2019 18:48, Chris Wilson wrote: > > Quoting Lionel Landwerlin (2019-05-21 18:19:50) > >> On 21/05/2019 18:07, Chris Wilson wrote: > >>> Quoting Lionel Landwerlin (2019-05-21 15:08:54) > >>>> + if (eb->oa_config && > >>>> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { > >>> But the oa_config is not ordered with respect to requests, and the > >>> registers changed here are not context saved and so may be changed by a > >>> third party before execution. The first party must presumably dropped > >>> the perf_fd before then, so maybe you don't care? Hmm, doesn't even take > >>> a third party as the perf_fd owner may change their mind for different > >>> contexts and be surprised when the wrong set is used. > >> > >> The OA config batch should be ordered with regard to the MI_BBS we're > >> doing just below right? > > But you only emit if the oa_config has changed. Ergo, it may have > > changed between submission and execution. > > > >> It's written before in the ring buffer. > >> > >> > >> That essentially all we need so that as the perf queries run in the > >> batch supplied by the application runs with the configuration needed. > >> > >> If the application shares the perf FD and someone else runs another > >> configuration, it's the application fault. > >> > >> It needs to hold the perf FD for as long as it's doing perf queries and > >> not allow anybody else to interact with the OA configuration. > > If one app is trying to use different configs on different contexts > > (which seems reasonable if it is trying to sample different stats?) then > > it can be caught out. The order of execution is not the same as the > > order of submission (unless we enforce it by e.g. defining the > > perf.oa_config as a barrier). > > > Thinking about this a bit more, the use case here was always that the > app is the single user of the OA unit. 
> > In this scenario, the app is doing queries and should not share the > configuration of the OA HW with anybody else. What about with itself? And does that excuse the kernel carrying a TOCTOU bug? -Chris
On 22/05/2019 10:25, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-05-22 10:19:46) >> On 21/05/2019 18:48, Chris Wilson wrote: >>> Quoting Lionel Landwerlin (2019-05-21 18:19:50) >>>> On 21/05/2019 18:07, Chris Wilson wrote: >>>>> Quoting Lionel Landwerlin (2019-05-21 15:08:54) >>>>>> + if (eb->oa_config && >>>>>> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { >>>>> But the oa_config is not ordered with respect to requests, and the >>>>> registers changed here are not context saved and so may be changed by a >>>>> third party before execution. The first party must presumably dropped >>>>> the perf_fd before then, so maybe you don't care? Hmm, doesn't even take >>>>> a third party as the perf_fd owner may change their mind for different >>>>> contexts and be surprised when the wrong set is used. >>>> The OA config batch should be ordered with regard to the MI_BBS we're >>>> doing just below right? >>> But you only emit if the oa_config has changed. Ergo, it may have >>> changed between submission and execution. >>> >>>> It's written before in the ring buffer. >>>> >>>> >>>> That essentially all we need so that as the perf queries run in the >>>> batch supplied by the application runs with the configuration needed. >>>> >>>> If the application shares the perf FD and someone else runs another >>>> configuration, it's the application fault. >>>> >>>> It needs to hold the perf FD for as long as it's doing perf queries and >>>> not allow anybody else to interact with the OA configuration. >>> If one app is trying to use different configs on different contexts >>> (which seems reasonable if it is trying to sample different stats?) then >>> it can be caught out. The order of execution is not the same as the >>> order of submission (unless we enforce it by e.g. defining the >>> perf.oa_config as a barrier). >> >> Thinking about this a bit more, the use case here was always that the >> app is the single user of the OA unit. 
>> >> In this scenario, the app is doing queries and should not share the >> configuration of the OA HW with anybody else. > What about with itself? And does that excuse the kernel carrying a > TOCTOU bug? > -Chris > You mean with something like Iris that uses 2 contexts? I'm assuming things are properly synchronized. There is also another problem with the 2 contexts which is that we only allow filtering a single ID at the moment... Sorry, I'm not familiar with the TOCTOU bug :( -Lionel
Quoting Lionel Landwerlin (2019-05-21 15:08:54) > @@ -2048,6 +2081,42 @@ static int eb_submit(struct i915_execbuffer *eb) > return err; > } if (eb->oa_config) { err = i915_active_request_set(&eb->i915->perf.oa.oa_config_active, eb->request); if (err) return err; } with the addition of struct i915_active_request oa_config_active; to i915->perf.oa, and i915_active_init; That will ensure that the oa_config can't be changed before execution (and the ordering restriction is essentially a no-op if only one context has a specified oa_config). > + if (eb->oa_config && > + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { Fwiw, I would move these to eb_oa_config(). if (eb->oa_config) { err = eb_oa_config(eb); if (err) return err; } How does eb_oa_config mix with the global gen8_configure_all_contexts()? -Chris
On 28/05/2019 11:52, Chris Wilson wrote: > Quoting Lionel Landwerlin (2019-05-21 15:08:54) >> @@ -2048,6 +2081,42 @@ static int eb_submit(struct i915_execbuffer *eb) >> return err; >> } > if (eb->oa_config) { > err = i915_active_request_set(&eb->i915->perf.oa.oa_config_active, > eb->request); > if (err) > return err; > } > > with the addition of > struct i915_active_request oa_config_active; > to i915->perf.oa, and i915_active_init; That will ensure that the > oa_config can't be changed before execution (and the ordering restriction > is essentially a no-op if only one context has a specified oa_config). > >> + if (eb->oa_config && >> + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { > Fwiw, I would move these to eb_oa_config(). > > if (eb->oa_config) { > err = eb_oa_config(eb); > if (err) > return err; > } > > How does eb_oa_config mix with the global gen8_configure_all_contexts()? Excellent point, I should document this. HW configurations have roughly 3 parts : - NOA configuration, network configuration to source specific data from anywhere (global, non power saved/restored) - Boolean counters, filters signals brought by NOA (global, can't remember whether saved/restored) - Flex counters, filters on events happening within the EUs (per context, saved/restored) First two will affect all running contexts (because global), last one will only be applied to the context that triggered the execbuf. That should be fine because of the other requirement we need (don't preempt the context running a performance query). -Lionel > -Chris >
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index e381c1c73902..766fbbede430 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -445,6 +445,7 @@ struct intel_engine_cs { #define I915_ENGINE_HAS_PREEMPTION BIT(2) #define I915_ENGINE_HAS_SEMAPHORES BIT(3) #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4) +#define I915_ENGINE_HAS_OA BIT(5) unsigned int flags; /* @@ -534,6 +535,12 @@ intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET; } +static inline bool +intel_engine_has_oa(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_HAS_OA; +} + #define instdone_slice_mask(dev_priv__) \ (IS_GEN(dev_priv__, 7) ? \ 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 2ad95977f7a8..cad6fca4ba0f 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2395,6 +2395,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) engine->init_context = gen8_init_rcs_context; engine->emit_flush = gen8_emit_flush_render; engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; + engine->flags |= I915_ENGINE_HAS_OA; } return 0; diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c index f0d60affdba3..dc85a3e474b9 100644 --- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c @@ -2210,8 +2210,10 @@ static void setup_rcs(struct intel_engine_cs *engine) engine->irq_enable_mask = I915_USER_INTERRUPT; } - if (IS_HASWELL(i915)) + if (IS_HASWELL(i915)) { engine->emit_bb_start = hsw_emit_bb_start; + engine->flags |= I915_ENGINE_HAS_OA; + } engine->resume = rcs_resume; } diff --git a/drivers/gpu/drm/i915/i915_drv.c 
b/drivers/gpu/drm/i915/i915_drv.c index 5871e0cfbab0..6d9a15642342 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -472,6 +472,10 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data, case I915_PARAM_PERF_REVISION: value = 2; break; + case I915_PARAM_HAS_EXEC_PERF_CONFIG: + /* Obviously requires perf support. */ + value = dev_priv->perf.initialized; + break; default: DRM_DEBUG("Unknown parameter %d\n", param->param); return -EINVAL; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index abd564bfa03b..25860d99ffc6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3154,6 +3154,7 @@ int i915_perf_get_oa_config(struct drm_i915_private *i915, int metrics_set, struct i915_oa_config **out_config, struct drm_i915_gem_object **out_obj); +void i915_oa_config_put(struct i915_oa_config *oa_config); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 361c232dde83..3794c6ce71e3 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -288,6 +288,9 @@ struct i915_execbuffer { */ int lut_size; struct hlist_head *buckets; /** ht for relocation handles */ + + struct i915_oa_config *oa_config; /** HW configuration for OA, NULL is not needed. 
*/ + struct drm_i915_gem_object *oa_bo; }; #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags]) @@ -1183,6 +1186,33 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) *addr = value; } +static int +get_execbuf_oa_config(struct drm_i915_private *dev_priv, + int perf_fd, u32 oa_config_id, + struct i915_oa_config **out_oa_config, + struct drm_i915_gem_object **out_oa_obj) +{ + struct file *perf_file; + int ret; + + if (!dev_priv->perf.oa.exclusive_stream) + return -EINVAL; + + perf_file = fget(perf_fd); + if (!perf_file) + return -EINVAL; + + if (perf_file->private_data != dev_priv->perf.oa.exclusive_stream) + return -EINVAL; + + fput(perf_file); + + ret = i915_perf_get_oa_config(dev_priv, oa_config_id, + out_oa_config, out_oa_obj); + + return ret; +} + static int __reloc_gpu_alloc(struct i915_execbuffer *eb, struct i915_vma *vma, unsigned int len) @@ -1937,12 +1967,15 @@ static bool i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) return false; } - if (exec->DR4 == 0xffffffff) { - DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); - exec->DR4 = 0; + /* We reuse DR1 & DR4 fields for passing the perf config detail. 
*/ + if (!(exec->flags & I915_EXEC_PERF_CONFIG)) { + if (exec->DR4 == 0xffffffff) { + DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); + exec->DR4 = 0; + } + if (exec->DR1 || exec->DR4) + return false; } - if (exec->DR1 || exec->DR4) - return false; if ((exec->batch_start_offset | exec->batch_len) & 0x7) return false; @@ -2048,6 +2081,42 @@ static int eb_submit(struct i915_execbuffer *eb) return err; } + if (eb->oa_config && + eb->oa_config != eb->i915->perf.oa.exclusive_stream->oa_config) { + struct i915_vma *oa_vma; + + oa_vma = i915_vma_instance(eb->oa_bo, + &eb->engine->i915->ggtt.vm, NULL); + if (unlikely(IS_ERR(oa_vma))) { + err = PTR_ERR(oa_vma); + return err; + } + + err = i915_vma_pin(oa_vma, 0, 0, PIN_GLOBAL); + if (err) + return err; + + err = eb->engine->emit_bb_start(eb->request, + oa_vma->node.start, + 0, I915_DISPATCH_SECURE); + if (err) { + i915_vma_unpin(oa_vma); + return err; + } + + err = i915_vma_move_to_active(oa_vma, eb->request, 0); + if (err) { + i915_vma_unpin(oa_vma); + return err; + } + + + i915_vma_unpin(oa_vma); + + + swap(eb->oa_config, eb->i915->perf.oa.exclusive_stream->oa_config); + } + err = eb->engine->emit_bb_start(eb->request, eb->batch->node.start + eb->batch_start_offset, @@ -2341,6 +2410,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.buffer_count = args->buffer_count; eb.batch_start_offset = args->batch_start_offset; eb.batch_len = args->batch_len; + eb.oa_config = NULL; eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { @@ -2385,17 +2455,29 @@ i915_gem_do_execbuffer(struct drm_device *dev, */ intel_gt_pm_get(eb.i915); - err = i915_mutex_lock_interruptible(dev); - if (err) - goto err_rpm; - err = eb_select_engine(&eb, file, args); if (unlikely(err)) - goto err_unlock; + goto err_rpm; + + if (args->flags & I915_EXEC_PERF_CONFIG) { + if (!intel_engine_has_oa(eb.engine)) { + err = -ENODEV; + goto err_engine; + } + + err = get_execbuf_oa_config(eb.i915, args->DR1, args->DR4, + &eb.oa_config, &eb.oa_bo); + if 
(err) + goto err_engine; + } + + err = i915_mutex_lock_interruptible(dev); + if (err) + goto err_oa; err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */ if (unlikely(err)) - goto err_engine; + goto err_unlock; err = eb_relocate(&eb); if (err) { @@ -2541,10 +2623,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, err_vma: if (eb.exec) eb_release_vmas(&eb); -err_engine: - eb_unpin_context(&eb); err_unlock: mutex_unlock(&dev->struct_mutex); +err_oa: + if (eb.oa_config) { + i915_gem_object_put(eb.oa_bo); + i915_oa_config_put(eb.oa_config); + } +err_engine: + eb_unpin_context(&eb); err_rpm: intel_gt_pm_put(eb.i915); i915_gem_context_put(eb.gem_context); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 7e0ebd4bc8f2..7b861f12f161 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -365,7 +365,7 @@ struct perf_open_properties { int oa_period_exponent; }; -static void put_oa_config(struct i915_oa_config *oa_config) +void i915_oa_config_put(struct i915_oa_config *oa_config) { if (!atomic_dec_and_test(&oa_config->ref_count)) return; @@ -515,7 +515,7 @@ int i915_perf_get_oa_config(struct drm_i915_private *i915, err_buf_alloc: if (out_config) { - put_oa_config(oa_config); + i915_oa_config_put(oa_config); *out_config = NULL; } unlock: @@ -1496,7 +1496,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) if (stream->ctx) oa_put_render_ctx_id(stream); - put_oa_config(stream->oa_config); + i915_oa_config_put(stream->oa_config); if (dev_priv->perf.oa.spurious_report_rs.missed) { DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n", @@ -2264,7 +2264,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, free_oa_buffer(dev_priv); err_oa_buf_alloc: - put_oa_config(stream->oa_config); + i915_oa_config_put(stream->oa_config); intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(dev_priv, stream->wakeref); @@ 
-3441,7 +3441,7 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data, sysfs_err: mutex_unlock(&dev_priv->perf.metrics_lock); reg_err: - put_oa_config(oa_config); + i915_oa_config_put(oa_config); DRM_DEBUG("Failed to add new OA config\n"); return err; } @@ -3495,7 +3495,7 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data, DRM_DEBUG("Removed config %s id=%i\n", oa_config->uuid, oa_config->id); - put_oa_config(oa_config); + i915_oa_config_put(oa_config); config_err: mutex_unlock(&dev_priv->perf.metrics_lock); @@ -3657,7 +3657,7 @@ static int destroy_config(int id, void *p, void *data) { struct i915_oa_config *oa_config = p; - put_oa_config(oa_config); + i915_oa_config_put(oa_config); return 0; } diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 5601dc688295..e57fb5f249da 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -604,6 +604,16 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_PERF_REVISION 53 +/* + * Request an OA performance configuration change before running the commands + * given in an execbuf. + * + * Performance configuration ID is given in the DR4 field of + * drm_i915_gem_execbuffer2 and the file descriptor of the i915 perf stream is + * given in DR1. Execbuffer will fail if any of these parameter is invalid. + */ +#define I915_PARAM_HAS_EXEC_PERF_CONFIG 54 + /* Must be kept compact -- no holes and well documented */ typedef struct drm_i915_getparam { @@ -1126,7 +1136,15 @@ struct drm_i915_gem_execbuffer2 { */ #define I915_EXEC_FENCE_ARRAY (1<<19) -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1)) +/* Request that perf monitoring hardware be reprogrammed before executing the + * commands from the batch in the execbuf. The DR1 & DR4 fields of the execbuf + * must respectively contain the file descriptor of the perf monitoring device + * and the configuration to program. 
+ */ +#define I915_EXEC_PERF_CONFIG (1<<20) + + +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_PERF_CONFIG<<1)) #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \
We want the ability to dispatch a set of command buffers to the hardware, each with a different OA configuration. To achieve this, we reuse a couple of fields from the execbuf2 struct (I CAN HAZ execbuf3?) to notify what OA configuration should be used for a batch buffer. This requires the process making the execbuf with this flag to also own the perf fd at the time of execbuf. v2: Add an emit_oa_config() vfunc in the intel_engine_cs (Chris) Move oa_config vma to active (Chris) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 7 ++ drivers/gpu/drm/i915/gt/intel_lrc.c | 1 + drivers/gpu/drm/i915/gt/intel_ringbuffer.c | 4 +- drivers/gpu/drm/i915/i915_drv.c | 4 + drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gem_execbuffer.c | 113 ++++++++++++++++--- drivers/gpu/drm/i915/i915_perf.c | 14 +-- include/uapi/drm/i915_drm.h | 20 +++- 8 files changed, 142 insertions(+), 22 deletions(-)