Message ID | 20220823204155.8178-7-umesh.nerlige.ramappa@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add DG2 OA support | expand |
On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote: > Hi Umesh, > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > index 6fc4f0d8fc5a..bbf1c574f393 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header; > > static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer); > > +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head) nit: no space between * and stream. > +{ > + u32 size = stream->oa_buffer.vma->size; > + > + return tail >= head ? tail - head : size - (head - tail); > +} If we are doing this we should probably eliminate references to OA_TAKEN which serves an identical purpose (I think there is one remaining reference) and also delete OA_TAKEN #define. > + > +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail, > + u32 rewind_delta) > +{ > + return rewind_delta > relative_hw_tail ? > + stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) : > + relative_hw_tail - rewind_delta; > +} Also are we really saying here that we are supporting non-power-of-2 OA buffer sizes? Because if we stayed with power-of-2 sizes the expression above are nice and elegant and actually closer to the previous code being changed in this patch. For example: #include <linux/circ_buf.h> static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head) { return CIRC_CNT(tail, head, stream->oa_buffer.vma->size); } static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail, u32 rewind_delta) { return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size); } Note that for power-of-2 sizes the two functions above are identical but we should keep them separate for clarity (as is done in the patch) since they are serving two different functions in the OA code. Also another assumption in the code seems to be: stream->oa_buffer.vma->size == OA_BUFFER_SIZE which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer sizes? So we might as well stick with power-of-2 sizes and change later in a separate patch only if needed? Thanks. -- Ashutosh > + > void i915_oa_config_release(struct kref *ref) > { > struct i915_oa_config *oa_config = > @@ -487,12 +502,14 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > * sizes need not be integral multiples or 64 or powers of 2. > * Compute potentially partially landed report in the OA buffer > */ > - partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); > + partial_report_size = > + _oa_taken(stream, hw_tail, stream->oa_buffer.tail); > partial_report_size %= report_size; > > /* Subtract partial amount off the tail */ > - hw_tail = gtt_offset + ((hw_tail - partial_report_size) & > - (stream->oa_buffer.vma->size - 1)); > + hw_tail = gtt_offset + _rewind_tail(stream, > + hw_tail - gtt_offset, > + partial_report_size); > > now = ktime_get_mono_fast_ns(); > > @@ -527,16 +544,16 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > * memory in the order they were written to. > * If not : (╯°□°)╯︵ ┻━┻ > */ > - while (OA_TAKEN(tail, aged_tail) >= report_size) { > + while (_oa_taken(stream, tail, aged_tail) >= report_size) { > u32 *report32 = (void *)(stream->oa_buffer.vaddr + tail); > > if (report32[0] != 0 || report32[1] != 0) > break; > > - tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); > + tail = _rewind_tail(stream, tail, report_size); > } > > - if (OA_TAKEN(hw_tail, tail) > report_size && > + if (_oa_taken(stream, hw_tail, tail) > report_size && > __ratelimit(&stream->perf->tail_pointer_race)) > DRM_NOTE("unlanded report(s) head=0x%x " > "tail=0x%x hw_tail=0x%x\n", > @@ -547,8 +564,9 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > stream->oa_buffer.aging_timestamp = now; > } > > - pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset, > - stream->oa_buffer.head - gtt_offset) >= report_size; > + pollin = _oa_taken(stream, > + stream->oa_buffer.tail, > + stream->oa_buffer.head) >= report_size; > > spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); > > @@ -679,11 +697,9 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, > int report_size = stream->oa_buffer.format_size; > u8 *oa_buf_base = stream->oa_buffer.vaddr; > u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); > - u32 mask = (OA_BUFFER_SIZE - 1); > size_t start_offset = *offset; > unsigned long flags; > - u32 head, tail; > - u32 taken; > + u32 head, tail, size; > int ret = 0; > > if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled)) > @@ -693,6 +709,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, > > head = stream->oa_buffer.head; > tail = stream->oa_buffer.tail; > + size = stream->oa_buffer.vma->size; > > spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); > > @@ -711,16 +728,15 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, > * all a power of two). > */ > if (drm_WARN_ONCE(&uncore->i915->drm, > - head > stream->oa_buffer.vma->size || > - tail > stream->oa_buffer.vma->size, > + head > size || tail > size, > "Inconsistent OA buffer pointers: head = %u, tail = %u\n", > head, tail)) > return -EIO; > > > for (/* none */; > - (taken = OA_TAKEN(tail, head)); > - head = (head + report_size) & mask) { > + _oa_taken(stream, tail, head); > + head = (head + report_size) % size) { > u8 *report = oa_buf_base + head; > u32 *report32 = (void *)report; > u32 ctx_id; > -- > 2.25.1 >
On Wed, Sep 14, 2022 at 09:04:10AM -0700, Dixit, Ashutosh wrote: >On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote: >> > >Hi Umesh, > >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c >> index 6fc4f0d8fc5a..bbf1c574f393 100644 >> --- a/drivers/gpu/drm/i915/i915_perf.c >> +++ b/drivers/gpu/drm/i915/i915_perf.c >> @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header; >> >> static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer); >> >> +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head) > >nit: no space between * and stream. > >> +{ >> + u32 size = stream->oa_buffer.vma->size; >> + >> + return tail >= head ? tail - head : size - (head - tail); >> +} > >If we are doing this we should probably eliminate references to OA_TAKEN >which serves an identical purpose (I think there is one remaining >reference) and also delete OA_TAKEN #define. > >> + >> +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail, >> + u32 rewind_delta) >> +{ >> + return rewind_delta > relative_hw_tail ? >> + stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) : >> + relative_hw_tail - rewind_delta; >> +} > >Also are we really saying here that we are supporting non-power-of-2 OA >buffer sizes? Because if we stayed with power-of-2 sizes the expression >above are nice and elegant and actually closer to the previous code being >changed in this patch. For example: > >#include <linux/circ_buf.h> > >static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head) >{ > return CIRC_CNT(tail, head, stream->oa_buffer.vma->size); >} > >static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail, > u32 rewind_delta) >{ > return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size); >} > >Note that for power-of-2 sizes the two functions above are identical but we >should keep them separate for clarity (as is done in the patch) since they >are serving two different functions in the OA code. > >Also another assumption in the code seems to be: > > stream->oa_buffer.vma->size == OA_BUFFER_SIZE > >which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer >sizes? So we might as well stick with power-of-2 sizes and change later in >a separate patch only if needed? Most changes here are related to the OA buffer size issue and that is specific to xehpsdv where the size is not a power of 2. I am thinking of dropping these changes in the next revision since DG2 is fixed and OA buffer sizes are power of 2. Thanks, Umesh > >Thanks. >-- >Ashutosh
On Wed, 14 Sep 2022 11:19:30 -0700, Umesh Nerlige Ramappa wrote: > > On Wed, Sep 14, 2022 at 09:04:10AM -0700, Dixit, Ashutosh wrote: > > On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote: > >> > > > > Hi Umesh, > > > >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > >> index 6fc4f0d8fc5a..bbf1c574f393 100644 > >> --- a/drivers/gpu/drm/i915/i915_perf.c > >> +++ b/drivers/gpu/drm/i915/i915_perf.c > >> @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header; > >> > >> static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer); > >> > >> +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head) > > > > nit: no space between * and stream. > > > >> +{ > >> + u32 size = stream->oa_buffer.vma->size; > >> + > >> + return tail >= head ? tail - head : size - (head - tail); > >> +} > > > > If we are doing this we should probably eliminate references to OA_TAKEN > > which serves an identical purpose (I think there is one remaining > > reference) and also delete OA_TAKEN #define. > > > >> + > >> +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail, > >> + u32 rewind_delta) > >> +{ > >> + return rewind_delta > relative_hw_tail ? > >> + stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) : > >> + relative_hw_tail - rewind_delta; > >> +} > > > > Also are we really saying here that we are supporting non-power-of-2 OA > > buffer sizes? Because if we stayed with power-of-2 sizes the expression > > above are nice and elegant and actually closer to the previous code being > > changed in this patch. For example: > > > > #include <linux/circ_buf.h> > > > > static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head) > > { > > return CIRC_CNT(tail, head, stream->oa_buffer.vma->size); > > } > > > > static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail, > > u32 rewind_delta) > > { > > return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size); > > } > > > > Note that for power-of-2 sizes the two functions above are identical but we > > should keep them separate for clarity (as is done in the patch) since they > > are serving two different functions in the OA code. > > > > Also another assumption in the code seems to be: > > > > stream->oa_buffer.vma->size == OA_BUFFER_SIZE > > > > which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer > > sizes? So we might as well stick with power-of-2 sizes and change later in > > a separate patch only if needed? > > Most changes here are related to the OA buffer size issue and that is > specific to xehpsdv where the size is not a power of 2. I am thinking of > dropping these changes in the next revision since DG2 is fixed and OA > buffer sizes are power of 2. In the code stream->oa_buffer.vma->size and OA_BUFFER_SIZE are both used, if we want to clean that up and only use stream->oa_buffer.vma->size, we could still do soemthing like I suggested with just power-of-2 sizes and keep this patch. If we ever have to support non-power-of-2 sizes in the future we'll just need to change _oa_taken and _rewind_tail functions. Anyway your call. Thanks. -- Ashutosh
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 6fc4f0d8fc5a..bbf1c574f393 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header; static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer); +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head) +{ + u32 size = stream->oa_buffer.vma->size; + + return tail >= head ? tail - head : size - (head - tail); +} + +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail, + u32 rewind_delta) +{ + return rewind_delta > relative_hw_tail ? + stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) : + relative_hw_tail - rewind_delta; +} + void i915_oa_config_release(struct kref *ref) { struct i915_oa_config *oa_config = @@ -487,12 +502,14 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) * sizes need not be integral multiples or 64 or powers of 2. * Compute potentially partially landed report in the OA buffer */ - partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); + partial_report_size = + _oa_taken(stream, hw_tail, stream->oa_buffer.tail); partial_report_size %= report_size; /* Subtract partial amount off the tail */ - hw_tail = gtt_offset + ((hw_tail - partial_report_size) & - (stream->oa_buffer.vma->size - 1)); + hw_tail = gtt_offset + _rewind_tail(stream, + hw_tail - gtt_offset, + partial_report_size); now = ktime_get_mono_fast_ns(); @@ -527,16 +544,16 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) * memory in the order they were written to. * If not : (╯°□°)╯︵ ┻━┻ */ - while (OA_TAKEN(tail, aged_tail) >= report_size) { + while (_oa_taken(stream, tail, aged_tail) >= report_size) { u32 *report32 = (void *)(stream->oa_buffer.vaddr + tail); if (report32[0] != 0 || report32[1] != 0) break; - tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); + tail = _rewind_tail(stream, tail, report_size); } - if (OA_TAKEN(hw_tail, tail) > report_size && + if (_oa_taken(stream, hw_tail, tail) > report_size && __ratelimit(&stream->perf->tail_pointer_race)) DRM_NOTE("unlanded report(s) head=0x%x " "tail=0x%x hw_tail=0x%x\n", @@ -547,8 +564,9 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) stream->oa_buffer.aging_timestamp = now; } - pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset, - stream->oa_buffer.head - gtt_offset) >= report_size; + pollin = _oa_taken(stream, + stream->oa_buffer.tail, + stream->oa_buffer.head) >= report_size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -679,11 +697,9 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, int report_size = stream->oa_buffer.format_size; u8 *oa_buf_base = stream->oa_buffer.vaddr; u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); - u32 mask = (OA_BUFFER_SIZE - 1); size_t start_offset = *offset; unsigned long flags; - u32 head, tail; - u32 taken; + u32 head, tail, size; int ret = 0; if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled)) @@ -693,6 +709,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, head = stream->oa_buffer.head; tail = stream->oa_buffer.tail; + size = stream->oa_buffer.vma->size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -711,16 +728,15 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * all a power of two). */ if (drm_WARN_ONCE(&uncore->i915->drm, - head > stream->oa_buffer.vma->size || - tail > stream->oa_buffer.vma->size, + head > size || tail > size, "Inconsistent OA buffer pointers: head = %u, tail = %u\n", head, tail)) return -EIO; for (/* none */; - (taken = OA_TAKEN(tail, head)); - head = (head + report_size) & mask) { + _oa_taken(stream, tail, head); + head = (head + report_size) % size) { u8 *report = oa_buf_base + head; u32 *report32 = (void *)report; u32 ctx_id;
DG2 has a new feature to supports OA buffer sizes up to 128Mb by toggling a bit in OA_DEBUG. This would eventually be a user configurable parameter. Use OA buffer vma size in all calculations with some helpers. v2: Let compiler decide inline (Jani) Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> --- drivers/gpu/drm/i915/i915_perf.c | 46 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-)