Message ID | 20190913230620.15906-3-umesh.nerlige.ramappa@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915/perf: Enable non-power-of-2 OA report sizes | expand |
On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: > OA perf unit supports non-power of 2 report sizes. Enable support for > these sizes in the driver. > > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > --- > drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- > 1 file changed, 21 insertions(+), 38 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > index 50b6d154fd46..482fca3da7de 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); > int report_size = stream->oa_buffer.format_size; > unsigned long flags; > - u32 hw_tail; > + u32 hw_tail, aging_tail; > u64 now; > > /* We have to consider the (unlikely) possibility that read() errors > @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > */ > spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); > > - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); > + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; > + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; > > /* The tail pointer increases in 64 byte increments, > * not in report_size steps... > */ > - hw_tail &= ~(report_size - 1); > + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); I'm struggling to parse this line above and I'm not 100% sure it's correct. Could add a comment to explain what is going on? Thanks, -Lionel > > now = ktime_get_mono_fast_ns(); > > - if (hw_tail == stream->oa_buffer.aging_tail) { > + if (hw_tail == aging_tail) { > /* If the HW tail hasn't move since the last check and the HW > * tail has been aging for long enough, declare it the new > * tail. > @@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > * a read() in progress. > */ > head = stream->oa_buffer.head - gtt_offset; > - > - hw_tail -= gtt_offset; > tail = hw_tail; > > /* Walk the stream backward until we find at least 2 reports > @@ -613,7 +612,18 @@ static int append_oa_sample(struct i915_perf_stream *stream, > buf += sizeof(header); > > if (sample_flags & SAMPLE_OA_REPORT) { > - if (copy_to_user(buf, report, report_size)) > + u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; > + int report_size_partial = oa_buf_end - report; > + > + if (report_size_partial < report_size) { > + if (copy_to_user(buf, report, report_size_partial)) > + return -EFAULT; > + buf += report_size_partial; > + > + if (copy_to_user(buf, stream->oa_buffer.vaddr, > + report_size - report_size_partial)) > + return -EFAULT; > + } else if (copy_to_user(buf, report, report_size)) > return -EFAULT; > } > > @@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, > * only be incremented by multiples of the report size (notably also > * all a power of two). > */ > - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || > - tail > OA_BUFFER_SIZE || tail % report_size, > + if (WARN_ONCE(head > OA_BUFFER_SIZE || > + tail > OA_BUFFER_SIZE, > "Inconsistent OA buffer pointers: head = %u, tail = %u\n", > head, tail)) > return -EIO; > @@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, > u32 ctx_id; > u32 reason; > > - /* > - * All the report sizes factor neatly into the buffer > - * size so we never expect to see a report split > - * between the beginning and end of the buffer. > - * > - * Given the initial alignment check a misalignment > - * here would imply a driver bug that would result > - * in an overrun. > - */ > - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { > - DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); > - break; > - } > - > /* > * The reason field includes flags identifying what > * triggered this specific report (mostly timer > @@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, > * only be incremented by multiples of the report size (notably also > * all a power of two). > */ > - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || > - tail > OA_BUFFER_SIZE || tail % report_size, > + if (WARN_ONCE(head > OA_BUFFER_SIZE || > + tail > OA_BUFFER_SIZE, > "Inconsistent OA buffer pointers: head = %u, tail = %u\n", > head, tail)) > return -EIO; > @@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, > u8 *report = oa_buf_base + head; > u32 *report32 = (void *)report; > > - /* All the report sizes factor neatly into the buffer > - * size so we never expect to see a report split > - * between the beginning and end of the buffer. > - * > - * Given the initial alignment check a misalignment > - * here would imply a driver bug that would result > - * in an overrun. > - */ > - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { > - DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); > - break; > - } > - > /* The report-ID field for periodic samples includes > * some undocumented flags related to what triggered > * the report and is never expected to be zero so we
On Sun, 15 Sep 2019 04:24:41 -0700, Lionel Landwerlin wrote: > > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: > > OA perf unit supports non-power of 2 report sizes. Enable support for > > these sizes in the driver. > > > > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > > --- > > drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- > > 1 file changed, 21 insertions(+), 38 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > > index 50b6d154fd46..482fca3da7de 100644 > > --- a/drivers/gpu/drm/i915/i915_perf.c > > +++ b/drivers/gpu/drm/i915/i915_perf.c > > @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > > u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); > > int report_size = stream->oa_buffer.format_size; > > unsigned long flags; > > - u32 hw_tail; > > + u32 hw_tail, aging_tail; > > u64 now; > > /* We have to consider the (unlikely) possibility that read() > > errors > > @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > > */ > > spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); > > - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); > > + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; > > + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; > > /* The tail pointer increases in 64 byte increments, > > * not in report_size steps... > > */ > > - hw_tail &= ~(report_size - 1); > > + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); > > > I'm struggling to parse this line above and I'm not 100% sure it's correct. > > Could add a comment to explain what is going on? Also for efficiency perhaps the modulo (%) should be replaced by a increment, compare and wraparound? Thanks! -- Ashutosh
On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: >On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: >>OA perf unit supports non-power of 2 report sizes. Enable support for >>these sizes in the driver. >> >>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >>--- >> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- >> 1 file changed, 21 insertions(+), 38 deletions(-) >> >>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c >>index 50b6d154fd46..482fca3da7de 100644 >>--- a/drivers/gpu/drm/i915/i915_perf.c >>+++ b/drivers/gpu/drm/i915/i915_perf.c >>@@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) >> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); >> int report_size = stream->oa_buffer.format_size; >> unsigned long flags; >>- u32 hw_tail; >>+ u32 hw_tail, aging_tail; >> u64 now; >> /* We have to consider the (unlikely) possibility that read() errors >>@@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) >> */ >> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); >>- hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); >>+ hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; >>+ aging_tail = stream->oa_buffer.aging_tail - gtt_offset; >> /* The tail pointer increases in 64 byte increments, >> * not in report_size steps... >> */ >>- hw_tail &= ~(report_size - 1); >>+ hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); > > >I'm struggling to parse this line above and I'm not 100% sure it's correct. > >Could add a comment to explain what is going on? The aging tail is always pointing to the boundary of a report whereas the hw_tail is advancing in 64 byte increments. The innermost OA_TAKEN is returning the number of bytes between the hw_tail and the aging_tail. The modulo is getting the size of the partial report (if any). The outermost OA_TAKEN is subtracting the size of partial report from the hw_tail to get a hw_tail that points to the boundary of the last full report. The value of hw_tail would be the same as from the deleted line of code above this line. Thanks, Umesh > > >Thanks, > > >-Lionel > > >> now = ktime_get_mono_fast_ns(); >>- if (hw_tail == stream->oa_buffer.aging_tail) { >>+ if (hw_tail == aging_tail) { >> /* If the HW tail hasn't move since the last check and the HW >> * tail has been aging for long enough, declare it the new >> * tail. >>@@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) >> * a read() in progress. >> */ >> head = stream->oa_buffer.head - gtt_offset; >>- >>- hw_tail -= gtt_offset; >> tail = hw_tail; >> /* Walk the stream backward until we find at least 2 reports >>@@ -613,7 +612,18 @@ static int append_oa_sample(struct i915_perf_stream *stream, >> buf += sizeof(header); >> if (sample_flags & SAMPLE_OA_REPORT) { >>- if (copy_to_user(buf, report, report_size)) >>+ u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; >>+ int report_size_partial = oa_buf_end - report; >>+ >>+ if (report_size_partial < report_size) { >>+ if (copy_to_user(buf, report, report_size_partial)) >>+ return -EFAULT; >>+ buf += report_size_partial; >>+ >>+ if (copy_to_user(buf, stream->oa_buffer.vaddr, >>+ report_size - report_size_partial)) >>+ return -EFAULT; >>+ } else if (copy_to_user(buf, report, report_size)) >> return -EFAULT; >> } >>@@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, >> * only be incremented by multiples of the report size (notably also >> * all a power of two). >> */ >>- if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>- tail > OA_BUFFER_SIZE || tail % report_size, >>+ if (WARN_ONCE(head > OA_BUFFER_SIZE || >>+ tail > OA_BUFFER_SIZE, >> "Inconsistent OA buffer pointers: head = %u, tail = %u\n", >> head, tail)) >> return -EIO; >>@@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, >> u32 ctx_id; >> u32 reason; >>- /* >>- * All the report sizes factor neatly into the buffer >>- * size so we never expect to see a report split >>- * between the beginning and end of the buffer. >>- * >>- * Given the initial alignment check a misalignment >>- * here would imply a driver bug that would result >>- * in an overrun. >>- */ >>- if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>- DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); >>- break; >>- } >>- >> /* >> * The reason field includes flags identifying what >> * triggered this specific report (mostly timer >>@@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, >> * only be incremented by multiples of the report size (notably also >> * all a power of two). >> */ >>- if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>- tail > OA_BUFFER_SIZE || tail % report_size, >>+ if (WARN_ONCE(head > OA_BUFFER_SIZE || >>+ tail > OA_BUFFER_SIZE, >> "Inconsistent OA buffer pointers: head = %u, tail = %u\n", >> head, tail)) >> return -EIO; >>@@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, >> u8 *report = oa_buf_base + head; >> u32 *report32 = (void *)report; >>- /* All the report sizes factor neatly into the buffer >>- * size so we never expect to see a report split >>- * between the beginning and end of the buffer. >>- * >>- * Given the initial alignment check a misalignment >>- * here would imply a driver bug that would result >>- * in an overrun. >>- */ >>- if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>- DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); >>- break; >>- } >>- >> /* The report-ID field for periodic samples includes >> * some undocumented flags related to what triggered >> * the report and is never expected to be zero so we > >
On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote: > > On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: > > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: > >> OA perf unit supports non-power of 2 report sizes. Enable support for > >> these sizes in the driver. > >> > >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > >> --- > >> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- > >> 1 file changed, 21 insertions(+), 38 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > >> index 50b6d154fd46..482fca3da7de 100644 > >> --- a/drivers/gpu/drm/i915/i915_perf.c > >> +++ b/drivers/gpu/drm/i915/i915_perf.c > >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > >> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); > >> int report_size = stream->oa_buffer.format_size; > >> unsigned long flags; > >> - u32 hw_tail; > >> + u32 hw_tail, aging_tail; > >> u64 now; > >> /* We have to consider the (unlikely) possibility that read() errors > >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > >> */ > >> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); > >> - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); > >> + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; > >> + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; > >> /* The tail pointer increases in 64 byte increments, > >> * not in report_size steps... > >> */ > >> - hw_tail &= ~(report_size - 1); > >> + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); > > > > > > I'm struggling to parse this line above and I'm not 100% sure it's correct. > > > > Could add a comment to explain what is going on? > > The aging tail is always pointing to the boundary of a report whereas > the hw_tail is advancing in 64 byte increments. > > The innermost OA_TAKEN is returning the number of bytes between the > hw_tail and the aging_tail. The modulo is getting the size of the > partial report (if any). > > The outermost OA_TAKEN is subtracting the size of partial report from > the hw_tail to get a hw_tail that points to the boundary of the last > full report. > > The value of hw_tail would be the same as from the deleted line of code > above this line. Looks like aging_tail should not be needed (it is complicating the expression and is not there in the original expression). All we need is a "circular modulus". For example would the following work? if (hw_tail < report_size) hw_tail += OA_BUFFER_SIZE; hw_tail = rounddown(hw_tail, report_size); Another way (if we want to avoid the division) would be to maintain a cached copy of hw_tail in SW which is successively (and circularly) incremented by report_size till it catches up with hw_tail read from HW. But probably the first method above is simpler.
On Mon, Sep 16, 2019 at 09:11:58PM -0700, Ashutosh Dixit wrote: >On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote: >> >> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: >> > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: >> >> OA perf unit supports non-power of 2 report sizes. Enable support for >> >> these sizes in the driver. >> >> >> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >> >> --- >> >> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- >> >> 1 file changed, 21 insertions(+), 38 deletions(-) >> >> >> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c >> >> index 50b6d154fd46..482fca3da7de 100644 >> >> --- a/drivers/gpu/drm/i915/i915_perf.c >> >> +++ b/drivers/gpu/drm/i915/i915_perf.c >> >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) >> >> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); >> >> int report_size = stream->oa_buffer.format_size; >> >> unsigned long flags; >> >> - u32 hw_tail; >> >> + u32 hw_tail, aging_tail; >> >> u64 now; >> >> /* We have to consider the (unlikely) possibility that read() errors >> >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) >> >> */ >> >> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); >> >> - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); >> >> + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; >> >> + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; >> >> /* The tail pointer increases in 64 byte increments, >> >> * not in report_size steps... >> >> */ >> >> - hw_tail &= ~(report_size - 1); >> >> + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); >> > >> > >> > I'm struggling to parse this line above and I'm not 100% sure it's correct. >> > >> > Could add a comment to explain what is going on? >> >> The aging tail is always pointing to the boundary of a report whereas >> the hw_tail is advancing in 64 byte increments. >> >> The innermost OA_TAKEN is returning the number of bytes between the >> hw_tail and the aging_tail. The modulo is getting the size of the >> partial report (if any). >> >> The outermost OA_TAKEN is subtracting the size of partial report from >> the hw_tail to get a hw_tail that points to the boundary of the last >> full report. >> >> The value of hw_tail would be the same as from the deleted line of code >> above this line. > >Looks like aging_tail should not be needed (it is complicating the >expression and is not there in the original expression). All we need is a >"circular modulus". For example would the following work? original expression assumes all report sizes are powers of 2 and hence does not need a reference (like aging_tail) to rounddown the hw_tail. > > if (hw_tail < report_size) > hw_tail += OA_BUFFER_SIZE; Assuming that this condition is detecting a report split across the OA buffer boundary, the above check will not catch the split in cases where there are multiple reports generated before we read the hw_tail. > hw_tail = rounddown(hw_tail, report_size); > >Another way (if we want to avoid the division) would be to maintain a >cached copy of hw_tail in SW which is successively (and circularly) >incremented by report_size till it catches up with hw_tail read from >HW. But probably the first method above is simpler. aging_tail is a cached copy of the hw_tail that advances only in report size increments. Thanks, Umesh
On Tue, 17 Sep 2019 10:33:51 -0700, Umesh Nerlige Ramappa wrote: > > On Mon, Sep 16, 2019 at 09:11:58PM -0700, Ashutosh Dixit wrote: > > On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote: > >> > >> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: > >> > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: > >> >> OA perf unit supports non-power of 2 report sizes. Enable support for > >> >> these sizes in the driver. > >> >> > >> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> > >> >> --- > >> >> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- > >> >> 1 file changed, 21 insertions(+), 38 deletions(-) > >> >> > >> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > >> >> index 50b6d154fd46..482fca3da7de 100644 > >> >> --- a/drivers/gpu/drm/i915/i915_perf.c > >> >> +++ b/drivers/gpu/drm/i915/i915_perf.c > >> >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > >> >> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); > >> >> int report_size = stream->oa_buffer.format_size; > >> >> unsigned long flags; > >> >> - u32 hw_tail; > >> >> + u32 hw_tail, aging_tail; > >> >> u64 now; > >> >> /* We have to consider the (unlikely) possibility that read() errors > >> >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) > >> >> */ > >> >> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); > >> >> - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); > >> >> + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; > >> >> + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; > >> >> /* The tail pointer increases in 64 byte increments, > >> >> * not in report_size steps... > >> >> */ > >> >> - hw_tail &= ~(report_size - 1); > >> >> + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); > >> > > >> > > >> > I'm struggling to parse this line above and I'm not 100% sure it's correct. > >> > > >> > Could add a comment to explain what is going on? > >> > >> The aging tail is always pointing to the boundary of a report whereas > >> the hw_tail is advancing in 64 byte increments. > >> > >> The innermost OA_TAKEN is returning the number of bytes between the > >> hw_tail and the aging_tail. The modulo is getting the size of the > >> partial report (if any). > >> > >> The outermost OA_TAKEN is subtracting the size of partial report from > >> the hw_tail to get a hw_tail that points to the boundary of the last > >> full report. > >> > >> The value of hw_tail would be the same as from the deleted line of code > >> above this line. > > > > Looks like aging_tail should not be needed (it is complicating the > > expression and is not there in the original expression). All we need is a > > "circular modulus". For example would the following work? > > original expression assumes all report sizes are powers of 2 and hence does > not need a reference (like aging_tail) to rounddown the hw_tail. > > > > > if (hw_tail < report_size) > > hw_tail += OA_BUFFER_SIZE; > > Assuming that this condition is detecting a report split across the OA > buffer boundary, the above check will not catch the split in cases where > there are multiple reports generated before we read the hw_tail. > > > hw_tail = rounddown(hw_tail, report_size); > > > > Another way (if we want to avoid the division) would be to maintain a > > cached copy of hw_tail in SW which is successively (and circularly) > > incremented by report_size till it catches up with hw_tail read from > > HW. But probably the first method above is simpler. > > aging_tail is a cached copy of the hw_tail that advances only in report > size increments. Umesh is right, the previous aging_tail needs to be taken into account. Basically we need to start from the previous aging_tail and continue incrementing by report_size till we catch up with the new hw_tail (at the previous report_size boundary, which gives the value of the new aging_tail).
On 16/09/2019 22:17, Umesh Nerlige Ramappa wrote: > On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: >> On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: >>> OA perf unit supports non-power of 2 report sizes. Enable support for >>> these sizes in the driver. >>> >>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- >>> 1 file changed, 21 insertions(+), 38 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_perf.c >>> b/drivers/gpu/drm/i915/i915_perf.c >>> index 50b6d154fd46..482fca3da7de 100644 >>> --- a/drivers/gpu/drm/i915/i915_perf.c >>> +++ b/drivers/gpu/drm/i915/i915_perf.c >>> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct >>> i915_perf_stream *stream) >>> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); >>> int report_size = stream->oa_buffer.format_size; >>> unsigned long flags; >>> - u32 hw_tail; >>> + u32 hw_tail, aging_tail; >>> u64 now; >>> /* We have to consider the (unlikely) possibility that read() >>> errors >>> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct >>> i915_perf_stream *stream) >>> */ >>> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); >>> - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); >>> + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; >>> + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; >>> /* The tail pointer increases in 64 byte increments, >>> * not in report_size steps... >>> */ >>> - hw_tail &= ~(report_size - 1); >>> + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % >>> report_size)); >> >> >> I'm struggling to parse this line above and I'm not 100% sure it's >> correct. >> >> Could add a comment to explain what is going on? > > The aging tail is always pointing to the boundary of a report whereas > the hw_tail is advancing in 64 byte increments. > > The innermost OA_TAKEN is returning the number of bytes between the > hw_tail and the aging_tail. The modulo is getting the size of the > partial report (if any). > > The outermost OA_TAKEN is subtracting the size of partial report from > the hw_tail to get a hw_tail that points to the boundary of the last > full report. > > The value of hw_tail would be the same as from the deleted line of code > above this line. > > Thanks, > Umesh Thanks, I ran a few tests locally to convince myself it's correct :) It's still a bit difficult to parse, probably because OA_TAKEN() wasn't meant for this. Could create a helper function that does this computation, something like this : static inline u32 align_hw_tail_to_report_boundary(u32 hw_tail, u32 last_aligned_tail) { /* Compute potentially partially landed report in the OA buffer */ u32 partial_report_size = OA_TAKEN(hw_tail, last_aligned_tail) % report_size; /* Substract that partial amount off the tail. */ return (hw_tail - partial_report_size) % OA_BUFFER_SIZE; } Cheers, -Lionel > >> >> >> Thanks, >> >> >> -Lionel >> >> >>> now = ktime_get_mono_fast_ns(); >>> - if (hw_tail == stream->oa_buffer.aging_tail) { >>> + if (hw_tail == aging_tail) { >>> /* If the HW tail hasn't move since the last check and the HW >>> * tail has been aging for long enough, declare it the new >>> * tail. >>> @@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct >>> i915_perf_stream *stream) >>> * a read() in progress. >>> */ >>> head = stream->oa_buffer.head - gtt_offset; >>> - >>> - hw_tail -= gtt_offset; >>> tail = hw_tail; >>> /* Walk the stream backward until we find at least 2 reports >>> @@ -613,7 +612,18 @@ static int append_oa_sample(struct >>> i915_perf_stream *stream, >>> buf += sizeof(header); >>> if (sample_flags & SAMPLE_OA_REPORT) { >>> - if (copy_to_user(buf, report, report_size)) >>> + u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; >>> + int report_size_partial = oa_buf_end - report; >>> + >>> + if (report_size_partial < report_size) { >>> + if (copy_to_user(buf, report, report_size_partial)) >>> + return -EFAULT; >>> + buf += report_size_partial; >>> + >>> + if (copy_to_user(buf, stream->oa_buffer.vaddr, >>> + report_size - report_size_partial)) >>> + return -EFAULT; >>> + } else if (copy_to_user(buf, report, report_size)) >>> return -EFAULT; >>> } >>> @@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct >>> i915_perf_stream *stream, >>> * only be incremented by multiples of the report size (notably >>> also >>> * all a power of two). >>> */ >>> - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>> - tail > OA_BUFFER_SIZE || tail % report_size, >>> + if (WARN_ONCE(head > OA_BUFFER_SIZE || >>> + tail > OA_BUFFER_SIZE, >>> "Inconsistent OA buffer pointers: head = %u, tail = >>> %u\n", >>> head, tail)) >>> return -EIO; >>> @@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct >>> i915_perf_stream *stream, >>> u32 ctx_id; >>> u32 reason; >>> - /* >>> - * All the report sizes factor neatly into the buffer >>> - * size so we never expect to see a report split >>> - * between the beginning and end of the buffer. >>> - * >>> - * Given the initial alignment check a misalignment >>> - * here would imply a driver bug that would result >>> - * in an overrun. >>> - */ >>> - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>> - DRM_ERROR("Spurious OA head ptr: non-integral report >>> offset\n"); >>> - break; >>> - } >>> - >>> /* >>> * The reason field includes flags identifying what >>> * triggered this specific report (mostly timer >>> @@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct >>> i915_perf_stream *stream, >>> * only be incremented by multiples of the report size (notably >>> also >>> * all a power of two). >>> */ >>> - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>> - tail > OA_BUFFER_SIZE || tail % report_size, >>> + if (WARN_ONCE(head > OA_BUFFER_SIZE || >>> + tail > OA_BUFFER_SIZE, >>> "Inconsistent OA buffer pointers: head = %u, tail = >>> %u\n", >>> head, tail)) >>> return -EIO; >>> @@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct >>> i915_perf_stream *stream, >>> u8 *report = oa_buf_base + head; >>> u32 *report32 = (void *)report; >>> - /* All the report sizes factor neatly into the buffer >>> - * size so we never expect to see a report split >>> - * between the beginning and end of the buffer. >>> - * >>> - * Given the initial alignment check a misalignment >>> - * here would imply a driver bug that would result >>> - * in an overrun. >>> - */ >>> - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>> - DRM_ERROR("Spurious OA head ptr: non-integral report >>> offset\n"); >>> - break; >>> - } >>> - >>> /* The report-ID field for periodic samples includes >>> * some undocumented flags related to what triggered >>> * the report and is never expected to be zero so we >> >> >
On Wed, Sep 18, 2019 at 11:21:01AM +0300, Lionel Landwerlin wrote: >On 16/09/2019 22:17, Umesh Nerlige Ramappa wrote: >>On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote: >>>On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote: >>>>OA perf unit supports non-power of 2 report sizes. Enable support for >>>>these sizes in the driver. >>>> >>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> >>>>--- >>>> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- >>>> 1 file changed, 21 insertions(+), 38 deletions(-) >>>> >>>>diff --git a/drivers/gpu/drm/i915/i915_perf.c >>>>b/drivers/gpu/drm/i915/i915_perf.c >>>>index 50b6d154fd46..482fca3da7de 100644 >>>>--- a/drivers/gpu/drm/i915/i915_perf.c >>>>+++ b/drivers/gpu/drm/i915/i915_perf.c >>>>@@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct >>>>i915_perf_stream *stream) >>>> u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); >>>> int report_size = stream->oa_buffer.format_size; >>>> unsigned long flags; >>>>- u32 hw_tail; >>>>+ u32 hw_tail, aging_tail; >>>> u64 now; >>>> /* We have to consider the (unlikely) possibility that >>>>read() errors >>>>@@ -459,16 +459,17 @@ static bool >>>>oa_buffer_check_unlocked(struct i915_perf_stream *stream) >>>> */ >>>> spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); >>>>- hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); >>>>+ hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; >>>>+ aging_tail = stream->oa_buffer.aging_tail - gtt_offset; >>>> /* The tail pointer increases in 64 byte increments, >>>> * not in report_size steps... >>>> */ >>>>- hw_tail &= ~(report_size - 1); >>>>+ hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) >>>>% report_size)); >>> >>> >>>I'm struggling to parse this line above and I'm not 100% sure it's >>>correct. >>> >>>Could add a comment to explain what is going on? >> >>The aging tail is always pointing to the boundary of a report whereas >>the hw_tail is advancing in 64 byte increments. >> >>The innermost OA_TAKEN is returning the number of bytes between the >>hw_tail and the aging_tail. The modulo is getting the size of the >>partial report (if any). >> >>The outermost OA_TAKEN is subtracting the size of partial report from >>the hw_tail to get a hw_tail that points to the boundary of the last >>full report. >> >>The value of hw_tail would be the same as from the deleted line of code >>above this line. >> >>Thanks, >>Umesh > > >Thanks, I ran a few tests locally to convince myself it's correct :) > > >It's still a bit difficult to parse, probably because OA_TAKEN() >wasn't meant for this. > >Could create a helper function that does this computation, something >like this : > > >static inline u32 align_hw_tail_to_report_boundary(u32 hw_tail, u32 >last_aligned_tail) > >{ > > /* Compute potentially partially landed report in the OA buffer */ > > u32 partial_report_size = OA_TAKEN(hw_tail, last_aligned_tail) % >report_size; > > /* Substract that partial amount off the tail. */ > > return (hw_tail - partial_report_size) % OA_BUFFER_SIZE; > >} Sure, I can add a helper function to make this more readable. Thanks, Umesh > > >Cheers, > > >-Lionel > > >> >>> >>> >>>Thanks, >>> >>> >>>-Lionel >>> >>> >>>> now = ktime_get_mono_fast_ns(); >>>>- if (hw_tail == stream->oa_buffer.aging_tail) { >>>>+ if (hw_tail == aging_tail) { >>>> /* If the HW tail hasn't move since the last check and the HW >>>> * tail has been aging for long enough, declare it the new >>>> * tail. >>>>@@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct >>>>i915_perf_stream *stream) >>>> * a read() in progress. >>>> */ >>>> head = stream->oa_buffer.head - gtt_offset; >>>>- >>>>- hw_tail -= gtt_offset; >>>> tail = hw_tail; >>>> /* Walk the stream backward until we find at least 2 reports >>>>@@ -613,7 +612,18 @@ static int append_oa_sample(struct >>>>i915_perf_stream *stream, >>>> buf += sizeof(header); >>>> if (sample_flags & SAMPLE_OA_REPORT) { >>>>- if (copy_to_user(buf, report, report_size)) >>>>+ u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; >>>>+ int report_size_partial = oa_buf_end - report; >>>>+ >>>>+ if (report_size_partial < report_size) { >>>>+ if (copy_to_user(buf, report, report_size_partial)) >>>>+ return -EFAULT; >>>>+ buf += report_size_partial; >>>>+ >>>>+ if (copy_to_user(buf, stream->oa_buffer.vaddr, >>>>+ report_size - report_size_partial)) >>>>+ return -EFAULT; >>>>+ } else if (copy_to_user(buf, report, report_size)) >>>> return -EFAULT; >>>> } >>>>@@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct >>>>i915_perf_stream *stream, >>>> * only be incremented by multiples of the report size >>>>(notably also >>>> * all a power of two). >>>> */ >>>>- if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>>>- tail > OA_BUFFER_SIZE || tail % report_size, >>>>+ if (WARN_ONCE(head > OA_BUFFER_SIZE || >>>>+ tail > OA_BUFFER_SIZE, >>>> "Inconsistent OA buffer pointers: head = %u, tail >>>>= %u\n", >>>> head, tail)) >>>> return -EIO; >>>>@@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct >>>>i915_perf_stream *stream, >>>> u32 ctx_id; >>>> u32 reason; >>>>- /* >>>>- * All the report sizes factor neatly into the buffer >>>>- * size so we never expect to see a report split >>>>- * between the beginning and end of the buffer. >>>>- * >>>>- * Given the initial alignment check a misalignment >>>>- * here would imply a driver bug that would result >>>>- * in an overrun. >>>>- */ >>>>- if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>>>- DRM_ERROR("Spurious OA head ptr: non-integral >>>>report offset\n"); >>>>- break; >>>>- } >>>>- >>>> /* >>>> * The reason field includes flags identifying what >>>> * triggered this specific report (mostly timer >>>>@@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct >>>>i915_perf_stream *stream, >>>> * only be incremented by multiples of the report size >>>>(notably also >>>> * all a power of two). >>>> */ >>>>- if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || >>>>- tail > OA_BUFFER_SIZE || tail % report_size, >>>>+ if (WARN_ONCE(head > OA_BUFFER_SIZE || >>>>+ tail > OA_BUFFER_SIZE, >>>> "Inconsistent OA buffer pointers: head = %u, tail >>>>= %u\n", >>>> head, tail)) >>>> return -EIO; >>>>@@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct >>>>i915_perf_stream *stream, >>>> u8 *report = oa_buf_base + head; >>>> u32 *report32 = (void *)report; >>>>- /* All the report sizes factor neatly into the buffer >>>>- * size so we never expect to see a report split >>>>- * between the beginning and end of the buffer. >>>>- * >>>>- * Given the initial alignment check a misalignment >>>>- * here would imply a driver bug that would result >>>>- * in an overrun. >>>>- */ >>>>- if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { >>>>- DRM_ERROR("Spurious OA head ptr: non-integral >>>>report offset\n"); >>>>- break; >>>>- } >>>>- >>>> /* The report-ID field for periodic samples includes >>>> * some undocumented flags related to what triggered >>>> * the report and is never expected to be zero so we >>> >>> >> >
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 50b6d154fd46..482fca3da7de 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); int report_size = stream->oa_buffer.format_size; unsigned long flags; - u32 hw_tail; + u32 hw_tail, aging_tail; u64 now; /* We have to consider the (unlikely) possibility that read() errors @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) */ spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); - hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream); + hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset; + aging_tail = stream->oa_buffer.aging_tail - gtt_offset; /* The tail pointer increases in 64 byte increments, * not in report_size steps... */ - hw_tail &= ~(report_size - 1); + hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size)); now = ktime_get_mono_fast_ns(); - if (hw_tail == stream->oa_buffer.aging_tail) { + if (hw_tail == aging_tail) { /* If the HW tail hasn't move since the last check and the HW * tail has been aging for long enough, declare it the new * tail. @@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) * a read() in progress. */ head = stream->oa_buffer.head - gtt_offset; - - hw_tail -= gtt_offset; tail = hw_tail; /* Walk the stream backward until we find at least 2 reports @@ -613,7 +612,18 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += sizeof(header); if (sample_flags & SAMPLE_OA_REPORT) { - if (copy_to_user(buf, report, report_size)) + u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; + int report_size_partial = oa_buf_end - report; + + if (report_size_partial < report_size) { + if (copy_to_user(buf, report, report_size_partial)) + return -EFAULT; + buf += report_size_partial; + + if (copy_to_user(buf, stream->oa_buffer.vaddr, + report_size - report_size_partial)) + return -EFAULT; + } else if (copy_to_user(buf, report, report_size)) return -EFAULT; } @@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * only be incremented by multiples of the report size (notably also * all a power of two). */ - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || - tail > OA_BUFFER_SIZE || tail % report_size, + if (WARN_ONCE(head > OA_BUFFER_SIZE || + tail > OA_BUFFER_SIZE, "Inconsistent OA buffer pointers: head = %u, tail = %u\n", head, tail)) return -EIO; @@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, u32 ctx_id; u32 reason; - /* - * All the report sizes factor neatly into the buffer - * size so we never expect to see a report split - * between the beginning and end of the buffer. - * - * Given the initial alignment check a misalignment - * here would imply a driver bug that would result - * in an overrun. - */ - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { - DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); - break; - } - /* * The reason field includes flags identifying what * triggered this specific report (mostly timer @@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, * only be incremented by multiples of the report size (notably also * all a power of two). */ - if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size || - tail > OA_BUFFER_SIZE || tail % report_size, + if (WARN_ONCE(head > OA_BUFFER_SIZE || + tail > OA_BUFFER_SIZE, "Inconsistent OA buffer pointers: head = %u, tail = %u\n", head, tail)) return -EIO; @@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, u8 *report = oa_buf_base + head; u32 *report32 = (void *)report; - /* All the report sizes factor neatly into the buffer - * size so we never expect to see a report split - * between the beginning and end of the buffer. - * - * Given the initial alignment check a misalignment - * here would imply a driver bug that would result - * in an overrun. - */ - if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) { - DRM_ERROR("Spurious OA head ptr: non-integral report offset\n"); - break; - } - /* The report-ID field for periodic samples includes * some undocumented flags related to what triggered * the report and is never expected to be zero so we
OA perf unit supports non-power of 2 report sizes. Enable support for these sizes in the driver. Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> --- drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++-------------------- 1 file changed, 21 insertions(+), 38 deletions(-)