diff mbox series

[2/3] drm/i915/perf: Add support for report sizes that are not power of 2

Message ID 20190913230620.15906-3-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/perf: Enable non-power-of-2 OA report sizes | expand

Commit Message

Umesh Nerlige Ramappa Sept. 13, 2019, 11:06 p.m. UTC
OA perf unit supports non-power of 2 report sizes. Enable support for
these sizes in the driver.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
 1 file changed, 21 insertions(+), 38 deletions(-)

Comments

Lionel Landwerlin Sept. 15, 2019, 11:24 a.m. UTC | #1
On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
> OA perf unit supports non-power of 2 report sizes. Enable support for
> these sizes in the driver.
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
>   1 file changed, 21 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 50b6d154fd46..482fca3da7de 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>   	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>   	int report_size = stream->oa_buffer.format_size;
>   	unsigned long flags;
> -	u32 hw_tail;
> +	u32 hw_tail, aging_tail;
>   	u64 now;
>   
>   	/* We have to consider the (unlikely) possibility that read() errors
> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>   	 */
>   	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>   
> -	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
> +	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
> +	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
>   
>   	/* The tail pointer increases in 64 byte increments,
>   	 * not in report_size steps...
>   	 */
> -	hw_tail &= ~(report_size - 1);
> +	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));


I'm struggling to parse this line above and I'm not 100% sure it's correct.

Could add a comment to explain what is going on?


Thanks,


-Lionel


>   
>   	now = ktime_get_mono_fast_ns();
>   
> -	if (hw_tail == stream->oa_buffer.aging_tail) {
> +	if (hw_tail == aging_tail) {
>   		/* If the HW tail hasn't move since the last check and the HW
>   		 * tail has been aging for long enough, declare it the new
>   		 * tail.
> @@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>   		 * a read() in progress.
>   		 */
>   		head = stream->oa_buffer.head - gtt_offset;
> -
> -		hw_tail -= gtt_offset;
>   		tail = hw_tail;
>   
>   		/* Walk the stream backward until we find at least 2 reports
> @@ -613,7 +612,18 @@ static int append_oa_sample(struct i915_perf_stream *stream,
>   	buf += sizeof(header);
>   
>   	if (sample_flags & SAMPLE_OA_REPORT) {
> -		if (copy_to_user(buf, report, report_size))
> +		u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
> +		int report_size_partial = oa_buf_end - report;
> +
> +		if (report_size_partial < report_size) {
> +			if (copy_to_user(buf, report, report_size_partial))
> +				return -EFAULT;
> +			buf += report_size_partial;
> +
> +			if (copy_to_user(buf, stream->oa_buffer.vaddr,
> +					report_size - report_size_partial))
> +				return -EFAULT;
> +		} else if (copy_to_user(buf, report, report_size))
>   			return -EFAULT;
>   	}
>   
> @@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>   	 * only be incremented by multiples of the report size (notably also
>   	 * all a power of two).
>   	 */
> -	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
> -		      tail > OA_BUFFER_SIZE || tail % report_size,
> +	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
> +		      tail > OA_BUFFER_SIZE,
>   		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>   		      head, tail))
>   		return -EIO;
> @@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>   		u32 ctx_id;
>   		u32 reason;
>   
> -		/*
> -		 * All the report sizes factor neatly into the buffer
> -		 * size so we never expect to see a report split
> -		 * between the beginning and end of the buffer.
> -		 *
> -		 * Given the initial alignment check a misalignment
> -		 * here would imply a driver bug that would result
> -		 * in an overrun.
> -		 */
> -		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
> -			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
> -			break;
> -		}
> -
>   		/*
>   		 * The reason field includes flags identifying what
>   		 * triggered this specific report (mostly timer
> @@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>   	 * only be incremented by multiples of the report size (notably also
>   	 * all a power of two).
>   	 */
> -	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
> -		      tail > OA_BUFFER_SIZE || tail % report_size,
> +	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
> +		      tail > OA_BUFFER_SIZE,
>   		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>   		      head, tail))
>   		return -EIO;
> @@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>   		u8 *report = oa_buf_base + head;
>   		u32 *report32 = (void *)report;
>   
> -		/* All the report sizes factor neatly into the buffer
> -		 * size so we never expect to see a report split
> -		 * between the beginning and end of the buffer.
> -		 *
> -		 * Given the initial alignment check a misalignment
> -		 * here would imply a driver bug that would result
> -		 * in an overrun.
> -		 */
> -		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
> -			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
> -			break;
> -		}
> -
>   		/* The report-ID field for periodic samples includes
>   		 * some undocumented flags related to what triggered
>   		 * the report and is never expected to be zero so we
Dixit, Ashutosh Sept. 16, 2019, 6:10 a.m. UTC | #2
On Sun, 15 Sep 2019 04:24:41 -0700, Lionel Landwerlin wrote:
>
> On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
> > OA perf unit supports non-power of 2 report sizes. Enable support for
> > these sizes in the driver.
> >
> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
> >   1 file changed, 21 insertions(+), 38 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> > index 50b6d154fd46..482fca3da7de 100644
> > --- a/drivers/gpu/drm/i915/i915_perf.c
> > +++ b/drivers/gpu/drm/i915/i915_perf.c
> > @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
> >	int report_size = stream->oa_buffer.format_size;
> >	unsigned long flags;
> > -	u32 hw_tail;
> > +	u32 hw_tail, aging_tail;
> >	u64 now;
> >		/* We have to consider the (unlikely) possibility that read()
> > errors
> > @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >	 */
> >	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
> >   -	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
> > +	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
> > +	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
> >		/* The tail pointer increases in 64 byte increments,
> >	 * not in report_size steps...
> >	 */
> > -	hw_tail &= ~(report_size - 1);
> > +	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
>
>
> I'm struggling to parse this line above and I'm not 100% sure it's correct.
>
> Could add a comment to explain what is going on?

Also for efficiency perhaps the modulo (%) should be replaced by a
increment, compare and wraparound?

Thanks!
--
Ashutosh
Umesh Nerlige Ramappa Sept. 16, 2019, 7:17 p.m. UTC | #3
On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
>On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
>>OA perf unit supports non-power of 2 report sizes. Enable support for
>>these sizes in the driver.
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
>>  1 file changed, 21 insertions(+), 38 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>>index 50b6d154fd46..482fca3da7de 100644
>>--- a/drivers/gpu/drm/i915/i915_perf.c
>>+++ b/drivers/gpu/drm/i915/i915_perf.c
>>@@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>>  	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>>  	int report_size = stream->oa_buffer.format_size;
>>  	unsigned long flags;
>>-	u32 hw_tail;
>>+	u32 hw_tail, aging_tail;
>>  	u64 now;
>>  	/* We have to consider the (unlikely) possibility that read() errors
>>@@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>>  	 */
>>  	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>>-	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
>>+	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
>>+	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
>>  	/* The tail pointer increases in 64 byte increments,
>>  	 * not in report_size steps...
>>  	 */
>>-	hw_tail &= ~(report_size - 1);
>>+	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
>
>
>I'm struggling to parse this line above and I'm not 100% sure it's correct.
>
>Could add a comment to explain what is going on?

The aging tail is always pointing to the boundary of a report whereas
the hw_tail is advancing in 64 byte increments.
 
The innermost OA_TAKEN is returning the number of bytes between the
hw_tail and the aging_tail. The modulo is getting the size of the
partial report (if any).

The outermost OA_TAKEN is subtracting the size of partial report from
the hw_tail to get a hw_tail that points to the boundary of the last
full report.

The value of hw_tail would be the same as from the deleted line of code
above this line.

Thanks,
Umesh

>
>
>Thanks,
>
>
>-Lionel
>
>
>>  	now = ktime_get_mono_fast_ns();
>>-	if (hw_tail == stream->oa_buffer.aging_tail) {
>>+	if (hw_tail == aging_tail) {
>>  		/* If the HW tail hasn't move since the last check and the HW
>>  		 * tail has been aging for long enough, declare it the new
>>  		 * tail.
>>@@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>>  		 * a read() in progress.
>>  		 */
>>  		head = stream->oa_buffer.head - gtt_offset;
>>-
>>-		hw_tail -= gtt_offset;
>>  		tail = hw_tail;
>>  		/* Walk the stream backward until we find at least 2 reports
>>@@ -613,7 +612,18 @@ static int append_oa_sample(struct i915_perf_stream *stream,
>>  	buf += sizeof(header);
>>  	if (sample_flags & SAMPLE_OA_REPORT) {
>>-		if (copy_to_user(buf, report, report_size))
>>+		u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
>>+		int report_size_partial = oa_buf_end - report;
>>+
>>+		if (report_size_partial < report_size) {
>>+			if (copy_to_user(buf, report, report_size_partial))
>>+				return -EFAULT;
>>+			buf += report_size_partial;
>>+
>>+			if (copy_to_user(buf, stream->oa_buffer.vaddr,
>>+					report_size - report_size_partial))
>>+				return -EFAULT;
>>+		} else if (copy_to_user(buf, report, report_size))
>>  			return -EFAULT;
>>  	}
>>@@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>>  	 * only be incremented by multiples of the report size (notably also
>>  	 * all a power of two).
>>  	 */
>>-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>-		      tail > OA_BUFFER_SIZE || tail % report_size,
>>+	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>+		      tail > OA_BUFFER_SIZE,
>>  		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>>  		      head, tail))
>>  		return -EIO;
>>@@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>>  		u32 ctx_id;
>>  		u32 reason;
>>-		/*
>>-		 * All the report sizes factor neatly into the buffer
>>-		 * size so we never expect to see a report split
>>-		 * between the beginning and end of the buffer.
>>-		 *
>>-		 * Given the initial alignment check a misalignment
>>-		 * here would imply a driver bug that would result
>>-		 * in an overrun.
>>-		 */
>>-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
>>-			break;
>>-		}
>>-
>>  		/*
>>  		 * The reason field includes flags identifying what
>>  		 * triggered this specific report (mostly timer
>>@@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>>  	 * only be incremented by multiples of the report size (notably also
>>  	 * all a power of two).
>>  	 */
>>-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>-		      tail > OA_BUFFER_SIZE || tail % report_size,
>>+	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>+		      tail > OA_BUFFER_SIZE,
>>  		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>>  		      head, tail))
>>  		return -EIO;
>>@@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>>  		u8 *report = oa_buf_base + head;
>>  		u32 *report32 = (void *)report;
>>-		/* All the report sizes factor neatly into the buffer
>>-		 * size so we never expect to see a report split
>>-		 * between the beginning and end of the buffer.
>>-		 *
>>-		 * Given the initial alignment check a misalignment
>>-		 * here would imply a driver bug that would result
>>-		 * in an overrun.
>>-		 */
>>-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
>>-			break;
>>-		}
>>-
>>  		/* The report-ID field for periodic samples includes
>>  		 * some undocumented flags related to what triggered
>>  		 * the report and is never expected to be zero so we
>
>
Dixit, Ashutosh Sept. 17, 2019, 4:11 a.m. UTC | #4
On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote:
>
> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
> > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
> >> OA perf unit supports non-power of 2 report sizes. Enable support for
> >> these sizes in the driver.
> >>
> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
> >>  1 file changed, 21 insertions(+), 38 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> >> index 50b6d154fd46..482fca3da7de 100644
> >> --- a/drivers/gpu/drm/i915/i915_perf.c
> >> +++ b/drivers/gpu/drm/i915/i915_perf.c
> >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >>	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
> >>	int report_size = stream->oa_buffer.format_size;
> >>	unsigned long flags;
> >> -	u32 hw_tail;
> >> +	u32 hw_tail, aging_tail;
> >>	u64 now;
> >>	/* We have to consider the (unlikely) possibility that read() errors
> >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >>	 */
> >>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
> >> -	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
> >> +	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
> >> +	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
> >>	/* The tail pointer increases in 64 byte increments,
> >>	 * not in report_size steps...
> >>	 */
> >> -	hw_tail &= ~(report_size - 1);
> >> +	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
> >
> >
> > I'm struggling to parse this line above and I'm not 100% sure it's correct.
> >
> > Could add a comment to explain what is going on?
>
> The aging tail is always pointing to the boundary of a report whereas
> the hw_tail is advancing in 64 byte increments.
>
> The innermost OA_TAKEN is returning the number of bytes between the
> hw_tail and the aging_tail. The modulo is getting the size of the
> partial report (if any).
>
> The outermost OA_TAKEN is subtracting the size of partial report from
> the hw_tail to get a hw_tail that points to the boundary of the last
> full report.
>
> The value of hw_tail would be the same as from the deleted line of code
> above this line.

Looks like aging_tail should not be needed (it is complicating the
expression and is not there in the original expression). All we need is a
"circular modulus". For example would the following work?

    if (hw_tail < report_size)
        hw_tail += OA_BUFFER_SIZE;
    hw_tail = rounddown(hw_tail, report_size);

Another way (if we want to avoid the division) would be to maintain a
cached copy of hw_tail in SW which is successively (and circularly)
incremented by report_size till it catches up with hw_tail read from
HW. But probably the first method above is simpler.
Umesh Nerlige Ramappa Sept. 17, 2019, 5:33 p.m. UTC | #5
On Mon, Sep 16, 2019 at 09:11:58PM -0700, Ashutosh Dixit wrote:
>On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote:
>>
>> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
>> > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
>> >> OA perf unit supports non-power of 2 report sizes. Enable support for
>> >> these sizes in the driver.
>> >>
>> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> >> ---
>> >>  drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
>> >>  1 file changed, 21 insertions(+), 38 deletions(-)
>> >>
>> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> >> index 50b6d154fd46..482fca3da7de 100644
>> >> --- a/drivers/gpu/drm/i915/i915_perf.c
>> >> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>> >>	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>> >>	int report_size = stream->oa_buffer.format_size;
>> >>	unsigned long flags;
>> >> -	u32 hw_tail;
>> >> +	u32 hw_tail, aging_tail;
>> >>	u64 now;
>> >>	/* We have to consider the (unlikely) possibility that read() errors
>> >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>> >>	 */
>> >>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>> >> -	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
>> >> +	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
>> >> +	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
>> >>	/* The tail pointer increases in 64 byte increments,
>> >>	 * not in report_size steps...
>> >>	 */
>> >> -	hw_tail &= ~(report_size - 1);
>> >> +	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
>> >
>> >
>> > I'm struggling to parse this line above and I'm not 100% sure it's correct.
>> >
>> > Could add a comment to explain what is going on?
>>
>> The aging tail is always pointing to the boundary of a report whereas
>> the hw_tail is advancing in 64 byte increments.
>>
>> The innermost OA_TAKEN is returning the number of bytes between the
>> hw_tail and the aging_tail. The modulo is getting the size of the
>> partial report (if any).
>>
>> The outermost OA_TAKEN is subtracting the size of partial report from
>> the hw_tail to get a hw_tail that points to the boundary of the last
>> full report.
>>
>> The value of hw_tail would be the same as from the deleted line of code
>> above this line.
>
>Looks like aging_tail should not be needed (it is complicating the
>expression and is not there in the original expression). All we need is a
>"circular modulus". For example would the following work?

original expression assumes all report sizes are powers of 2 and hence 
does not need a reference (like aging_tail) to rounddown the hw_tail.

>
>    if (hw_tail < report_size)
>        hw_tail += OA_BUFFER_SIZE;

Assuming that this condition is detecting a report split across the OA 
buffer boundary, the above check will not catch the split in cases where 
there are multiple reports generated before we read the hw_tail.

>    hw_tail = rounddown(hw_tail, report_size);
>
>Another way (if we want to avoid the division) would be to maintain a
>cached copy of hw_tail in SW which is successively (and circularly)
>incremented by report_size till it catches up with hw_tail read from
>HW. But probably the first method above is simpler.

aging_tail is a cached copy of the hw_tail that advances only in report 
size increments.

Thanks,
Umesh
Dixit, Ashutosh Sept. 18, 2019, 5:38 a.m. UTC | #6
On Tue, 17 Sep 2019 10:33:51 -0700, Umesh Nerlige Ramappa wrote:
>
> On Mon, Sep 16, 2019 at 09:11:58PM -0700, Ashutosh Dixit wrote:
> > On Mon, 16 Sep 2019 12:17:54 -0700, Umesh Nerlige Ramappa wrote:
> >>
> >> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
> >> > On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
> >> >> OA perf unit supports non-power of 2 report sizes. Enable support for
> >> >> these sizes in the driver.
> >> >>
> >> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >> >> ---
> >> >>  drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
> >> >>  1 file changed, 21 insertions(+), 38 deletions(-)
> >> >>
> >> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> >> >> index 50b6d154fd46..482fca3da7de 100644
> >> >> --- a/drivers/gpu/drm/i915/i915_perf.c
> >> >> +++ b/drivers/gpu/drm/i915/i915_perf.c
> >> >> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >> >>	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
> >> >>	int report_size = stream->oa_buffer.format_size;
> >> >>	unsigned long flags;
> >> >> -	u32 hw_tail;
> >> >> +	u32 hw_tail, aging_tail;
> >> >>	u64 now;
> >> >>	/* We have to consider the (unlikely) possibility that read() errors
> >> >> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
> >> >>	 */
> >> >>	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
> >> >> -	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
> >> >> +	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
> >> >> +	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
> >> >>	/* The tail pointer increases in 64 byte increments,
> >> >>	 * not in report_size steps...
> >> >>	 */
> >> >> -	hw_tail &= ~(report_size - 1);
> >> >> +	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
> >> >
> >> >
> >> > I'm struggling to parse this line above and I'm not 100% sure it's correct.
> >> >
> >> > Could add a comment to explain what is going on?
> >>
> >> The aging tail is always pointing to the boundary of a report whereas
> >> the hw_tail is advancing in 64 byte increments.
> >>
> >> The innermost OA_TAKEN is returning the number of bytes between the
> >> hw_tail and the aging_tail. The modulo is getting the size of the
> >> partial report (if any).
> >>
> >> The outermost OA_TAKEN is subtracting the size of partial report from
> >> the hw_tail to get a hw_tail that points to the boundary of the last
> >> full report.
> >>
> >> The value of hw_tail would be the same as from the deleted line of code
> >> above this line.
> >
> > Looks like aging_tail should not be needed (it is complicating the
> > expression and is not there in the original expression). All we need is a
> > "circular modulus". For example would the following work?
>
> original expression assumes all report sizes are powers of 2 and hence does
> not need a reference (like aging_tail) to rounddown the hw_tail.
>
> >
> >    if (hw_tail < report_size)
> >        hw_tail += OA_BUFFER_SIZE;
>
> Assuming that this condition is detecting a report split across the OA
> buffer boundary, the above check will not catch the split in cases where
> there are multiple reports generated before we read the hw_tail.
>
> >    hw_tail = rounddown(hw_tail, report_size);
> >
> > Another way (if we want to avoid the division) would be to maintain a
> > cached copy of hw_tail in SW which is successively (and circularly)
> > incremented by report_size till it catches up with hw_tail read from
> > HW. But probably the first method above is simpler.
>
> aging_tail is a cached copy of the hw_tail that advances only in report
> size increments.

Umesh is right, the previous aging_tail needs to be taken into
account. Basically we need to start from the previous aging_tail and
continue incrementing by report_size till we catch up with the new hw_tail
(at the previous report_size boundary, which gives the value of the new
aging_tail).
Lionel Landwerlin Sept. 18, 2019, 8:21 a.m. UTC | #7
On 16/09/2019 22:17, Umesh Nerlige Ramappa wrote:
> On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
>> On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
>>> OA perf unit supports non-power of 2 report sizes. Enable support for
>>> these sizes in the driver.
>>>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
>>>  1 file changed, 21 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
>>> b/drivers/gpu/drm/i915/i915_perf.c
>>> index 50b6d154fd46..482fca3da7de 100644
>>> --- a/drivers/gpu/drm/i915/i915_perf.c
>>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>>> @@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct 
>>> i915_perf_stream *stream)
>>>      u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>>>      int report_size = stream->oa_buffer.format_size;
>>>      unsigned long flags;
>>> -    u32 hw_tail;
>>> +    u32 hw_tail, aging_tail;
>>>      u64 now;
>>>      /* We have to consider the (unlikely) possibility that read() 
>>> errors
>>> @@ -459,16 +459,17 @@ static bool oa_buffer_check_unlocked(struct 
>>> i915_perf_stream *stream)
>>>       */
>>>      spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>>> -    hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
>>> +    hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
>>> +    aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
>>>      /* The tail pointer increases in 64 byte increments,
>>>       * not in report_size steps...
>>>       */
>>> -    hw_tail &= ~(report_size - 1);
>>> +    hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % 
>>> report_size));
>>
>>
>> I'm struggling to parse this line above and I'm not 100% sure it's 
>> correct.
>>
>> Could add a comment to explain what is going on?
>
> The aging tail is always pointing to the boundary of a report whereas
> the hw_tail is advancing in 64 byte increments.
>
> The innermost OA_TAKEN is returning the number of bytes between the
> hw_tail and the aging_tail. The modulo is getting the size of the
> partial report (if any).
>
> The outermost OA_TAKEN is subtracting the size of partial report from
> the hw_tail to get a hw_tail that points to the boundary of the last
> full report.
>
> The value of hw_tail would be the same as from the deleted line of code
> above this line.
>
> Thanks,
> Umesh


Thanks, I ran a few tests locally to convince myself it's correct :)


It's still a bit difficult to parse, probably because OA_TAKEN() wasn't 
meant for this.

Could create a helper function that does this computation, something 
like this :


static inline u32 align_hw_tail_to_report_boundary(u32 hw_tail, u32 
last_aligned_tail)

{

     /* Compute potentially partially landed report in the OA buffer */

     u32 partial_report_size = OA_TAKEN(hw_tail, last_aligned_tail) % 
report_size;

     /* Substract that partial amount off the tail. */

     return (hw_tail - partial_report_size) % OA_BUFFER_SIZE;

}


Cheers,


-Lionel


>
>>
>>
>> Thanks,
>>
>>
>> -Lionel
>>
>>
>>>      now = ktime_get_mono_fast_ns();
>>> -    if (hw_tail == stream->oa_buffer.aging_tail) {
>>> +    if (hw_tail == aging_tail) {
>>>          /* If the HW tail hasn't move since the last check and the HW
>>>           * tail has been aging for long enough, declare it the new
>>>           * tail.
>>> @@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct 
>>> i915_perf_stream *stream)
>>>           * a read() in progress.
>>>           */
>>>          head = stream->oa_buffer.head - gtt_offset;
>>> -
>>> -        hw_tail -= gtt_offset;
>>>          tail = hw_tail;
>>>          /* Walk the stream backward until we find at least 2 reports
>>> @@ -613,7 +612,18 @@ static int append_oa_sample(struct 
>>> i915_perf_stream *stream,
>>>      buf += sizeof(header);
>>>      if (sample_flags & SAMPLE_OA_REPORT) {
>>> -        if (copy_to_user(buf, report, report_size))
>>> +        u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
>>> +        int report_size_partial = oa_buf_end - report;
>>> +
>>> +        if (report_size_partial < report_size) {
>>> +            if (copy_to_user(buf, report, report_size_partial))
>>> +                return -EFAULT;
>>> +            buf += report_size_partial;
>>> +
>>> +            if (copy_to_user(buf, stream->oa_buffer.vaddr,
>>> +                    report_size - report_size_partial))
>>> +                return -EFAULT;
>>> +        } else if (copy_to_user(buf, report, report_size))
>>>              return -EFAULT;
>>>      }
>>> @@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct 
>>> i915_perf_stream *stream,
>>>       * only be incremented by multiples of the report size (notably 
>>> also
>>>       * all a power of two).
>>>       */
>>> -    if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>> -              tail > OA_BUFFER_SIZE || tail % report_size,
>>> +    if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>> +              tail > OA_BUFFER_SIZE,
>>>                "Inconsistent OA buffer pointers: head = %u, tail = 
>>> %u\n",
>>>                head, tail))
>>>          return -EIO;
>>> @@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct 
>>> i915_perf_stream *stream,
>>>          u32 ctx_id;
>>>          u32 reason;
>>> -        /*
>>> -         * All the report sizes factor neatly into the buffer
>>> -         * size so we never expect to see a report split
>>> -         * between the beginning and end of the buffer.
>>> -         *
>>> -         * Given the initial alignment check a misalignment
>>> -         * here would imply a driver bug that would result
>>> -         * in an overrun.
>>> -         */
>>> -        if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>> -            DRM_ERROR("Spurious OA head ptr: non-integral report 
>>> offset\n");
>>> -            break;
>>> -        }
>>> -
>>>          /*
>>>           * The reason field includes flags identifying what
>>>           * triggered this specific report (mostly timer
>>> @@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct 
>>> i915_perf_stream *stream,
>>>       * only be incremented by multiples of the report size (notably 
>>> also
>>>       * all a power of two).
>>>       */
>>> -    if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>> -              tail > OA_BUFFER_SIZE || tail % report_size,
>>> +    if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>> +              tail > OA_BUFFER_SIZE,
>>>                "Inconsistent OA buffer pointers: head = %u, tail = 
>>> %u\n",
>>>                head, tail))
>>>          return -EIO;
>>> @@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct 
>>> i915_perf_stream *stream,
>>>          u8 *report = oa_buf_base + head;
>>>          u32 *report32 = (void *)report;
>>> -        /* All the report sizes factor neatly into the buffer
>>> -         * size so we never expect to see a report split
>>> -         * between the beginning and end of the buffer.
>>> -         *
>>> -         * Given the initial alignment check a misalignment
>>> -         * here would imply a driver bug that would result
>>> -         * in an overrun.
>>> -         */
>>> -        if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>> -            DRM_ERROR("Spurious OA head ptr: non-integral report 
>>> offset\n");
>>> -            break;
>>> -        }
>>> -
>>>          /* The report-ID field for periodic samples includes
>>>           * some undocumented flags related to what triggered
>>>           * the report and is never expected to be zero so we
>>
>>
>
Umesh Nerlige Ramappa Sept. 18, 2019, 4:59 p.m. UTC | #8
On Wed, Sep 18, 2019 at 11:21:01AM +0300, Lionel Landwerlin wrote:
>On 16/09/2019 22:17, Umesh Nerlige Ramappa wrote:
>>On Sun, Sep 15, 2019 at 02:24:41PM +0300, Lionel Landwerlin wrote:
>>>On 14/09/2019 02:06, Umesh Nerlige Ramappa wrote:
>>>>OA perf unit supports non-power of 2 report sizes. Enable support for
>>>>these sizes in the driver.
>>>>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/i915_perf.c | 59 ++++++++++++--------------------
>>>> 1 file changed, 21 insertions(+), 38 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/i915_perf.c 
>>>>b/drivers/gpu/drm/i915/i915_perf.c
>>>>index 50b6d154fd46..482fca3da7de 100644
>>>>--- a/drivers/gpu/drm/i915/i915_perf.c
>>>>+++ b/drivers/gpu/drm/i915/i915_perf.c
>>>>@@ -450,7 +450,7 @@ static bool oa_buffer_check_unlocked(struct 
>>>>i915_perf_stream *stream)
>>>>     u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
>>>>     int report_size = stream->oa_buffer.format_size;
>>>>     unsigned long flags;
>>>>-    u32 hw_tail;
>>>>+    u32 hw_tail, aging_tail;
>>>>     u64 now;
>>>>     /* We have to consider the (unlikely) possibility that 
>>>>read() errors
>>>>@@ -459,16 +459,17 @@ static bool 
>>>>oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>>>>      */
>>>>     spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
>>>>-    hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
>>>>+    hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
>>>>+    aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
>>>>     /* The tail pointer increases in 64 byte increments,
>>>>      * not in report_size steps...
>>>>      */
>>>>-    hw_tail &= ~(report_size - 1);
>>>>+    hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) 
>>>>% report_size));
>>>
>>>
>>>I'm struggling to parse this line above and I'm not 100% sure it's 
>>>correct.
>>>
>>>Could add a comment to explain what is going on?
>>
>>The aging tail is always pointing to the boundary of a report whereas
>>the hw_tail is advancing in 64 byte increments.
>>
>>The innermost OA_TAKEN is returning the number of bytes between the
>>hw_tail and the aging_tail. The modulo is getting the size of the
>>partial report (if any).
>>
>>The outermost OA_TAKEN is subtracting the size of partial report from
>>the hw_tail to get a hw_tail that points to the boundary of the last
>>full report.
>>
>>The value of hw_tail would be the same as from the deleted line of code
>>above this line.
>>
>>Thanks,
>>Umesh
>
>
>Thanks, I ran a few tests locally to convince myself it's correct :)
>
>
>It's still a bit difficult to parse, probably because OA_TAKEN() 
>wasn't meant for this.
>
>Could create a helper function that does this computation, something 
>like this :
>
>
>static inline u32 align_hw_tail_to_report_boundary(u32 hw_tail, u32 
>last_aligned_tail)
>
>{
>
>    /* Compute potentially partially landed report in the OA buffer */
>
>    u32 partial_report_size = OA_TAKEN(hw_tail, last_aligned_tail) % 
>report_size;
>
>    /* Substract that partial amount off the tail. */
>
>    return (hw_tail - partial_report_size) % OA_BUFFER_SIZE;
>
>}

Sure, I can add a helper function to make this more readable.

Thanks,
Umesh

>
>
>Cheers,
>
>
>-Lionel
>
>
>>
>>>
>>>
>>>Thanks,
>>>
>>>
>>>-Lionel
>>>
>>>
>>>>     now = ktime_get_mono_fast_ns();
>>>>-    if (hw_tail == stream->oa_buffer.aging_tail) {
>>>>+    if (hw_tail == aging_tail) {
>>>>         /* If the HW tail hasn't move since the last check and the HW
>>>>          * tail has been aging for long enough, declare it the new
>>>>          * tail.
>>>>@@ -486,8 +487,6 @@ static bool oa_buffer_check_unlocked(struct 
>>>>i915_perf_stream *stream)
>>>>          * a read() in progress.
>>>>          */
>>>>         head = stream->oa_buffer.head - gtt_offset;
>>>>-
>>>>-        hw_tail -= gtt_offset;
>>>>         tail = hw_tail;
>>>>         /* Walk the stream backward until we find at least 2 reports
>>>>@@ -613,7 +612,18 @@ static int append_oa_sample(struct 
>>>>i915_perf_stream *stream,
>>>>     buf += sizeof(header);
>>>>     if (sample_flags & SAMPLE_OA_REPORT) {
>>>>-        if (copy_to_user(buf, report, report_size))
>>>>+        u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
>>>>+        int report_size_partial = oa_buf_end - report;
>>>>+
>>>>+        if (report_size_partial < report_size) {
>>>>+            if (copy_to_user(buf, report, report_size_partial))
>>>>+                return -EFAULT;
>>>>+            buf += report_size_partial;
>>>>+
>>>>+            if (copy_to_user(buf, stream->oa_buffer.vaddr,
>>>>+                    report_size - report_size_partial))
>>>>+                return -EFAULT;
>>>>+        } else if (copy_to_user(buf, report, report_size))
>>>>             return -EFAULT;
>>>>     }
>>>>@@ -682,8 +692,8 @@ static int gen8_append_oa_reports(struct 
>>>>i915_perf_stream *stream,
>>>>      * only be incremented by multiples of the report size 
>>>>(notably also
>>>>      * all a power of two).
>>>>      */
>>>>-    if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>>>-              tail > OA_BUFFER_SIZE || tail % report_size,
>>>>+    if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>>>+              tail > OA_BUFFER_SIZE,
>>>>               "Inconsistent OA buffer pointers: head = %u, tail 
>>>>= %u\n",
>>>>               head, tail))
>>>>         return -EIO;
>>>>@@ -697,20 +707,6 @@ static int gen8_append_oa_reports(struct 
>>>>i915_perf_stream *stream,
>>>>         u32 ctx_id;
>>>>         u32 reason;
>>>>-        /*
>>>>-         * All the report sizes factor neatly into the buffer
>>>>-         * size so we never expect to see a report split
>>>>-         * between the beginning and end of the buffer.
>>>>-         *
>>>>-         * Given the initial alignment check a misalignment
>>>>-         * here would imply a driver bug that would result
>>>>-         * in an overrun.
>>>>-         */
>>>>-        if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>>>-            DRM_ERROR("Spurious OA head ptr: non-integral 
>>>>report offset\n");
>>>>-            break;
>>>>-        }
>>>>-
>>>>         /*
>>>>          * The reason field includes flags identifying what
>>>>          * triggered this specific report (mostly timer
>>>>@@ -956,8 +952,8 @@ static int gen7_append_oa_reports(struct 
>>>>i915_perf_stream *stream,
>>>>      * only be incremented by multiples of the report size 
>>>>(notably also
>>>>      * all a power of two).
>>>>      */
>>>>-    if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>>>>-              tail > OA_BUFFER_SIZE || tail % report_size,
>>>>+    if (WARN_ONCE(head > OA_BUFFER_SIZE ||
>>>>+              tail > OA_BUFFER_SIZE,
>>>>               "Inconsistent OA buffer pointers: head = %u, tail 
>>>>= %u\n",
>>>>               head, tail))
>>>>         return -EIO;
>>>>@@ -969,19 +965,6 @@ static int gen7_append_oa_reports(struct 
>>>>i915_perf_stream *stream,
>>>>         u8 *report = oa_buf_base + head;
>>>>         u32 *report32 = (void *)report;
>>>>-        /* All the report sizes factor neatly into the buffer
>>>>-         * size so we never expect to see a report split
>>>>-         * between the beginning and end of the buffer.
>>>>-         *
>>>>-         * Given the initial alignment check a misalignment
>>>>-         * here would imply a driver bug that would result
>>>>-         * in an overrun.
>>>>-         */
>>>>-        if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>>>>-            DRM_ERROR("Spurious OA head ptr: non-integral 
>>>>report offset\n");
>>>>-            break;
>>>>-        }
>>>>-
>>>>         /* The report-ID field for periodic samples includes
>>>>          * some undocumented flags related to what triggered
>>>>          * the report and is never expected to be zero so we
>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 50b6d154fd46..482fca3da7de 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -450,7 +450,7 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
 	int report_size = stream->oa_buffer.format_size;
 	unsigned long flags;
-	u32 hw_tail;
+	u32 hw_tail, aging_tail;
 	u64 now;
 
 	/* We have to consider the (unlikely) possibility that read() errors
@@ -459,16 +459,17 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 	 */
 	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
-	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream);
+	hw_tail = dev_priv->perf.ops.oa_hw_tail_read(stream) - gtt_offset;
+	aging_tail = stream->oa_buffer.aging_tail - gtt_offset;
 
 	/* The tail pointer increases in 64 byte increments,
 	 * not in report_size steps...
 	 */
-	hw_tail &= ~(report_size - 1);
+	hw_tail = OA_TAKEN(hw_tail, (OA_TAKEN(hw_tail, aging_tail) % report_size));
 
 	now = ktime_get_mono_fast_ns();
 
-	if (hw_tail == stream->oa_buffer.aging_tail) {
+	if (hw_tail == aging_tail) {
 		/* If the HW tail hasn't move since the last check and the HW
 		 * tail has been aging for long enough, declare it the new
 		 * tail.
@@ -486,8 +487,6 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 		 * a read() in progress.
 		 */
 		head = stream->oa_buffer.head - gtt_offset;
-
-		hw_tail -= gtt_offset;
 		tail = hw_tail;
 
 		/* Walk the stream backward until we find at least 2 reports
@@ -613,7 +612,18 @@  static int append_oa_sample(struct i915_perf_stream *stream,
 	buf += sizeof(header);
 
 	if (sample_flags & SAMPLE_OA_REPORT) {
-		if (copy_to_user(buf, report, report_size))
+		u8 *oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
+		int report_size_partial = oa_buf_end - report;
+
+		if (report_size_partial < report_size) {
+			if (copy_to_user(buf, report, report_size_partial))
+				return -EFAULT;
+			buf += report_size_partial;
+
+			if (copy_to_user(buf, stream->oa_buffer.vaddr,
+					report_size - report_size_partial))
+				return -EFAULT;
+		} else if (copy_to_user(buf, report, report_size))
 			return -EFAULT;
 	}
 
@@ -682,8 +692,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
+	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
+		      tail > OA_BUFFER_SIZE,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -697,20 +707,6 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		u32 ctx_id;
 		u32 reason;
 
-		/*
-		 * All the report sizes factor neatly into the buffer
-		 * size so we never expect to see a report split
-		 * between the beginning and end of the buffer.
-		 *
-		 * Given the initial alignment check a misalignment
-		 * here would imply a driver bug that would result
-		 * in an overrun.
-		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
-			break;
-		}
-
 		/*
 		 * The reason field includes flags identifying what
 		 * triggered this specific report (mostly timer
@@ -956,8 +952,8 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	 * only be incremented by multiples of the report size (notably also
 	 * all a power of two).
 	 */
-	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
-		      tail > OA_BUFFER_SIZE || tail % report_size,
+	if (WARN_ONCE(head > OA_BUFFER_SIZE ||
+		      tail > OA_BUFFER_SIZE,
 		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 		      head, tail))
 		return -EIO;
@@ -969,19 +965,6 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
 
-		/* All the report sizes factor neatly into the buffer
-		 * size so we never expect to see a report split
-		 * between the beginning and end of the buffer.
-		 *
-		 * Given the initial alignment check a misalignment
-		 * here would imply a driver bug that would result
-		 * in an overrun.
-		 */
-		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
-			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
-			break;
-		}
-
 		/* The report-ID field for periodic samples includes
 		 * some undocumented flags related to what triggered
 		 * the report and is never expected to be zero so we