diff mbox series

[1/2] i915/uncore: Acquire fw before loop in intel_uncore_read64_2x32

Message ID 20221105003235.1717908-2-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series Fix live busy stats selftest failure | expand

Commit Message

Umesh Nerlige Ramappa Nov. 5, 2022, 12:32 a.m. UTC
PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
32 bit registers are read in a loop, there is a latency involved between
getting the GT timestamp and the CPU timestamp. As part of the
resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
uncore lock prior to reading upper and lower regs.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 14 deletions(-)

Comments

Tvrtko Ursulin Nov. 7, 2022, 10:13 a.m. UTC | #1
On 05/11/2022 00:32, Umesh Nerlige Ramappa wrote:
> PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
> 32 bit registers are read in a loop, there is a latency involved between
> getting the GT timestamp and the CPU timestamp. As part of the
> resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
> uncore lock prior to reading upper and lower regs.
> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
>   1 file changed, 30 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
> index 5449146a0624..e9e38490815d 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.h
> +++ b/drivers/gpu/drm/i915/intel_uncore.h
> @@ -382,20 +382,6 @@ __uncore_write(write_notrace, 32, l, false)
>    */
>   __uncore_read(read64, 64, q, true)
>   
> -static inline u64
> -intel_uncore_read64_2x32(struct intel_uncore *uncore,
> -			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> -{
> -	u32 upper, lower, old_upper, loop = 0;
> -	upper = intel_uncore_read(uncore, upper_reg);
> -	do {
> -		old_upper = upper;
> -		lower = intel_uncore_read(uncore, lower_reg);
> -		upper = intel_uncore_read(uncore, upper_reg);
> -	} while (upper != old_upper && loop++ < 2);
> -	return (u64)upper << 32 | lower;
> -}
> -
>   #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
>   #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
>   
> @@ -455,6 +441,36 @@ static inline void intel_uncore_rmw_fw(struct intel_uncore *uncore,
>   		intel_uncore_write_fw(uncore, reg, val);
>   }
>   
> +static inline u64
> +intel_uncore_read64_2x32(struct intel_uncore *uncore,
> +			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> +{
> +	u32 upper, lower, old_upper, loop = 0;
> +	enum forcewake_domains fw_domains;
> +	unsigned long flags;
> +
> +	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
> +						    FW_REG_READ);
> +
> +	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
> +						    FW_REG_READ);
> +
> +	spin_lock_irqsave(&uncore->lock, flags);
> +	intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> +	upper = intel_uncore_read_fw(uncore, upper_reg);
> +	do {
> +		old_upper = upper;
> +		lower = intel_uncore_read_fw(uncore, lower_reg);
> +		upper = intel_uncore_read_fw(uncore, upper_reg);
> +	} while (upper != old_upper && loop++ < 2);
> +
> +	intel_uncore_forcewake_put__locked(uncore, fw_domains);

I mulled over the fact this no longer applies the put hysteresis, but 
then I saw GuC busyness is essentially the only current caller so 
thought it doesn't really warrant adding a super long named 
intel_uncore_forcewake_put_delayed__locked helper.

Perhaps it would make sense to move this out of static inline, in which 
case it would also be easier to have the hysteresis without needing to 
export any new helpers, but mostly because it does not feel the static 
inline is justified. Sounds an attractive option but it is passable as is.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> +	spin_unlock_irqrestore(&uncore->lock, flags);
> +
> +	return (u64)upper << 32 | lower;
> +}
> +
>   static inline int intel_uncore_write_and_verify(struct intel_uncore *uncore,
>   						i915_reg_t reg, u32 val,
>   						u32 mask, u32 expected_val)
Dixit, Ashutosh Nov. 7, 2022, 9:23 p.m. UTC | #2
On Mon, 07 Nov 2022 02:13:46 -0800, Tvrtko Ursulin wrote:
>
> On 05/11/2022 00:32, Umesh Nerlige Ramappa wrote:
> > PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
> > 32 bit registers are read in a loop, there is a latency involved between
> > getting the GT timestamp and the CPU timestamp. As part of the
> > resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
> > uncore lock prior to reading upper and lower regs.
> >
> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
> >   1 file changed, 30 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
> > index 5449146a0624..e9e38490815d 100644
> > --- a/drivers/gpu/drm/i915/intel_uncore.h
> > +++ b/drivers/gpu/drm/i915/intel_uncore.h
> > @@ -382,20 +382,6 @@ __uncore_write(write_notrace, 32, l, false)
> >    */
> >   __uncore_read(read64, 64, q, true)
> >   -static inline u64
> > -intel_uncore_read64_2x32(struct intel_uncore *uncore,
> > -			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> > -{
> > -	u32 upper, lower, old_upper, loop = 0;
> > -	upper = intel_uncore_read(uncore, upper_reg);
> > -	do {
> > -		old_upper = upper;
> > -		lower = intel_uncore_read(uncore, lower_reg);
> > -		upper = intel_uncore_read(uncore, upper_reg);
> > -	} while (upper != old_upper && loop++ < 2);
> > -	return (u64)upper << 32 | lower;
> > -}
> > -
> >   #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
> >   #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
> >   @@ -455,6 +441,36 @@ static inline void intel_uncore_rmw_fw(struct
> > intel_uncore *uncore,
> >		intel_uncore_write_fw(uncore, reg, val);
> >   }
> >   +static inline u64
> > +intel_uncore_read64_2x32(struct intel_uncore *uncore,
> > +			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> > +{
> > +	u32 upper, lower, old_upper, loop = 0;
> > +	enum forcewake_domains fw_domains;
> > +	unsigned long flags;
> > +
> > +	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
> > +						    FW_REG_READ);
> > +
> > +	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
> > +						    FW_REG_READ);
> > +
> > +	spin_lock_irqsave(&uncore->lock, flags);
> > +	intel_uncore_forcewake_get__locked(uncore, fw_domains);
> > +
> > +	upper = intel_uncore_read_fw(uncore, upper_reg);
> > +	do {
> > +		old_upper = upper;
> > +		lower = intel_uncore_read_fw(uncore, lower_reg);
> > +		upper = intel_uncore_read_fw(uncore, upper_reg);
> > +	} while (upper != old_upper && loop++ < 2);
> > +
> > +	intel_uncore_forcewake_put__locked(uncore, fw_domains);
>
> I mulled over the fact this no longer applies the put hysteresis, but then
> I saw GuC busyness is essentially the only current caller so thought it
> doesn't really warrant adding a super long named
> intel_uncore_forcewake_put_delayed__locked helper.
>
> Perhaps it would make sense to move this out of static inline, in which
> case it would also be easier to have the hysteresis without needing to
> export any new helpers, but mostly because it does not feel the static
> inline is justified. Sounds an attractive option but it is passable as is.

Yup, copy that. Also see now how this reduces the read latency. And also it
would increase the latency a bit for a different thread trying to do an
uncore read/write since we hold uncore->lock longer but should be ok I
think.

> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Copy that too:

Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>

>
> > +	spin_unlock_irqrestore(&uncore->lock, flags);
> > +
> > +	return (u64)upper << 32 | lower;
> > +}
> > +
> >   static inline int intel_uncore_write_and_verify(struct intel_uncore *uncore,
> >						i915_reg_t reg, u32 val,
> >						u32 mask, u32 expected_val)
Umesh Nerlige Ramappa Nov. 8, 2022, 12:11 a.m. UTC | #3
On Mon, Nov 07, 2022 at 01:23:19PM -0800, Dixit, Ashutosh wrote:
>On Mon, 07 Nov 2022 02:13:46 -0800, Tvrtko Ursulin wrote:
>>
>> On 05/11/2022 00:32, Umesh Nerlige Ramappa wrote:
>> > PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
>> > 32 bit registers are read in a loop, there is a latency involved between
>> > getting the GT timestamp and the CPU timestamp. As part of the
>> > resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
>> > uncore lock prior to reading upper and lower regs.
>> >
>> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> > ---
>> >   drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
>> >   1 file changed, 30 insertions(+), 14 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
>> > index 5449146a0624..e9e38490815d 100644
>> > --- a/drivers/gpu/drm/i915/intel_uncore.h
>> > +++ b/drivers/gpu/drm/i915/intel_uncore.h
>> > @@ -382,20 +382,6 @@ __uncore_write(write_notrace, 32, l, false)
>> >    */
>> >   __uncore_read(read64, 64, q, true)
>> >   -static inline u64
>> > -intel_uncore_read64_2x32(struct intel_uncore *uncore,
>> > -			 i915_reg_t lower_reg, i915_reg_t upper_reg)
>> > -{
>> > -	u32 upper, lower, old_upper, loop = 0;
>> > -	upper = intel_uncore_read(uncore, upper_reg);
>> > -	do {
>> > -		old_upper = upper;
>> > -		lower = intel_uncore_read(uncore, lower_reg);
>> > -		upper = intel_uncore_read(uncore, upper_reg);
>> > -	} while (upper != old_upper && loop++ < 2);
>> > -	return (u64)upper << 32 | lower;
>> > -}
>> > -
>> >   #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
>> >   #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
>> >   @@ -455,6 +441,36 @@ static inline void intel_uncore_rmw_fw(struct
>> > intel_uncore *uncore,
>> >		intel_uncore_write_fw(uncore, reg, val);
>> >   }
>> >   +static inline u64
>> > +intel_uncore_read64_2x32(struct intel_uncore *uncore,
>> > +			 i915_reg_t lower_reg, i915_reg_t upper_reg)
>> > +{
>> > +	u32 upper, lower, old_upper, loop = 0;
>> > +	enum forcewake_domains fw_domains;
>> > +	unsigned long flags;
>> > +
>> > +	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
>> > +						    FW_REG_READ);
>> > +
>> > +	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
>> > +						    FW_REG_READ);
>> > +
>> > +	spin_lock_irqsave(&uncore->lock, flags);
>> > +	intel_uncore_forcewake_get__locked(uncore, fw_domains);
>> > +
>> > +	upper = intel_uncore_read_fw(uncore, upper_reg);
>> > +	do {
>> > +		old_upper = upper;
>> > +		lower = intel_uncore_read_fw(uncore, lower_reg);
>> > +		upper = intel_uncore_read_fw(uncore, upper_reg);
>> > +	} while (upper != old_upper && loop++ < 2);
>> > +
>> > +	intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>
>> I mulled over the fact this no longer applies the put hysteresis, but then
>> I saw GuC busyness is essentially the only current caller so thought it
>> doesn't really warrant adding a super long named
>> intel_uncore_forcewake_put_delayed__locked helper.
>>
>> Perhaps it would make sense to move this out of static inline,

Are you saying - drop the inline OR drop static inline? I am assuming 
the former.

>> in which
>> case it would also be easier to have the hysteresis without needing to
>> export any new helpers,

I don't understand this part. Do you mean that it makes it easier to 
just call __intel_uncore_forcewake_put(uncore, fw_domains, true) then?  
Just wondering how 'static inline' has any effect on that.

>> but mostly because it does not feel the static
>> inline is justified.

Agree, just carried it over from the previous helper definition.

>> Sounds an attractive option but it is passable as is.
>
>Yup, copy that. Also see now how this reduces the read latency. And also it
>would increase the latency a bit for a different thread trying to do an
>uncore read/write since we hold uncore->lock longer but should be ok I
>think.

Didn't think about it from that perspective. Worst case is that 
gt_park/gt_unpark may happen very frequently (as seen on some use 
cases). In that case, the unpark would end up calling this helper each 
time.

>
>> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
>Copy that too:
>
>Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>

Thanks,
Umesh
>

>>
>> > +	spin_unlock_irqrestore(&uncore->lock, flags);
>> > +
>> > +	return (u64)upper << 32 | lower;
>> > +}
>> > +
>> >   static inline int intel_uncore_write_and_verify(struct intel_uncore *uncore,
>> >						i915_reg_t reg, u32 val,
>> >						u32 mask, u32 expected_val)
Dixit, Ashutosh Nov. 8, 2022, 12:45 a.m. UTC | #4
On Mon, 07 Nov 2022 16:11:27 -0800, Umesh Nerlige Ramappa wrote:
>
> On Mon, Nov 07, 2022 at 01:23:19PM -0800, Dixit, Ashutosh wrote:
> > On Mon, 07 Nov 2022 02:13:46 -0800, Tvrtko Ursulin wrote:
> >>
> >> On 05/11/2022 00:32, Umesh Nerlige Ramappa wrote:
> >> > PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
> >> > 32 bit registers are read in a loop, there is a latency involved between
> >> > getting the GT timestamp and the CPU timestamp. As part of the
> >> > resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
> >> > uncore lock prior to reading upper and lower regs.
> >> >
> >> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >> > ---
> >> >   drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
> >> >   1 file changed, 30 insertions(+), 14 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
> >> > index 5449146a0624..e9e38490815d 100644
> >> > --- a/drivers/gpu/drm/i915/intel_uncore.h
> >> > +++ b/drivers/gpu/drm/i915/intel_uncore.h
> >> > @@ -382,20 +382,6 @@ __uncore_write(write_notrace, 32, l, false)
> >> >    */
> >> >   __uncore_read(read64, 64, q, true)
> >> >   -static inline u64
> >> > -intel_uncore_read64_2x32(struct intel_uncore *uncore,
> >> > -			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> >> > -{
> >> > -	u32 upper, lower, old_upper, loop = 0;
> >> > -	upper = intel_uncore_read(uncore, upper_reg);
> >> > -	do {
> >> > -		old_upper = upper;
> >> > -		lower = intel_uncore_read(uncore, lower_reg);
> >> > -		upper = intel_uncore_read(uncore, upper_reg);
> >> > -	} while (upper != old_upper && loop++ < 2);
> >> > -	return (u64)upper << 32 | lower;
> >> > -}
> >> > -
> >> >   #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
> >> >   #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
> >> >   @@ -455,6 +441,36 @@ static inline void intel_uncore_rmw_fw(struct
> >> > intel_uncore *uncore,
> >> >		intel_uncore_write_fw(uncore, reg, val);
> >> >   }
> >> >   +static inline u64
> >> > +intel_uncore_read64_2x32(struct intel_uncore *uncore,
> >> > +			 i915_reg_t lower_reg, i915_reg_t upper_reg)
> >> > +{
> >> > +	u32 upper, lower, old_upper, loop = 0;
> >> > +	enum forcewake_domains fw_domains;
> >> > +	unsigned long flags;
> >> > +
> >> > +	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
> >> > +						    FW_REG_READ);
> >> > +
> >> > +	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
> >> > +						    FW_REG_READ);
> >> > +
> >> > +	spin_lock_irqsave(&uncore->lock, flags);
> >> > +	intel_uncore_forcewake_get__locked(uncore, fw_domains);
> >> > +
> >> > +	upper = intel_uncore_read_fw(uncore, upper_reg);
> >> > +	do {
> >> > +		old_upper = upper;
> >> > +		lower = intel_uncore_read_fw(uncore, lower_reg);
> >> > +		upper = intel_uncore_read_fw(uncore, upper_reg);
> >> > +	} while (upper != old_upper && loop++ < 2);
> >> > +
> >> > +	intel_uncore_forcewake_put__locked(uncore, fw_domains);
> >>
> >> I mulled over the fact this no longer applies the put hysteresis, but then
> >> I saw GuC busyness is essentially the only current caller so thought it
> >> doesn't really warrant adding a super long named
> >> intel_uncore_forcewake_put_delayed__locked helper.
> >>
> >> Perhaps it would make sense to move this out of static inline,
>
> Are you saying - drop the inline OR drop static inline? I am assuming the
> former.

No you need to have 'static inline' for functions defined in a header
file. I also don't understand completely but seems what Tvrtko is saying is
move the function to the .c leaving only the declarations in the .h? Anyway
let Tvrtko explain more.

>
> >> in which
> >> case it would also be easier to have the hysteresis without needing to
> >> export any new helpers,
>
> I don't understand this part. Do you mean that it makes it easier to just
> call __intel_uncore_forcewake_put(uncore, fw_domains, true) then?

Yes I think this will work, drop the lock and call
__intel_uncore_forcewake_put.

> Just
> wondering how 'static inline' has any effect on that.
>
> >> but mostly because it does not feel the static
> >> inline is justified.
>
> Agree, just carried it over from the previous helper definition.
>
> >> Sounds an attractive option but it is passable as is.
> >
> > Yup, copy that. Also see now how this reduces the read latency. And also it
> > would increase the latency a bit for a different thread trying to do an
> > uncore read/write since we hold uncore->lock longer but should be ok I
> > think.
>
> Didn't think about it from that perspective. Worst case is that
> gt_park/gt_unpark may happen very frequently (as seen on some use
> cases). In that case, the unpark would end up calling this helper each
> time.
>
> >
> >> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >
> > Copy that too:
> >
> > Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
>
> Thanks,
> Umesh
> >
>
> >>
> >> > +	spin_unlock_irqrestore(&uncore->lock, flags);
> >> > +
> >> > +	return (u64)upper << 32 | lower;
> >> > +}
> >> > +
> >> >   static inline int intel_uncore_write_and_verify(struct intel_uncore *uncore,
> >> >						i915_reg_t reg, u32 val,
> >> >						u32 mask, u32 expected_val)
Tvrtko Ursulin Nov. 8, 2022, 10:06 a.m. UTC | #5
On 08/11/2022 00:45, Dixit, Ashutosh wrote:
> On Mon, 07 Nov 2022 16:11:27 -0800, Umesh Nerlige Ramappa wrote:
>>
>> On Mon, Nov 07, 2022 at 01:23:19PM -0800, Dixit, Ashutosh wrote:
>>> On Mon, 07 Nov 2022 02:13:46 -0800, Tvrtko Ursulin wrote:
>>>>
>>>> On 05/11/2022 00:32, Umesh Nerlige Ramappa wrote:
>>>>> PMU reads the GT timestamp as a 2x32 mmio read and since upper and lower
>>>>> 32 bit registers are read in a loop, there is a latency involved between
>>>>> getting the GT timestamp and the CPU timestamp. As part of the
>>>>> resolution, refactor intel_uncore_read64_2x32 to acquire forcewake and
>>>>> uncore lock prior to reading upper and lower regs.
>>>>>
>>>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/intel_uncore.h | 44 ++++++++++++++++++++---------
>>>>>    1 file changed, 30 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
>>>>> index 5449146a0624..e9e38490815d 100644
>>>>> --- a/drivers/gpu/drm/i915/intel_uncore.h
>>>>> +++ b/drivers/gpu/drm/i915/intel_uncore.h
>>>>> @@ -382,20 +382,6 @@ __uncore_write(write_notrace, 32, l, false)
>>>>>     */
>>>>>    __uncore_read(read64, 64, q, true)
>>>>>    -static inline u64
>>>>> -intel_uncore_read64_2x32(struct intel_uncore *uncore,
>>>>> -			 i915_reg_t lower_reg, i915_reg_t upper_reg)
>>>>> -{
>>>>> -	u32 upper, lower, old_upper, loop = 0;
>>>>> -	upper = intel_uncore_read(uncore, upper_reg);
>>>>> -	do {
>>>>> -		old_upper = upper;
>>>>> -		lower = intel_uncore_read(uncore, lower_reg);
>>>>> -		upper = intel_uncore_read(uncore, upper_reg);
>>>>> -	} while (upper != old_upper && loop++ < 2);
>>>>> -	return (u64)upper << 32 | lower;
>>>>> -}
>>>>> -
>>>>>    #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
>>>>>    #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
>>>>>    @@ -455,6 +441,36 @@ static inline void intel_uncore_rmw_fw(struct
>>>>> intel_uncore *uncore,
>>>>> 		intel_uncore_write_fw(uncore, reg, val);
>>>>>    }
>>>>>    +static inline u64
>>>>> +intel_uncore_read64_2x32(struct intel_uncore *uncore,
>>>>> +			 i915_reg_t lower_reg, i915_reg_t upper_reg)
>>>>> +{
>>>>> +	u32 upper, lower, old_upper, loop = 0;
>>>>> +	enum forcewake_domains fw_domains;
>>>>> +	unsigned long flags;
>>>>> +
>>>>> +	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
>>>>> +						    FW_REG_READ);
>>>>> +
>>>>> +	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
>>>>> +						    FW_REG_READ);
>>>>> +
>>>>> +	spin_lock_irqsave(&uncore->lock, flags);
>>>>> +	intel_uncore_forcewake_get__locked(uncore, fw_domains);
>>>>> +
>>>>> +	upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>> +	do {
>>>>> +		old_upper = upper;
>>>>> +		lower = intel_uncore_read_fw(uncore, lower_reg);
>>>>> +		upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>> +	} while (upper != old_upper && loop++ < 2);
>>>>> +
>>>>> +	intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>>>
>>>> I mulled over the fact this no longer applies the put hysteresis, but then
>>>> I saw GuC busyness is essentially the only current caller so thought it
>>>> doesn't really warrant adding a super long named
>>>> intel_uncore_forcewake_put_delayed__locked helper.
>>>>
>>>> Perhaps it would make sense to move this out of static inline,
>>
>> Are you saying - drop the inline OR drop static inline? I am assuming the
>> former.
> 
> No you need to have 'static inline' for functions defined in a header
> file. I also don't understand completely but seems what Tvrtko is saying is
> move the function to the .c leaving only the declarations in the .h? Anyway
> let Tvrtko explain more.

Yes I does not feel warranted for it to be a static inline so I'd just 
move it to .c. In which case..

>>>> in which
>>>> case it would also be easier to have the hysteresis without needing to
>>>> export any new helpers,
>>
>> I don't understand this part. Do you mean that it makes it easier to just
>> call __intel_uncore_forcewake_put(uncore, fw_domains, true) then?

.. you could indeed call this and keep the put hysteresis. But I don't 
think that it matters really. You can go with the patch as is for what I 
am concerned.

> Yes I think this will work, drop the lock and call
> __intel_uncore_forcewake_put.
> 
>> Just
>> wondering how 'static inline' has any effect on that.
>>
>>>> but mostly because it does not feel the static
>>>> inline is justified.
>>
>> Agree, just carried it over from the previous helper definition.
>>
>>>> Sounds an attractive option but it is passable as is.
>>>
>>> Yup, copy that. Also see now how this reduces the read latency. And also it
>>> would increase the latency a bit for a different thread trying to do an
>>> uncore read/write since we hold uncore->lock longer but should be ok I
>>> think.
>>
>> Didn't think about it from that perspective. Worst case is that
>> gt_park/gt_unpark may happen very frequently (as seen on some use
>> cases). In that case, the unpark would end up calling this helper each
>> time.

Concern is two mmio reads under the uncore lock versus two lock-unlock 
cycles with one mmio read under them each? Feels like a meh. I guess 
with this DC induced latency issue it's a worse worst case but 
difference between normal times and pathological spike is probably 
orders of magnitude right?

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h
index 5449146a0624..e9e38490815d 100644
--- a/drivers/gpu/drm/i915/intel_uncore.h
+++ b/drivers/gpu/drm/i915/intel_uncore.h
@@ -382,20 +382,6 @@  __uncore_write(write_notrace, 32, l, false)
  */
 __uncore_read(read64, 64, q, true)
 
-static inline u64
-intel_uncore_read64_2x32(struct intel_uncore *uncore,
-			 i915_reg_t lower_reg, i915_reg_t upper_reg)
-{
-	u32 upper, lower, old_upper, loop = 0;
-	upper = intel_uncore_read(uncore, upper_reg);
-	do {
-		old_upper = upper;
-		lower = intel_uncore_read(uncore, lower_reg);
-		upper = intel_uncore_read(uncore, upper_reg);
-	} while (upper != old_upper && loop++ < 2);
-	return (u64)upper << 32 | lower;
-}
-
 #define intel_uncore_posting_read(...) ((void)intel_uncore_read_notrace(__VA_ARGS__))
 #define intel_uncore_posting_read16(...) ((void)intel_uncore_read16_notrace(__VA_ARGS__))
 
@@ -455,6 +441,36 @@  static inline void intel_uncore_rmw_fw(struct intel_uncore *uncore,
 		intel_uncore_write_fw(uncore, reg, val);
 }
 
+static inline u64
+intel_uncore_read64_2x32(struct intel_uncore *uncore,
+			 i915_reg_t lower_reg, i915_reg_t upper_reg)
+{
+	u32 upper, lower, old_upper, loop = 0;
+	enum forcewake_domains fw_domains;
+	unsigned long flags;
+
+	fw_domains = intel_uncore_forcewake_for_reg(uncore, lower_reg,
+						    FW_REG_READ);
+
+	fw_domains |= intel_uncore_forcewake_for_reg(uncore, upper_reg,
+						    FW_REG_READ);
+
+	spin_lock_irqsave(&uncore->lock, flags);
+	intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+	upper = intel_uncore_read_fw(uncore, upper_reg);
+	do {
+		old_upper = upper;
+		lower = intel_uncore_read_fw(uncore, lower_reg);
+		upper = intel_uncore_read_fw(uncore, upper_reg);
+	} while (upper != old_upper && loop++ < 2);
+
+	intel_uncore_forcewake_put__locked(uncore, fw_domains);
+	spin_unlock_irqrestore(&uncore->lock, flags);
+
+	return (u64)upper << 32 | lower;
+}
+
 static inline int intel_uncore_write_and_verify(struct intel_uncore *uncore,
 						i915_reg_t reg, u32 val,
 						u32 mask, u32 expected_val)