diff mbox series

[v2] drm/i915/huc: Don't try to check HuC status if it's not loaded

Message ID 20190519215043.10712-1-michal.wajdeczko@intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915/huc: Don't try to check HuC status if it's not loaded | expand

Commit Message

Michal Wajdeczko May 19, 2019, 9:50 p.m. UTC
If we never attempted to load HuC firmware, or we already wedged
or reset GuC/HuC, then there is no reason to wake up the device
to check one bit in the register that will be for sure cleared.

v2: check if HuC was enabled; subtle change in ABI
    reuse hus_is_load helper

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tony Ye <tony.ye@intel.com>
---
 drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
 drivers/gpu/drm/i915/intel_huc.h |  5 +++++
 2 files changed, 17 insertions(+), 8 deletions(-)

Comments

Chris Wilson May 20, 2019, 9:35 a.m. UTC | #1
Quoting Michal Wajdeczko (2019-05-19 22:50:43)
> If we never attempted to load HuC firmware, or we already wedged
> or reset GuC/HuC, then there is no reason to wake up the device
> to check one bit in the register that will be for sure cleared.
> 
> v2: check if HuC was enabled; subtle change in ABI
>     reuse hus_is_load helper
> 
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tony Ye <tony.ye@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>  2 files changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_huc.c b/drivers/gpu/drm/i915/intel_huc.c
> index 1ff1fb015e58..bfdebec1cfc8 100644
> --- a/drivers/gpu/drm/i915/intel_huc.c
> +++ b/drivers/gpu/drm/i915/intel_huc.c
> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>         u32 status;
>         int ret;
>  
> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
> +       if (!intel_huc_is_loaded(huc))
>                 return -ENOEXEC;
>  
>         ret = intel_guc_auth_huc(guc,
> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>   * This function reads status register to verify if HuC
>   * firmware was successfully loaded.
>   *
> - * Returns: 1 if HuC firmware is loaded and verified,
> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
> - * is not present on this platform.
> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC firmware is not
> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if HuC is not
> + * present on this platform.
>   */
>  int intel_huc_check_status(struct intel_huc *huc)
>  {
>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>         intel_wakeref_t wakeref;
> -       bool status = false;
> +       bool verified = false;
>  
>         if (!HAS_HUC(dev_priv))
>                 return -ENODEV;
>  
> -       with_intel_runtime_pm(dev_priv, wakeref)
> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
> +       if (!USES_HUC(dev_priv))
> +               return 0;

Hmm, EOPNOTSUPP here for the user disabled case?

Then

if (!intel_huc_is_loaded(huc))
	return -ENOPKG;

bool verified = false;
with_intel_runtime_pm(dev_priv, wakeref)
	verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
return verified.

That keeps it a bit flatter with the special casing separate and the 0/1
result as given by the register status. Then if negative, we know one of
the preconditions for using HuC is not available, and if 0 something went
wrong in mmio setup.

Does that make sense?

> -       return status;
> +       if (intel_huc_is_loaded(huc))
> +               with_intel_runtime_pm(dev_priv, wakeref)
> +                       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
> +
> +       return verified ? 1 : -ENOPKG;
>  }
> diff --git a/drivers/gpu/drm/i915/intel_huc.h b/drivers/gpu/drm/i915/intel_huc.h
> index a0c21ae02a99..cde3d303718d 100644
> --- a/drivers/gpu/drm/i915/intel_huc.h
> +++ b/drivers/gpu/drm/i915/intel_huc.h
> @@ -44,6 +44,11 @@ void intel_huc_fini(struct intel_huc *huc);
>  int intel_huc_auth(struct intel_huc *huc);
>  int intel_huc_check_status(struct intel_huc *huc);
>  
> +static inline bool intel_huc_is_loaded(struct intel_huc *huc)
> +{
> +       return intel_uc_fw_is_loaded(&huc->fw);
> +}
> +
>  static inline void intel_huc_fini_misc(struct intel_huc *huc)
>  {
>         intel_uc_fw_cleanup_fetch(&huc->fw);
> -- 
> 2.19.2
>
Michal Wajdeczko May 20, 2019, 10:24 a.m. UTC | #2
On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson  
<chris@chris-wilson.co.uk> wrote:

> Quoting Michal Wajdeczko (2019-05-19 22:50:43)
>> If we never attempted to load HuC firmware, or we already wedged
>> or reset GuC/HuC, then there is no reason to wake up the device
>> to check one bit in the register that will be for sure cleared.
>>
>> v2: check if HuC was enabled; subtle change in ABI
>>     reuse hus_is_load helper
>>
>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Tony Ye <tony.ye@intel.com>
>> ---
>>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>>  2 files changed, 17 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_huc.c  
>> b/drivers/gpu/drm/i915/intel_huc.c
>> index 1ff1fb015e58..bfdebec1cfc8 100644
>> --- a/drivers/gpu/drm/i915/intel_huc.c
>> +++ b/drivers/gpu/drm/i915/intel_huc.c
>> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>>         u32 status;
>>         int ret;
>>
>> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
>> +       if (!intel_huc_is_loaded(huc))
>>                 return -ENOEXEC;
>>
>>         ret = intel_guc_auth_huc(guc,
>> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>>   * This function reads status register to verify if HuC
>>   * firmware was successfully loaded.
>>   *
>> - * Returns: 1 if HuC firmware is loaded and verified,
>> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
>> - * is not present on this platform.
>> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC  
>> firmware is not
>> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if HuC  
>> is not
>> + * present on this platform.
>>   */
>>  int intel_huc_check_status(struct intel_huc *huc)
>>  {
>>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>>         intel_wakeref_t wakeref;
>> -       bool status = false;
>> +       bool verified = false;
>>
>>         if (!HAS_HUC(dev_priv))
>>                 return -ENODEV;
>>
>> -       with_intel_runtime_pm(dev_priv, wakeref)
>> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>> +       if (!USES_HUC(dev_priv))
>> +               return 0;
>
> Hmm, EOPNOTSUPP here for the user disabled case?

I'm not sure that user disabled case shall be reported as error,
since it perfectly fits into legacy ABI 0="not loaded".
The only small change is that now 0="not loaded (not enabled by user)"

>
> Then
>
> if (!intel_huc_is_loaded(huc))
> 	return -ENOPKG;
>
> bool verified = false;
> with_intel_runtime_pm(dev_priv, wakeref)
> 	verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
> return verified.
>
> That keeps it a bit flatter with the special casing separate and the 0/1
> result as given by the register status. Then if negative, we know one of
> the preconditions for using HuC is not available, and if 0 something went
> wrong in mmio setup.
>
> Does that make sense?

IMHO, if something went wrong it is better to report it as error rather
then weak 0 status. Compare:

+---------------+-------+-------+-------+
  no hardware     -ENODEV -ENODEV -ENODEV
  fw disabled        0       0    -ENOTSUP
  fw not loaded      0    -ENOPKG -ENOPKG
  fw not verified*   0    -ENOPKG    0
  fw verified*       1       1       1
+---------------+-------+-------+-------+
*) registry access

Note that in your case, fw verification problem can be reported both
as -ENOPKG or as 0 (former when verification fails at load time, latter
as result of runtime reg read). Very likely we will never see 0 at all,
since probably that can't change once fw was verified at load time.
That might be viewed as undesired ABI change.

>
>> -       return status;
>> +       if (intel_huc_is_loaded(huc))
>> +               with_intel_runtime_pm(dev_priv, wakeref)
>> +                       verified = I915_READ(HUC_STATUS2) &  
>> HUC_FW_VERIFIED;
>> +
>> +       return verified ? 1 : -ENOPKG;
>>  }
>> diff --git a/drivers/gpu/drm/i915/intel_huc.h  
>> b/drivers/gpu/drm/i915/intel_huc.h
>> index a0c21ae02a99..cde3d303718d 100644
>> --- a/drivers/gpu/drm/i915/intel_huc.h
>> +++ b/drivers/gpu/drm/i915/intel_huc.h
>> @@ -44,6 +44,11 @@ void intel_huc_fini(struct intel_huc *huc);
>>  int intel_huc_auth(struct intel_huc *huc);
>>  int intel_huc_check_status(struct intel_huc *huc);
>>
>> +static inline bool intel_huc_is_loaded(struct intel_huc *huc)
>> +{
>> +       return intel_uc_fw_is_loaded(&huc->fw);
>> +}
>> +
>>  static inline void intel_huc_fini_misc(struct intel_huc *huc)
>>  {
>>         intel_uc_fw_cleanup_fetch(&huc->fw);
>> --
>> 2.19.2
Chris Wilson May 20, 2019, 10:44 a.m. UTC | #3
Quoting Michal Wajdeczko (2019-05-20 11:24:37)
> On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson  
> <chris@chris-wilson.co.uk> wrote:
> 
> > Quoting Michal Wajdeczko (2019-05-19 22:50:43)
> >> If we never attempted to load HuC firmware, or we already wedged
> >> or reset GuC/HuC, then there is no reason to wake up the device
> >> to check one bit in the register that will be for sure cleared.
> >>
> >> v2: check if HuC was enabled; subtle change in ABI
> >>     reuse hus_is_load helper
> >>
> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> Cc: Tony Ye <tony.ye@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
> >>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
> >>  2 files changed, 17 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/intel_huc.c  
> >> b/drivers/gpu/drm/i915/intel_huc.c
> >> index 1ff1fb015e58..bfdebec1cfc8 100644
> >> --- a/drivers/gpu/drm/i915/intel_huc.c
> >> +++ b/drivers/gpu/drm/i915/intel_huc.c
> >> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
> >>         u32 status;
> >>         int ret;
> >>
> >> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
> >> +       if (!intel_huc_is_loaded(huc))
> >>                 return -ENOEXEC;
> >>
> >>         ret = intel_guc_auth_huc(guc,
> >> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
> >>   * This function reads status register to verify if HuC
> >>   * firmware was successfully loaded.
> >>   *
> >> - * Returns: 1 if HuC firmware is loaded and verified,
> >> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
> >> - * is not present on this platform.
> >> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC  
> >> firmware is not
> >> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if HuC  
> >> is not
> >> + * present on this platform.
> >>   */
> >>  int intel_huc_check_status(struct intel_huc *huc)
> >>  {
> >>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
> >>         intel_wakeref_t wakeref;
> >> -       bool status = false;
> >> +       bool verified = false;
> >>
> >>         if (!HAS_HUC(dev_priv))
> >>                 return -ENODEV;
> >>
> >> -       with_intel_runtime_pm(dev_priv, wakeref)
> >> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
> >> +       if (!USES_HUC(dev_priv))
> >> +               return 0;
> >
> > Hmm, EOPNOTSUPP here for the user disabled case?
> 
> I'm not sure that user disabled case shall be reported as error,
> since it perfectly fits into legacy ABI 0="not loaded".
> The only small change is that now 0="not loaded (not enabled by user)"

The only requirement for the intel-media driver seems to "HUC_STATUS > 0".
That's our only user, so I think we have the liberty to redefine <=0 as
befits error reporting.

> 
> >
> > Then
> >
> > if (!intel_huc_is_loaded(huc))
> >       return -ENOPKG;
> >
> > bool verified = false;
> > with_intel_runtime_pm(dev_priv, wakeref)
> >       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
> > return verified.
> >
> > That keeps it a bit flatter with the special casing separate and the 0/1
> > result as given by the register status. Then if negative, we know one of
> > the preconditions for using HuC is not available, and if 0 something went
> > wrong in mmio setup.
> >
> > Does that make sense?
> 
> IMHO, if something went wrong it is better to report it as error rather
> then weak 0 status. Compare:

I disagree, <0 is the weak case. 0 is the HW reports something went
wrong (and only that). 1 is the HW reports all went well.

If we have more ways we can report the HW went wrong, sure expand that
into extra error codes, but I don't see that. And if it's in the regs,
why are not exporting them via reg_read_ioctl?

> +---------------+-------+-------+-------+
>   no hardware     -ENODEV -ENODEV -ENODEV
>   fw disabled        0       0    -ENOTSUP
>   fw not loaded      0    -ENOPKG -ENOPKG
>   fw not verified*   0    -ENOPKG    0
>   fw verified*       1       1       1
> +---------------+-------+-------+-------+
> *) registry access
> 
> Note that in your case, fw verification problem can be reported both
> as -ENOPKG or as 0 (former when verification fails at load time, latter
> as result of runtime reg read). Very likely we will never see 0 at all,
> since probably that can't change once fw was verified at load time.

I see more distinction for ENODEV/ENOTSUP/ENOPKG/0/1. I am not getting
your point that we have conflation here, as to me it looks more
distinct, and it is clear when we report the value as given by the
register and when we give an interpreted value for a driver error.

> That might be viewed as undesired ABI change.

grep says it is fine, so I'm happy that no one else cares (as can be
seen by the lack of igt, no one has looked hard enough into how to
distinguish the API failures ;)
-Chris
Michal Wajdeczko May 20, 2019, 12:16 p.m. UTC | #4
On Mon, 20 May 2019 12:44:43 +0200, Chris Wilson  
<chris@chris-wilson.co.uk> wrote:

> Quoting Michal Wajdeczko (2019-05-20 11:24:37)
>> On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson
>> <chris@chris-wilson.co.uk> wrote:
>>
>> > Quoting Michal Wajdeczko (2019-05-19 22:50:43)
>> >> If we never attempted to load HuC firmware, or we already wedged
>> >> or reset GuC/HuC, then there is no reason to wake up the device
>> >> to check one bit in the register that will be for sure cleared.
>> >>
>> >> v2: check if HuC was enabled; subtle change in ABI
>> >>     reuse hus_is_load helper
>> >>
>> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>> >> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> >> Cc: Tony Ye <tony.ye@intel.com>
>> >> ---
>> >>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>> >>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>> >>  2 files changed, 17 insertions(+), 8 deletions(-)
>> >>
>> >> diff --git a/drivers/gpu/drm/i915/intel_huc.c
>> >> b/drivers/gpu/drm/i915/intel_huc.c
>> >> index 1ff1fb015e58..bfdebec1cfc8 100644
>> >> --- a/drivers/gpu/drm/i915/intel_huc.c
>> >> +++ b/drivers/gpu/drm/i915/intel_huc.c
>> >> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>> >>         u32 status;
>> >>         int ret;
>> >>
>> >> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
>> >> +       if (!intel_huc_is_loaded(huc))
>> >>                 return -ENOEXEC;
>> >>
>> >>         ret = intel_guc_auth_huc(guc,
>> >> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>> >>   * This function reads status register to verify if HuC
>> >>   * firmware was successfully loaded.
>> >>   *
>> >> - * Returns: 1 if HuC firmware is loaded and verified,
>> >> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
>> >> - * is not present on this platform.
>> >> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC
>> >> firmware is not
>> >> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if HuC
>> >> is not
>> >> + * present on this platform.
>> >>   */
>> >>  int intel_huc_check_status(struct intel_huc *huc)
>> >>  {
>> >>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>> >>         intel_wakeref_t wakeref;
>> >> -       bool status = false;
>> >> +       bool verified = false;
>> >>
>> >>         if (!HAS_HUC(dev_priv))
>> >>                 return -ENODEV;
>> >>
>> >> -       with_intel_runtime_pm(dev_priv, wakeref)
>> >> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>> >> +       if (!USES_HUC(dev_priv))
>> >> +               return 0;
>> >
>> > Hmm, EOPNOTSUPP here for the user disabled case?
>>
>> I'm not sure that user disabled case shall be reported as error,
>> since it perfectly fits into legacy ABI 0="not loaded".
>> The only small change is that now 0="not loaded (not enabled by user)"
>
> The only requirement for the intel-media driver seems to "HUC_STATUS >  
> 0".
> That's our only user, so I think we have the liberty to redefine <=0 as
> befits error reporting.
>
>>
>> >
>> > Then
>> >
>> > if (!intel_huc_is_loaded(huc))
>> >       return -ENOPKG;
>> >
>> > bool verified = false;
>> > with_intel_runtime_pm(dev_priv, wakeref)
>> >       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>> > return verified.
>> >
>> > That keeps it a bit flatter with the special casing separate and the  
>> 0/1
>> > result as given by the register status. Then if negative, we know one  
>> of
>> > the preconditions for using HuC is not available, and if 0 something  
>> went
>> > wrong in mmio setup.
>> >
>> > Does that make sense?
>>
>> IMHO, if something went wrong it is better to report it as error rather
>> then weak 0 status. Compare:
>
> I disagree, <0 is the weak case. 0 is the HW reports something went
> wrong (and only that). 1 is the HW reports all went well.

But I was assuming that we're defining ABI here, not simply exposing  
unified
register value. Since we want 1 to report all ok, and we can fail for many
reasons (no hw, no fw blob, wrong fw, bad signature, ...) we should at  
least
try to match all these failures into error code (today just ENOPKG, but  
later
we can extend into ENOEXEC=bad fw, ENOSPC=can't fit into WOPCM, ...).

On other hand, at least for me, user decision to disable HuC should not
be treated the same way as other real failures, so 0 seems like perfect  
match.

Then we'll have: 1 = enabled, 0 = disabled, <0 = problem

>
> If we have more ways we can report the HW went wrong, sure expand that
> into extra error codes, but I don't see that. And if it's in the regs,
> why are not exporting them via reg_read_ioctl?
>
>> +---------------+-------+-------+-------+
>>   no hardware     -ENODEV -ENODEV -ENODEV
>>   fw disabled        0       0    -ENOTSUP
>>   fw not loaded      0    -ENOPKG -ENOPKG
>>   fw not verified*   0    -ENOPKG    0
>>   fw verified*       1       1       1
>> +---------------+-------+-------+-------+
>> *) registry access
>>
>> Note that in your case, fw verification problem can be reported both
>> as -ENOPKG or as 0 (former when verification fails at load time, latter
>> as result of runtime reg read). Very likely we will never see 0 at all,
>> since probably that can't change once fw was verified at load time.
>
> I see more distinction for ENODEV/ENOTSUP/ENOPKG/0/1. I am not getting
> your point that we have conflation here, as to me it looks more

Note that we are verifying HuC firmware right after it was loaded into
hardware using exactly the same register as here (see intel_huc_auth),
and if this fails we immediately change fw status to FAIL, so we never
go beyond huc_is_loaded point.

Assuming that once fw is verified it can't fail (to be verified later)
then your codes will be only ENODEV/ENOTSUP/ENOPKG/1 (no 0 value)

> distinct, and it is clear when we report the value as given by the
> register and when we give an interpreted value for a driver error.

Note that your initial idea was to reuse driver maintained fw status
instead of waking up device just to check register again (we don't do
that now as we need someone to confirm/verify that we can really skip
reg read). Use 0/1 for ABI as register values does not fit here, imho.

There is also a risk that we will report some known failures as <0
error codes, while other, captured by hw as just 0.

>
>> That might be viewed as undesired ABI change.
>
> grep says it is fine, so I'm happy that no one else cares (as can be
> seen by the lack of igt, no one has looked hard enough into how to
> distinguish the API failures ;)

it's always hard to come out from gray area into solid spec ;)

> -Chris
Ye, Tony May 21, 2019, 11:04 a.m. UTC | #5
There are two users of I915_PARAM_HUC_STATUS, the intel-media driver and 
the legacy intel-vaapi-driver.

Both drivers check the return value like this:

     has_huc = !! ret_value;

So the ABI change will break the existing stack because negative values 
are treated as 1, won't it?

On 5/20/2019 8:16 PM, Michal Wajdeczko wrote:

> On Mon, 20 May 2019 12:44:43 +0200, Chris Wilson 
> <chris@chris-wilson.co.uk> wrote:
>
>> Quoting Michal Wajdeczko (2019-05-20 11:24:37)
>>> On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson
>>> <chris@chris-wilson.co.uk> wrote:
>>>
>>> > Quoting Michal Wajdeczko (2019-05-19 22:50:43)
>>> >> If we never attempted to load HuC firmware, or we already wedged
>>> >> or reset GuC/HuC, then there is no reason to wake up the device
>>> >> to check one bit in the register that will be for sure cleared.
>>> >>
>>> >> v2: check if HuC was enabled; subtle change in ABI
>>> >>     reuse hus_is_load helper
>>> >>
>>> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> >> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> >> Cc: Tony Ye <tony.ye@intel.com>
>>> >> ---
>>> >>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>>> >>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>>> >>  2 files changed, 17 insertions(+), 8 deletions(-)
>>> >>
>>> >> diff --git a/drivers/gpu/drm/i915/intel_huc.c
>>> >> b/drivers/gpu/drm/i915/intel_huc.c
>>> >> index 1ff1fb015e58..bfdebec1cfc8 100644
>>> >> --- a/drivers/gpu/drm/i915/intel_huc.c
>>> >> +++ b/drivers/gpu/drm/i915/intel_huc.c
>>> >> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>>> >>         u32 status;
>>> >>         int ret;
>>> >>
>>> >> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
>>> >> +       if (!intel_huc_is_loaded(huc))
>>> >>                 return -ENOEXEC;
>>> >>
>>> >>         ret = intel_guc_auth_huc(guc,
>>> >> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>>> >>   * This function reads status register to verify if HuC
>>> >>   * firmware was successfully loaded.
>>> >>   *
>>> >> - * Returns: 1 if HuC firmware is loaded and verified,
>>> >> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
>>> >> - * is not present on this platform.
>>> >> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC
>>> >> firmware is not
>>> >> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if 
>>> HuC
>>> >> is not
>>> >> + * present on this platform.
>>> >>   */
>>> >>  int intel_huc_check_status(struct intel_huc *huc)
>>> >>  {
>>> >>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>>> >>         intel_wakeref_t wakeref;
>>> >> -       bool status = false;
>>> >> +       bool verified = false;
>>> >>
>>> >>         if (!HAS_HUC(dev_priv))
>>> >>                 return -ENODEV;
>>> >>
>>> >> -       with_intel_runtime_pm(dev_priv, wakeref)
>>> >> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>> >> +       if (!USES_HUC(dev_priv))
>>> >> +               return 0;
>>> >
>>> > Hmm, EOPNOTSUPP here for the user disabled case?
>>>
>>> I'm not sure that user disabled case shall be reported as error,
>>> since it perfectly fits into legacy ABI 0="not loaded".
>>> The only small change is that now 0="not loaded (not enabled by user)"
>>
>> The only requirement for the intel-media driver seems to "HUC_STATUS 
>> > 0".
>> That's our only user, so I think we have the liberty to redefine <=0 as
>> befits error reporting.
>>
>>>
>>> >
>>> > Then
>>> >
>>> > if (!intel_huc_is_loaded(huc))
>>> >       return -ENOPKG;
>>> >
>>> > bool verified = false;
>>> > with_intel_runtime_pm(dev_priv, wakeref)
>>> >       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>> > return verified.
>>> >
>>> > That keeps it a bit flatter with the special casing separate and 
>>> the 0/1
>>> > result as given by the register status. Then if negative, we know 
>>> one of
>>> > the preconditions for using HuC is not available, and if 0 
>>> something went
>>> > wrong in mmio setup.
>>> >
>>> > Does that make sense?
>>>
>>> IMHO, if something went wrong it is better to report it as error rather
>>> then weak 0 status. Compare:
>>
>> I disagree, <0 is the weak case. 0 is the HW reports something went
>> wrong (and only that). 1 is the HW reports all went well.
>
> But I was assuming that we're defining ABI here, not simply exposing 
> unified
> register value. Since we want 1 to report all ok, and we can fail for 
> many
> reasons (no hw, no fw blob, wrong fw, bad signature, ...) we should at 
> least
> try to match all these failures into error code (today just ENOPKG, 
> but later
> we can extend into ENOEXEC=bad fw, ENOSPC=can't fit into WOPCM, ...).
>
> On other hand, at least for me, user decision to disable HuC should not
> be treated the same way as other real failures, so 0 seems like 
> perfect match.
>
> Then we'll have: 1 = enabled, 0 = disabled, <0 = problem
>
>>
>> If we have more ways we can report the HW went wrong, sure expand that
>> into extra error codes, but I don't see that. And if it's in the regs,
>> why are not exporting them via reg_read_ioctl?
>>
>>> +---------------+-------+-------+-------+
>>>   no hardware     -ENODEV -ENODEV -ENODEV
>>>   fw disabled        0       0    -ENOTSUP
>>>   fw not loaded      0    -ENOPKG -ENOPKG
>>>   fw not verified*   0    -ENOPKG    0
>>>   fw verified*       1       1       1
>>> +---------------+-------+-------+-------+
>>> *) registry access
>>>
>>> Note that in your case, fw verification problem can be reported both
>>> as -ENOPKG or as 0 (former when verification fails at load time, latter
>>> as result of runtime reg read). Very likely we will never see 0 at all,
>>> since probably that can't change once fw was verified at load time.
>>
>> I see more distinction for ENODEV/ENOTSUP/ENOPKG/0/1. I am not getting
>> your point that we have conflation here, as to me it looks more
>
> Note that we are verifying HuC firmware right after it was loaded into
> hardware using exactly the same register as here (see intel_huc_auth),
> and if this fails we immediately change fw status to FAIL, so we never
> go beyond huc_is_loaded point.
>
> Assuming that once fw is verified it can't fail (to be verified later)
> then your codes will be only ENODEV/ENOTSUP/ENOPKG/1 (no 0 value)
>
>> distinct, and it is clear when we report the value as given by the
>> register and when we give an interpreted value for a driver error.
>
> Note that your initial idea was to reuse driver maintained fw status
> instead of waking up device just to check register again (we don't do
> that now as we need someone to confirm/verify that we can really skip
> reg read). Use 0/1 for ABI as register values does not fit here, imho.
>
> There is also a risk that we will report some known failures as <0
> error codes, while other, captured by hw as just 0.
>
>>
>>> That might be viewed as undesired ABI change.
>>
>> grep says it is fine, so I'm happy that no one else cares (as can be
>> seen by the lack of igt, no one has looked hard enough into how to
>> distinguish the API failures ;)
>
> it's always hard to come out from gray area into solid spec ;)
>
>> -Chris
Michal Wajdeczko May 21, 2019, 11:08 a.m. UTC | #6
On Tue, 21 May 2019 13:04:12 +0200, Ye, Tony <tony.ye@intel.com> wrote:

> There are two users of I915_PARAM_HUC_STATUS, the intel-media driver and  
> the legacy intel-vaapi-driver.
>
> Both drivers check the return value like this:
>
>      has_huc = !! ret_value;
>
> So the ABI change will break the existing stack because negative values  
> are treated as 1, won't it?

No, since negative errors returned by i915 intel_huc_check_status()  
function
will be only used to indicate ioctl() call failure and then stored in  
errno.
Note that your "ret_value" will be modified only when ioctl is successful.

Michal

>
> On 5/20/2019 8:16 PM, Michal Wajdeczko wrote:
>
>> On Mon, 20 May 2019 12:44:43 +0200, Chris Wilson  
>> <chris@chris-wilson.co.uk> wrote:
>>
>>> Quoting Michal Wajdeczko (2019-05-20 11:24:37)
>>>> On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson
>>>> <chris@chris-wilson.co.uk> wrote:
>>>>
>>>> > Quoting Michal Wajdeczko (2019-05-19 22:50:43)
>>>> >> If we never attempted to load HuC firmware, or we already wedged
>>>> >> or reset GuC/HuC, then there is no reason to wake up the device
>>>> >> to check one bit in the register that will be for sure cleared.
>>>> >>
>>>> >> v2: check if HuC was enabled; subtle change in ABI
>>>> >>     reuse hus_is_load helper
>>>> >>
>>>> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> >> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>> >> Cc: Tony Ye <tony.ye@intel.com>
>>>> >> ---
>>>> >>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>>>> >>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>>>> >>  2 files changed, 17 insertions(+), 8 deletions(-)
>>>> >>
>>>> >> diff --git a/drivers/gpu/drm/i915/intel_huc.c
>>>> >> b/drivers/gpu/drm/i915/intel_huc.c
>>>> >> index 1ff1fb015e58..bfdebec1cfc8 100644
>>>> >> --- a/drivers/gpu/drm/i915/intel_huc.c
>>>> >> +++ b/drivers/gpu/drm/i915/intel_huc.c
>>>> >> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>>>> >>         u32 status;
>>>> >>         int ret;
>>>> >>
>>>> >> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
>>>> >> +       if (!intel_huc_is_loaded(huc))
>>>> >>                 return -ENOEXEC;
>>>> >>
>>>> >>         ret = intel_guc_auth_huc(guc,
>>>> >> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>>>> >>   * This function reads status register to verify if HuC
>>>> >>   * firmware was successfully loaded.
>>>> >>   *
>>>> >> - * Returns: 1 if HuC firmware is loaded and verified,
>>>> >> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
>>>> >> - * is not present on this platform.
>>>> >> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC
>>>> >> firmware is not
>>>> >> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if  
>>>> HuC
>>>> >> is not
>>>> >> + * present on this platform.
>>>> >>   */
>>>> >>  int intel_huc_check_status(struct intel_huc *huc)
>>>> >>  {
>>>> >>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>>>> >>         intel_wakeref_t wakeref;
>>>> >> -       bool status = false;
>>>> >> +       bool verified = false;
>>>> >>
>>>> >>         if (!HAS_HUC(dev_priv))
>>>> >>                 return -ENODEV;
>>>> >>
>>>> >> -       with_intel_runtime_pm(dev_priv, wakeref)
>>>> >> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>>> >> +       if (!USES_HUC(dev_priv))
>>>> >> +               return 0;
>>>> >
>>>> > Hmm, EOPNOTSUPP here for the user disabled case?
>>>>
>>>> I'm not sure that user disabled case shall be reported as error,
>>>> since it perfectly fits into legacy ABI 0="not loaded".
>>>> The only small change is that now 0="not loaded (not enabled by user)"
>>>
>>> The only requirement for the intel-media driver seems to "HUC_STATUS >  
>>> 0".
>>> That's our only user, so I think we have the liberty to redefine <=0 as
>>> befits error reporting.
>>>
>>>>
>>>> >
>>>> > Then
>>>> >
>>>> > if (!intel_huc_is_loaded(huc))
>>>> >       return -ENOPKG;
>>>> >
>>>> > bool verified = false;
>>>> > with_intel_runtime_pm(dev_priv, wakeref)
>>>> >       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>>> > return verified.
>>>> >
>>>> > That keeps it a bit flatter with the special casing separate and  
>>>> the 0/1
>>>> > result as given by the register status. Then if negative, we know  
>>>> one of
>>>> > the preconditions for using HuC is not available, and if 0  
>>>> something went
>>>> > wrong in mmio setup.
>>>> >
>>>> > Does that make sense?
>>>>
>>>> IMHO, if something went wrong it is better to report it as error  
>>>> rather
>>>> then weak 0 status. Compare:
>>>
>>> I disagree, <0 is the weak case. 0 is the HW reports something went
>>> wrong (and only that). 1 is the HW reports all went well.
>>
>> But I was assuming that we're defining ABI here, not simply exposing  
>> unified
>> register value. Since we want 1 to report all ok, and we can fail for  
>> many
>> reasons (no hw, no fw blob, wrong fw, bad signature, ...) we should at  
>> least
>> try to match all these failures into error code (today just ENOPKG, but  
>> later
>> we can extend into ENOEXEC=bad fw, ENOSPC=can't fit into WOPCM, ...).
>>
>> On other hand, at least for me, user decision to disable HuC should not
>> be treated the same way as other real failures, so 0 seems like perfect  
>> match.
>>
>> Then we'll have: 1 = enabled, 0 = disabled, <0 = problem
>>
>>>
>>> If we have more ways we can report the HW went wrong, sure expand that
>>> into extra error codes, but I don't see that. And if it's in the regs,
>>> why are not exporting them via reg_read_ioctl?
>>>
>>>> +---------------+-------+-------+-------+
>>>>   no hardware     -ENODEV -ENODEV -ENODEV
>>>>   fw disabled        0       0    -ENOTSUP
>>>>   fw not loaded      0    -ENOPKG -ENOPKG
>>>>   fw not verified*   0    -ENOPKG    0
>>>>   fw verified*       1       1       1
>>>> +---------------+-------+-------+-------+
>>>> *) registry access
>>>>
>>>> Note that in your case, fw verification problem can be reported both
>>>> as -ENOPKG or as 0 (former when verification fails at load time,  
>>>> latter
>>>> as result of runtime reg read). Very likely we will never see 0 at  
>>>> all,
>>>> since probably that can't change once fw was verified at load time.
>>>
>>> I see more distinction for ENODEV/ENOTSUP/ENOPKG/0/1. I am not getting
>>> your point that we have conflation here, as to me it looks more
>>
>> Note that we are verifying HuC firmware right after it was loaded into
>> hardware using exactly the same register as here (see intel_huc_auth),
>> and if this fails we immediately change fw status to FAIL, so we never
>> go beyond huc_is_loaded point.
>>
>> Assuming that once fw is verified it can't fail (to be verified later)
>> then your codes will be only ENODEV/ENOTSUP/ENOPKG/1 (no 0 value)
>>
>>> distinct, and it is clear when we report the value as given by the
>>> register and when we give an interpreted value for a driver error.
>>
>> Note that your initial idea was to reuse driver maintained fw status
>> instead of waking up device just to check register again (we don't do
>> that now as we need someone to confirm/verify that we can really skip
>> reg read). Use 0/1 for ABI as register values does not fit here, imho.
>>
>> There is also a risk that we will report some known failures as <0
>> error codes, while other, captured by hw as just 0.
>>
>>>
>>>> That might be viewed as undesired ABI change.
>>>
>>> grep says it is fine, so I'm happy that no one else cares (as can be
>>> seen by the lack of igt, no one has looked hard enough into how to
>>> distinguish the API failures ;)
>>
>> it's always hard to come out from gray area into solid spec ;)
>>
>>> -Chris
Ye, Tony May 22, 2019, 11:32 a.m. UTC | #7
On 5/21/2019 7:08 PM, Michal Wajdeczko wrote:
> On Tue, 21 May 2019 13:04:12 +0200, Ye, Tony <tony.ye@intel.com> wrote:
>
>> There are two users of I915_PARAM_HUC_STATUS, the intel-media driver 
>> and the legacy intel-vaapi-driver.
>>
>> Both drivers check the return value like this:
>>
>>      has_huc = !! ret_value;
>>
>> So the ABI change will break the existing stack because negative 
>> values are treated as 1, won't it?
>
> No, since negative errors returned by i915 intel_huc_check_status() 
> function
> will be only used to indicate ioctl() call failure and then stored in 
> errno.
> Note that your "ret_value" will be modified only when ioctl is 
> successful.
>
> Michal
Yeah, you are right.
>
>>
>> On 5/20/2019 8:16 PM, Michal Wajdeczko wrote:
>>
>>> On Mon, 20 May 2019 12:44:43 +0200, Chris Wilson 
>>> <chris@chris-wilson.co.uk> wrote:
>>>
>>>> Quoting Michal Wajdeczko (2019-05-20 11:24:37)
>>>>> On Mon, 20 May 2019 11:35:26 +0200, Chris Wilson
>>>>> <chris@chris-wilson.co.uk> wrote:
>>>>>
>>>>> > Quoting Michal Wajdeczko (2019-05-19 22:50:43)
>>>>> >> If we never attempted to load HuC firmware, or we already wedged
>>>>> >> or reset GuC/HuC, then there is no reason to wake up the device
>>>>> >> to check one bit in the register that will be for sure cleared.
>>>>> >>
>>>>> >> v2: check if HuC was enabled; subtle change in ABI
>>>>> >>     reuse hus_is_load helper
>>>>> >>
>>>>> >> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> >> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>>>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> >> Cc: Tony Ye <tony.ye@intel.com>
>>>>> >> ---
>>>>> >>  drivers/gpu/drm/i915/intel_huc.c | 20 ++++++++++++--------
>>>>> >>  drivers/gpu/drm/i915/intel_huc.h |  5 +++++
>>>>> >>  2 files changed, 17 insertions(+), 8 deletions(-)
>>>>> >>
>>>>> >> diff --git a/drivers/gpu/drm/i915/intel_huc.c
>>>>> >> b/drivers/gpu/drm/i915/intel_huc.c
>>>>> >> index 1ff1fb015e58..bfdebec1cfc8 100644
>>>>> >> --- a/drivers/gpu/drm/i915/intel_huc.c
>>>>> >> +++ b/drivers/gpu/drm/i915/intel_huc.c
>>>>> >> @@ -113,7 +113,7 @@ int intel_huc_auth(struct intel_huc *huc)
>>>>> >>         u32 status;
>>>>> >>         int ret;
>>>>> >>
>>>>> >> -       if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
>>>>> >> +       if (!intel_huc_is_loaded(huc))
>>>>> >>                 return -ENOEXEC;
>>>>> >>
>>>>> >>         ret = intel_guc_auth_huc(guc,
>>>>> >> @@ -150,21 +150,25 @@ int intel_huc_auth(struct intel_huc *huc)
>>>>> >>   * This function reads status register to verify if HuC
>>>>> >>   * firmware was successfully loaded.
>>>>> >>   *
>>>>> >> - * Returns: 1 if HuC firmware is loaded and verified,
>>>>> >> - * 0 if HuC firmware is not loaded and -ENODEV if HuC
>>>>> >> - * is not present on this platform.
>>>>> >> + * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC
>>>>> >> firmware is not
>>>>> >> + * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV 
>>>>> if HuC
>>>>> >> is not
>>>>> >> + * present on this platform.
>>>>> >>   */
>>>>> >>  int intel_huc_check_status(struct intel_huc *huc)
>>>>> >>  {
>>>>> >>         struct drm_i915_private *dev_priv = huc_to_i915(huc);
>>>>> >>         intel_wakeref_t wakeref;
>>>>> >> -       bool status = false;
>>>>> >> +       bool verified = false;
>>>>> >>
>>>>> >>         if (!HAS_HUC(dev_priv))
>>>>> >>                 return -ENODEV;
>>>>> >>
>>>>> >> -       with_intel_runtime_pm(dev_priv, wakeref)
>>>>> >> -               status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>>>> >> +       if (!USES_HUC(dev_priv))
>>>>> >> +               return 0;
>>>>> >
>>>>> > Hmm, EOPNOTSUPP here for the user disabled case?
>>>>>
>>>>> I'm not sure that user disabled case shall be reported as error,
>>>>> since it perfectly fits into legacy ABI 0="not loaded".
>>>>> The only small change is that now 0="not loaded (not enabled by 
>>>>> user)"
>>>>
>>>> The only requirement for the intel-media driver seems to 
>>>> "HUC_STATUS > 0".
>>>> That's our only user, so I think we have the liberty to redefine 
>>>> <=0 as
>>>> befits error reporting.
>>>>
>>>>>
>>>>> >
>>>>> > Then
>>>>> >
>>>>> > if (!intel_huc_is_loaded(huc))
>>>>> >       return -ENOPKG;
>>>>> >
>>>>> > bool verified = false;
>>>>> > with_intel_runtime_pm(dev_priv, wakeref)
>>>>> >       verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
>>>>> > return verified.
>>>>> >
>>>>> > That keeps it a bit flatter with the special casing separate and 
>>>>> the 0/1
>>>>> > result as given by the register status. Then if negative, we 
>>>>> know one of
>>>>> > the preconditions for using HuC is not available, and if 0 
>>>>> something went
>>>>> > wrong in mmio setup.
>>>>> >
>>>>> > Does that make sense?
>>>>>
>>>>> IMHO, if something went wrong it is better to report it as error 
>>>>> rather
>>>>> then weak 0 status. Compare:
>>>>
>>>> I disagree, <0 is the weak case. 0 is the HW reports something went
>>>> wrong (and only that). 1 is the HW reports all went well.
>>>
>>> But I was assuming that we're defining ABI here, not simply exposing 
>>> unified
>>> register value. Since we want 1 to report all ok, and we can fail 
>>> for many
>>> reasons (no hw, no fw blob, wrong fw, bad signature, ...) we should 
>>> at least
>>> try to match all these failures into error code (today just ENOPKG, 
>>> but later
>>> we can extend into ENOEXEC=bad fw, ENOSPC=can't fit into WOPCM, ...).
>>>
>>> On other hand, at least for me, user decision to disable HuC should not
>>> be treated the same way as other real failures, so 0 seems like 
>>> perfect match.
>>>
>>> Then we'll have: 1 = enabled, 0 = disabled, <0 = problem
>>>
>>>>
>>>> If we have more ways we can report the HW went wrong, sure expand that
>>>> into extra error codes, but I don't see that. And if it's in the regs,
>>>> why are not exporting them via reg_read_ioctl?
>>>>
>>>>> +---------------+-------+-------+-------+
>>>>>   no hardware     -ENODEV -ENODEV -ENODEV
>>>>>   fw disabled        0       0    -ENOTSUP
>>>>>   fw not loaded      0    -ENOPKG -ENOPKG
>>>>>   fw not verified*   0    -ENOPKG    0
>>>>>   fw verified*       1       1       1
>>>>> +---------------+-------+-------+-------+
>>>>> *) registry access
>>>>>
>>>>> Note that in your case, fw verification problem can be reported both
>>>>> as -ENOPKG or as 0 (former when verification fails at load time, 
>>>>> latter
>>>>> as result of runtime reg read). Very likely we will never see 0 at 
>>>>> all,
>>>>> since probably that can't change once fw was verified at load time.
>>>>
>>>> I see more distinction for ENODEV/ENOTSUP/ENOPKG/0/1. I am not getting
>>>> your point that we have conflation here, as to me it looks more
>>>
>>> Note that we are verifying HuC firmware right after it was loaded into
>>> hardware using exactly the same register as here (see intel_huc_auth),
>>> and if this fails we immediately change fw status to FAIL, so we never
>>> go beyond huc_is_loaded point.
>>>
>>> Assuming that once fw is verified it can't fail (to be verified later)
>>> then your codes will be only ENODEV/ENOTSUP/ENOPKG/1 (no 0 value)
>>>
>>>> distinct, and it is clear when we report the value as given by the
>>>> register and when we give an interpreted value for a driver error.
>>>
>>> Note that your initial idea was to reuse driver maintained fw status
>>> instead of waking up device just to check register again (we don't do
>>> that now as we need someone to confirm/verify that we can really skip
>>> reg read). Use 0/1 for ABI as register values does not fit here, imho.
>>>
>>> There is also a risk that we will report some known failures as <0
>>> error codes, while other, captured by hw as just 0.
>>>
>>>>
>>>>> That might be viewed as undesired ABI change.
>>>>
>>>> grep says it is fine, so I'm happy that no one else cares (as can be
>>>> seen by the lack of igt, no one has looked hard enough into how to
>>>> distinguish the API failures ;)

 From UMD perspective, when HuC is not working as expected, usually we 
look into the kernel log and i915_huc_load_status debugfs to find out 
why it's not working. It will be helpful to know the reason of the 
failure from the error code. The typically failures we encountered are 
"HuC FW not loaded (FW not built into initramfs)" and "HuC FW 
authentication failed".

And it looks clearer to me if we can define 0 as "disabled by user" to 
distinguish it from other errors like ENODEV, ENOPKG and "not 
authenticated".

Regards, --Tony

>>>
>>> it's always hard to come out from gray area into solid spec ;)
>>>
>>>> -Chris
Joonas Lahtinen May 24, 2019, 1:10 p.m. UTC | #8
Quoting Ye, Tony (2019-05-22 14:32:41)
>  From UMD perspective, when HuC is not working as expected, usually we 
> look into the kernel log and i915_huc_load_status debugfs to find out 
> why it's not working. It will be helpful to know the reason of the 
> failure from the error code. The typically failures we encountered are 
> "HuC FW not loaded (FW not built into initramfs)" and "HuC FW 
> authentication failed".
> 
> And it looks clearer to me if we can define 0 as "disabled by user" to 
> distinguish it from other errors like ENODEV, ENOPKG and "not 
> authenticated".

Suggestion by Chris for 0/1 for huc_status makes sense to me to reflect if
HuC authentication was succesful or not. Mostly because the name of the
debugfs and func is huc_status. It also nicely maps to a register.

One could have huc_enabled which would then collapse to easy 0/1, but that'd
be bit redundant.

Regards, Joonas
Michal Wajdeczko May 24, 2019, 1:29 p.m. UTC | #9
On Fri, 24 May 2019 15:10:58 +0200, Joonas Lahtinen  
<joonas.lahtinen@linux.intel.com> wrote:

> Quoting Ye, Tony (2019-05-22 14:32:41)
>>  From UMD perspective, when HuC is not working as expected, usually we
>> look into the kernel log and i915_huc_load_status debugfs to find out
>> why it's not working. It will be helpful to know the reason of the
>> failure from the error code. The typically failures we encountered are
>> "HuC FW not loaded (FW not built into initramfs)" and "HuC FW
>> authentication failed".
>>
>> And it looks clearer to me if we can define 0 as "disabled by user" to
>> distinguish it from other errors like ENODEV, ENOPKG and "not
>> authenticated".
>
> Suggestion by Chris for 0/1 for huc_status makes sense to me to reflect  
> if
> HuC authentication was succesful or not. Mostly because the name of the
> debugfs and func is huc_status. It also nicely maps to a register.

But this entry is not limited to "huc authentication status", as then
we should even not try to introduce new errors, but return 0 to match
register.

On other hand, under normal conditions, we expect HuC to be authenticated
and running or explicitly disabled by the user. Other cases are unexpected
error conditions. I would expect from the ABI that these error conditions
will be reported as errors. For me ABI is something more than reg value.

>
> One could have huc_enabled which would then collapse to easy 0/1, but  
> that'd
> be bit redundant.

that's why we wanted to provide more details in new error codes for
existing GETPARAM, without breaking current usages.

Michal

>
> Regards, Joonas
Michal Wajdeczko May 27, 2019, 11:23 a.m. UTC | #10
On Fri, 24 May 2019 15:29:22 +0200, Michal Wajdeczko  
<michal.wajdeczko@intel.com> wrote:

> On Fri, 24 May 2019 15:10:58 +0200, Joonas Lahtinen  
> <joonas.lahtinen@linux.intel.com> wrote:
>
>> Quoting Ye, Tony (2019-05-22 14:32:41)
>>>  From UMD perspective, when HuC is not working as expected, usually we
>>> look into the kernel log and i915_huc_load_status debugfs to find out
>>> why it's not working. It will be helpful to know the reason of the
>>> failure from the error code. The typically failures we encountered are
>>> "HuC FW not loaded (FW not built into initramfs)" and "HuC FW
>>> authentication failed".
>>>
>>> And it looks clearer to me if we can define 0 as "disabled by user" to
>>> distinguish it from other errors like ENODEV, ENOPKG and "not
>>> authenticated".
>>
>> Suggestion by Chris for 0/1 for huc_status makes sense to me to reflect  
>> if
>> HuC authentication was succesful or not. Mostly because the name of the
>> debugfs and func is huc_status.

one more thought on debugfs: we are dumping there raw register value,
not just single bit that holds authentication status (and auth bit is 7),
so I'm not sure why do you want to link that value with getparam value?

Michal

>> It also nicely maps to a register.
>
> But this entry is not limited to "huc authentication status", as then
> we should even not try to introduce new errors, but return 0 to match
> register.
>
> On other hand, under normal conditions, we expect HuC to be authenticated
> and running or explicitly disabled by the user. Other cases are  
> unexpected
> error conditions. I would expect from the ABI that these error conditions
> will be reported as errors. For me ABI is something more than reg value.
>
>>
>> One could have huc_enabled which would then collapse to easy 0/1, but  
>> that'd
>> be bit redundant.
>
> that's why we wanted to provide more details in new error codes for
> existing GETPARAM, without breaking current usages.
>
> Michal
>
>>
>> Regards, Joonas
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_huc.c b/drivers/gpu/drm/i915/intel_huc.c
index 1ff1fb015e58..bfdebec1cfc8 100644
--- a/drivers/gpu/drm/i915/intel_huc.c
+++ b/drivers/gpu/drm/i915/intel_huc.c
@@ -113,7 +113,7 @@  int intel_huc_auth(struct intel_huc *huc)
 	u32 status;
 	int ret;
 
-	if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
+	if (!intel_huc_is_loaded(huc))
 		return -ENOEXEC;
 
 	ret = intel_guc_auth_huc(guc,
@@ -150,21 +150,25 @@  int intel_huc_auth(struct intel_huc *huc)
  * This function reads status register to verify if HuC
  * firmware was successfully loaded.
  *
- * Returns: 1 if HuC firmware is loaded and verified,
- * 0 if HuC firmware is not loaded and -ENODEV if HuC
- * is not present on this platform.
+ * Returns: 1 if HuC firmware is loaded and verified, 0 if HuC firmware is not
+ * enabled, -ENOPKG if HuC firmware is not loaded and -ENODEV if HuC is not
+ * present on this platform.
  */
 int intel_huc_check_status(struct intel_huc *huc)
 {
 	struct drm_i915_private *dev_priv = huc_to_i915(huc);
 	intel_wakeref_t wakeref;
-	bool status = false;
+	bool verified = false;
 
 	if (!HAS_HUC(dev_priv))
 		return -ENODEV;
 
-	with_intel_runtime_pm(dev_priv, wakeref)
-		status = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
+	if (!USES_HUC(dev_priv))
+		return 0;
 
-	return status;
+	if (intel_huc_is_loaded(huc))
+		with_intel_runtime_pm(dev_priv, wakeref)
+			verified = I915_READ(HUC_STATUS2) & HUC_FW_VERIFIED;
+
+	return verified ? 1 : -ENOPKG;
 }
diff --git a/drivers/gpu/drm/i915/intel_huc.h b/drivers/gpu/drm/i915/intel_huc.h
index a0c21ae02a99..cde3d303718d 100644
--- a/drivers/gpu/drm/i915/intel_huc.h
+++ b/drivers/gpu/drm/i915/intel_huc.h
@@ -44,6 +44,11 @@  void intel_huc_fini(struct intel_huc *huc);
 int intel_huc_auth(struct intel_huc *huc);
 int intel_huc_check_status(struct intel_huc *huc);
 
+static inline bool intel_huc_is_loaded(struct intel_huc *huc)
+{
+	return intel_uc_fw_is_loaded(&huc->fw);
+}
+
 static inline void intel_huc_fini_misc(struct intel_huc *huc)
 {
 	intel_uc_fw_cleanup_fetch(&huc->fw);