Message ID | 1458844815-25925-2-git-send-email-david.s.gordon@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Mar 24, 2016 at 06:40:15PM +0000, Dave Gordon wrote: > After a suspend-resume cycle, the resumed kernel has no idea what the > booted kernel may have done to the GuC before replacing itself with the > resumed image. In particular, it may have already loaded the GuC with > firmware, which will then cause this kernel's attempt to (re)load the > firmware to fail (GuC program memory is write-once!). The symptoms > (GuC firmware reload fails after hibernation) are further described > in the Bugzilla reference below. > > So let's *always* reset the GuC just before (re)loading the firmware; > then the hardware should then be in a well-known state, and we may even > avoid some of the issues arising from unpredictable timing. > > Also added some more fields & values to the definition of the GUC_STATUS > register, which is the key diagnostic indicator if the GuC load fails. > > Signed-off-by: Dave Gordon <david.s.gordon@intel.com> > Reviewed-by: Arun Siluvery <arun.siluvery@linux.intel.com> > Cc: Alex Dai <yu.dai@intel.com> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94390 I guess both of these should be labelled Cc: stable@vger.kernel.org when merging. -Daniel > > --- > drivers/gpu/drm/i915/i915_guc_reg.h | 12 ++++++++-- > drivers/gpu/drm/i915/intel_guc_loader.c | 40 ++++++++++++++++----------------- > 2 files changed, 29 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h > index 94ceee5..80786d9 100644 > --- a/drivers/gpu/drm/i915/i915_guc_reg.h > +++ b/drivers/gpu/drm/i915/i915_guc_reg.h > @@ -27,10 +27,12 @@ > /* Definitions of GuC H/W registers, bits, etc */ > > #define GUC_STATUS _MMIO(0xc000) > -#define GS_MIA_IN_RESET (1 << 0) > +#define GS_RESET_SHIFT 0 > +#define GS_MIA_IN_RESET (0x01 << GS_RESET_SHIFT) > #define GS_BOOTROM_SHIFT 1 > #define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT) > #define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT) > +#define GS_BOOTROM_JUMP_PASSED (0x76 << GS_BOOTROM_SHIFT) > #define GS_UKERNEL_SHIFT 8 > #define GS_UKERNEL_MASK (0xFF << GS_UKERNEL_SHIFT) > #define GS_UKERNEL_LAPIC_DONE (0x30 << GS_UKERNEL_SHIFT) > @@ -38,7 +40,13 @@ > #define GS_UKERNEL_READY (0xF0 << GS_UKERNEL_SHIFT) > #define GS_MIA_SHIFT 16 > #define GS_MIA_MASK (0x07 << GS_MIA_SHIFT) > -#define GS_MIA_CORE_STATE (1 << GS_MIA_SHIFT) > +#define GS_MIA_CORE_STATE (0x01 << GS_MIA_SHIFT) > +#define GS_MIA_HALT_REQUESTED (0x02 << GS_MIA_SHIFT) > +#define GS_MIA_ISR_ENTRY (0x04 << GS_MIA_SHIFT) > +#define GS_AUTH_STATUS_SHIFT 30 > +#define GS_AUTH_STATUS_MASK (0x03 << GS_AUTH_STATUS_SHIFT) > +#define GS_AUTH_STATUS_BAD (0x01 << GS_AUTH_STATUS_SHIFT) > +#define GS_AUTH_STATUS_GOOD (0x02 << GS_AUTH_STATUS_SHIFT) > > #define SOFT_SCRATCH(n) _MMIO(0xc180 + (n) * 4) > #define SOFT_SCRATCH_COUNT 16 > diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c > index d84c560..876e5da 100644 > --- a/drivers/gpu/drm/i915/intel_guc_loader.c > +++ b/drivers/gpu/drm/i915/intel_guc_loader.c > @@ -387,7 +387,7 @@ int intel_guc_ucode_load(struct drm_device *dev) > { > struct drm_i915_private *dev_priv = dev->dev_private; > struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw; > - int err = 0; > + int retries, err = 0; > > if (!i915.enable_guc_submission) > return 0; > @@ -441,29 +441,26 @@ int intel_guc_ucode_load(struct drm_device *dev) > * steppings also so this is extended as well. > */ > /* WaEnableGuCBootHashCheckNotSet:skl,bxt */ > - err = guc_ucode_xfer(dev_priv); > - if (err) { > - int retries = 3; > - > - DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err); > - > - while (retries--) { > - err = i915_reset_guc(dev_priv); > - if (err) > - break; > - > - err = guc_ucode_xfer(dev_priv); > - if (!err) { > - DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n"); > - break; > - } > - DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries); > - } > - > + for (retries = 3; ; ) { > + /* > + * Always reset the GuC just before (re)loading, so > + * that the state and timing are fairly predictable > + */ > + err = i915_reset_guc(dev_priv); > if (err) { > - DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err); > + DRM_ERROR("GuC reset failed, err %d\n", err); > goto fail; > } > + > + err = guc_ucode_xfer(dev_priv); > + if (!err) > + break; > + > + if (--retries == 0) > + goto fail; > + > + DRM_INFO("GuC fw load failed, err %d; will reset and " > + "retry %d more time(s)\n", err, retries); > } > > guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS; > @@ -485,6 +482,7 @@ int intel_guc_ucode_load(struct drm_device *dev) > return 0; > > fail: > + DRM_ERROR("GuC firmware load failed, err %d\n", err); > if (guc_fw->guc_fw_load_status == GUC_FIRMWARE_PENDING) > guc_fw->guc_fw_load_status = GUC_FIRMWARE_FAIL; > > -- > 1.9.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On 29/03/16 12:48, Daniel Vetter wrote: > On Thu, Mar 24, 2016 at 06:40:15PM +0000, Dave Gordon wrote: >> After a suspend-resume cycle, the resumed kernel has no idea what the >> booted kernel may have done to the GuC before replacing itself with the >> resumed image. In particular, it may have already loaded the GuC with >> firmware, which will then cause this kernel's attempt to (re)load the >> firmware to fail (GuC program memory is write-once!). The symptoms >> (GuC firmware reload fails after hibernation) are further described >> in the Bugzilla reference below. >> >> So let's *always* reset the GuC just before (re)loading the firmware; >> then the hardware should then be in a well-known state, and we may even >> avoid some of the issues arising from unpredictable timing. >> >> Also added some more fields & values to the definition of the GUC_STATUS >> register, which is the key diagnostic indicator if the GuC load fails. >> >> Signed-off-by: Dave Gordon <david.s.gordon@intel.com> >> Reviewed-by: Arun Siluvery <arun.siluvery@linux.intel.com> >> Cc: Alex Dai <yu.dai@intel.com> >> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94390 > > I guess both of these should be labelled Cc: stable@vger.kernel.org when > merging. > -Daniel Probably not necessary, as the patch to enable GuC loading (and fallback) by default hasn't yet been merged, so this code isn't reached unless the (unsafe) kernel parameter is overridden. .Dave.
diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h index 94ceee5..80786d9 100644 --- a/drivers/gpu/drm/i915/i915_guc_reg.h +++ b/drivers/gpu/drm/i915/i915_guc_reg.h @@ -27,10 +27,12 @@ /* Definitions of GuC H/W registers, bits, etc */ #define GUC_STATUS _MMIO(0xc000) -#define GS_MIA_IN_RESET (1 << 0) +#define GS_RESET_SHIFT 0 +#define GS_MIA_IN_RESET (0x01 << GS_RESET_SHIFT) #define GS_BOOTROM_SHIFT 1 #define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT) #define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT) +#define GS_BOOTROM_JUMP_PASSED (0x76 << GS_BOOTROM_SHIFT) #define GS_UKERNEL_SHIFT 8 #define GS_UKERNEL_MASK (0xFF << GS_UKERNEL_SHIFT) #define GS_UKERNEL_LAPIC_DONE (0x30 << GS_UKERNEL_SHIFT) @@ -38,7 +40,13 @@ #define GS_UKERNEL_READY (0xF0 << GS_UKERNEL_SHIFT) #define GS_MIA_SHIFT 16 #define GS_MIA_MASK (0x07 << GS_MIA_SHIFT) -#define GS_MIA_CORE_STATE (1 << GS_MIA_SHIFT) +#define GS_MIA_CORE_STATE (0x01 << GS_MIA_SHIFT) +#define GS_MIA_HALT_REQUESTED (0x02 << GS_MIA_SHIFT) +#define GS_MIA_ISR_ENTRY (0x04 << GS_MIA_SHIFT) +#define GS_AUTH_STATUS_SHIFT 30 +#define GS_AUTH_STATUS_MASK (0x03 << GS_AUTH_STATUS_SHIFT) +#define GS_AUTH_STATUS_BAD (0x01 << GS_AUTH_STATUS_SHIFT) +#define GS_AUTH_STATUS_GOOD (0x02 << GS_AUTH_STATUS_SHIFT) #define SOFT_SCRATCH(n) _MMIO(0xc180 + (n) * 4) #define SOFT_SCRATCH_COUNT 16 diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index d84c560..876e5da 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c @@ -387,7 +387,7 @@ int intel_guc_ucode_load(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw; - int err = 0; + int retries, err = 0; if (!i915.enable_guc_submission) return 0; @@ -441,29 +441,26 @@ int intel_guc_ucode_load(struct drm_device *dev) * steppings also so this is extended as well. */ /* WaEnableGuCBootHashCheckNotSet:skl,bxt */ - err = guc_ucode_xfer(dev_priv); - if (err) { - int retries = 3; - - DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err); - - while (retries--) { - err = i915_reset_guc(dev_priv); - if (err) - break; - - err = guc_ucode_xfer(dev_priv); - if (!err) { - DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n"); - break; - } - DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries); - } - + for (retries = 3; ; ) { + /* + * Always reset the GuC just before (re)loading, so + * that the state and timing are fairly predictable + */ + err = i915_reset_guc(dev_priv); if (err) { - DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err); + DRM_ERROR("GuC reset failed, err %d\n", err); goto fail; } + + err = guc_ucode_xfer(dev_priv); + if (!err) + break; + + if (--retries == 0) + goto fail; + + DRM_INFO("GuC fw load failed, err %d; will reset and " + "retry %d more time(s)\n", err, retries); } guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS; @@ -485,6 +482,7 @@ int intel_guc_ucode_load(struct drm_device *dev) return 0; fail: + DRM_ERROR("GuC firmware load failed, err %d\n", err); if (guc_fw->guc_fw_load_status == GUC_FIRMWARE_PENDING) guc_fw->guc_fw_load_status = GUC_FIRMWARE_FAIL;