Message ID | 20220616220158.15778-1-zhanjun.dong@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915/guc: Check for ct enabled while waiting for response | expand |
On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote: > > We are seeing error message of "No response for request". Some cases > happened while waiting for response and reset/suspend action was triggered. > In this case, no response is not an error, active requests will be > cancelled. > > This patch will handle this condition and change the error message into > debug message. The convention we follow in drm is to record the version of the patch and what changed in that version. Generally I am ok with this version of the patch but still have a couple of questions. > -static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status) > { > int err; > + bool ct_enabled; > > /* > * Fast commands should complete in less than 10us, so sample quickly > @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10 > #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000 > #define done \ > - (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > + (!(ct_enabled = intel_guc_ct_enabled(ct)) || \ > + FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > GUC_HXG_ORIGIN_GUC) > err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS); > if (err) > err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS); > #undef done > + if (!ct_enabled) > + err = -ECANCELED; So we have the choice of either setting the request status here as I was suggesting earlier, e.g. as follows: #define GUC_HXG_TYPE_REQUEST_CANCELED 4u // unused value if (!ct_enabled) req->status = GUC_HXG_TYPE_REQUEST_CANCELED; We would return 0 in this case and would check for the req->status value above where needed. Or we can return -ECANCELED. I don't know if -ECANCELED is the right value to return but whatever we return will have to be unique (ununsed elsewhere) since we are relying on the return value. -ECANCELED is unique so that part is ok. Do other reviewers have a preference whether we should set req->status or return a unique return value? > *status = req->status; > return err; > @@ -703,11 +708,15 @@ static int ct_send(struct intel_guc_ct *ct, > > intel_guc_notify(ct_to_guc(ct)); > > - err = wait_for_ct_request_update(&request, status); > + err = wait_for_ct_request_update(ct, &request, status); > g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); > if (unlikely(err)) { > - CT_ERROR(ct, "No response for request %#x (fence %u)\n", > - action[0], request.fence); > + if (err == -ECANCELED) > + CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", > + action[0], request.fence); > + else > + CT_ERROR(ct, "No response for request %#x (fence %u)\n", > + action[0], request.fence); > goto unlink; > } > > @@ -771,8 +780,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len, > > ret = ct_send(ct, action, len, response_buf, response_buf_size, &status); > if (unlikely(ret < 0)) { > - CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n", > - action[0], ERR_PTR(ret), status); > + if (ret != -ECANCELED) > + CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n", > + action[0], ERR_PTR(ret), status); I am wondering why we even have this print and should we just delete it or convert it to CT_DEBUG(). The reason is that only error prints closest to where the actual error occurs are useful since they pin-point the error clearly. This to be seems to be a "second" print from a higher level function which does not seem particularly useful. > } else if (unlikely(ret)) { > CT_DEBUG(ct, "send action %#x returned %d (%#x)\n", > action[0], ret, ret); > -- > 2.36.0 >
On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote: > > We are seeing error message of "No response for request". Some cases > happened while waiting for response and reset/suspend action was triggered. > In this case, no response is not an error, active requests will be > cancelled. > > This patch will handle this condition and change the error message into > debug message. > > Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com> > --- > drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++------- > 1 file changed, 17 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > index f01325cd1b62..f07a7666b1ad 100644 > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct, > > /** > * wait_for_ct_request_update - Wait for CT request state update. > + * @ct: pointer to CT > * @req: pointer to pending request > * @status: placeholder for status > * > @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct, > * * 0 response received (status is valid) > * * -ETIMEDOUT no response within hardcoded timeout > */ > -static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status) > { > int err; > + bool ct_enabled; > > /* > * Fast commands should complete in less than 10us, so sample quickly > @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10 > #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000 > #define done \ > - (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > + (!(ct_enabled = intel_guc_ct_enabled(ct)) || \ > + FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > GUC_HXG_ORIGIN_GUC) > err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS); > if (err) > err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS); > #undef done > + if (!ct_enabled) > + err = -ECANCELED; Actually here's an even simpler suggestion. We could just do: if (!ct_enabled) CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...); And return 0 as before. This way we won't have to make any changes in either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled() just serves to get us out of the wait early and prevent the -ETIMEDOUT return (and 0 return avoids all the error messages we are trying to eliminate).
On Thu, 16 Jun 2022 21:50:55 -0700, Dixit, Ashutosh wrote: > > On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote: > > > > We are seeing error message of "No response for request". Some cases > > happened while waiting for response and reset/suspend action was triggered. > > In this case, no response is not an error, active requests will be > > cancelled. > > > > This patch will handle this condition and change the error message into > > debug message. > > > > Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com> > > --- > > drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++------- > > 1 file changed, 17 insertions(+), 7 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > index f01325cd1b62..f07a7666b1ad 100644 > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct, > > > > /** > > * wait_for_ct_request_update - Wait for CT request state update. > > + * @ct: pointer to CT > > * @req: pointer to pending request > > * @status: placeholder for status > > * > > @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct, > > * * 0 response received (status is valid) > > * * -ETIMEDOUT no response within hardcoded timeout > > */ > > -static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > > +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status) > > { > > int err; > > + bool ct_enabled; > > > > /* > > * Fast commands should complete in less than 10us, so sample quickly > > @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status) > > #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10 > > #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000 > > #define done \ > > - (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > > + (!(ct_enabled = intel_guc_ct_enabled(ct)) || \ > > + FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ > > GUC_HXG_ORIGIN_GUC) > > err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS); > > if (err) > > err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS); > > #undef done > > + if (!ct_enabled) > > + err = -ECANCELED; > > Actually here's an even simpler suggestion. We could just do: > > if (!ct_enabled) > CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...); > > And return 0 as before. This way we won't have to make any changes in > either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled() just > serves to get us out of the wait early and prevent the -ETIMEDOUT return > (and 0 return avoids all the error messages we are trying to eliminate). Actually will need to unlink the request too, so it will be something like: if (!ct_enabled) { CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", ...); spin_lock_irqsave(&ct->requests.lock, flags); list_del(&request.link); spin_unlock_irqrestore(&ct->requests.lock, flags); }
> -----Original Message----- > From: Dixit, Ashutosh <ashutosh.dixit@intel.com> > Sent: July 12, 2022 3:48 PM > To: Dong, Zhanjun <zhanjun.dong@intel.com> > Cc: intel-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > Subject: Re: [Intel-gfx] [PATCH] drm/i915/guc: Check for ct enabled while > waiting for response > > On Thu, 16 Jun 2022 21:50:55 -0700, Dixit, Ashutosh wrote: > > > > On Thu, 16 Jun 2022 15:01:59 -0700, Zhanjun Dong wrote: > > > > > > We are seeing error message of "No response for request". Some cases > > > happened while waiting for response and reset/suspend action was > triggered. > > > In this case, no response is not an error, active requests will be > > > cancelled. > > > > > > This patch will handle this condition and change the error message > > > into debug message. > > > > > > Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com> > > > --- > > > drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 > > > ++++++++++++++++------- > > > 1 file changed, 17 insertions(+), 7 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > > index f01325cd1b62..f07a7666b1ad 100644 > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c > > > @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct, > > > > > > /** > > > * wait_for_ct_request_update - Wait for CT request state update. > > > + * @ct: pointer to CT > > > * @req: pointer to pending request > > > * @status: placeholder for status > > > * > > > @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct, > > > * * 0 response received (status is valid) > > > * * -ETIMEDOUT no response within hardcoded timeout > > > */ > > > -static int wait_for_ct_request_update(struct ct_request *req, u32 > > > *status) > > > +static int wait_for_ct_request_update(struct intel_guc_ct *ct, > > > +struct ct_request *req, u32 *status) > > > { > > > int err; > > > + bool ct_enabled; > > > > > > /* > > > * Fast commands should complete in less than 10us, so sample > > >quickly @@ -481,12 +483,15 @@ static int > > >wait_for_ct_request_update(struct ct_request *req, u32 *status) > > > #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10 > > > #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000 > > > #define done \ > > > - (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == > \ > > > + (!(ct_enabled = intel_guc_ct_enabled(ct)) || \ > > > + FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == > \ > > > GUC_HXG_ORIGIN_GUC) > > > err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS); > > > if (err) > > > err = wait_for(done, > GUC_CTB_RESPONSE_TIMEOUT_LONG_MS); > > > #undef done > > > + if (!ct_enabled) > > > + err = -ECANCELED; > > > > Actually here's an even simpler suggestion. We could just do: > > > > if (!ct_enabled) > > CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is > disabled\n", > >...); > > > > And return 0 as before. This way we won't have to make any changes in > > either ct_send() or intel_guc_ct_send(). So intel_guc_ct_enabled() > > just serves to get us out of the wait early and prevent the -ETIMEDOUT > > return (and 0 return avoids all the error messages we are trying to > eliminate). > > Actually will need to unlink the request too, so it will be something like: > > if (!ct_enabled) { > CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is > disabled\n", ...); > > spin_lock_irqsave(&ct->requests.lock, flags); > list_del(&request.link); > spin_unlock_irqrestore(&ct->requests.lock, flags); > } I agree, the caller function need the err is non-zero to know the request is not success, and unlink the request. The caller function ct_send will do the unlink. For the err code ECANCELED, while in intel_guc_ct_send, it returns ENODEV if ct is disabled. This patch will be changed to ENODEV to match it.
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c index f01325cd1b62..f07a7666b1ad 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c @@ -455,6 +455,7 @@ static int ct_write(struct intel_guc_ct *ct, /** * wait_for_ct_request_update - Wait for CT request state update. + * @ct: pointer to CT * @req: pointer to pending request * @status: placeholder for status * @@ -467,9 +468,10 @@ static int ct_write(struct intel_guc_ct *ct, * * 0 response received (status is valid) * * -ETIMEDOUT no response within hardcoded timeout */ -static int wait_for_ct_request_update(struct ct_request *req, u32 *status) +static int wait_for_ct_request_update(struct intel_guc_ct *ct, struct ct_request *req, u32 *status) { int err; + bool ct_enabled; /* * Fast commands should complete in less than 10us, so sample quickly @@ -481,12 +483,15 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status) #define GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS 10 #define GUC_CTB_RESPONSE_TIMEOUT_LONG_MS 1000 #define done \ - (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ + (!(ct_enabled = intel_guc_ct_enabled(ct)) || \ + FIELD_GET(GUC_HXG_MSG_0_ORIGIN, READ_ONCE(req->status)) == \ GUC_HXG_ORIGIN_GUC) err = wait_for_us(done, GUC_CTB_RESPONSE_TIMEOUT_SHORT_MS); if (err) err = wait_for(done, GUC_CTB_RESPONSE_TIMEOUT_LONG_MS); #undef done + if (!ct_enabled) + err = -ECANCELED; *status = req->status; return err; @@ -703,11 +708,15 @@ static int ct_send(struct intel_guc_ct *ct, intel_guc_notify(ct_to_guc(ct)); - err = wait_for_ct_request_update(&request, status); + err = wait_for_ct_request_update(ct, &request, status); g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); if (unlikely(err)) { - CT_ERROR(ct, "No response for request %#x (fence %u)\n", - action[0], request.fence); + if (err == -ECANCELED) + CT_DEBUG(ct, "Request %#x (fence %u) cancelled as CTB is disabled\n", + action[0], request.fence); + else + CT_ERROR(ct, "No response for request %#x (fence %u)\n", + action[0], request.fence); goto unlink; } @@ -771,8 +780,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len, ret = ct_send(ct, action, len, response_buf, response_buf_size, &status); if (unlikely(ret < 0)) { - CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n", - action[0], ERR_PTR(ret), status); + if (ret != -ECANCELED) + CT_ERROR(ct, "Sending action %#x failed (%pe) status=%#X\n", + action[0], ERR_PTR(ret), status); } else if (unlikely(ret)) { CT_DEBUG(ct, "send action %#x returned %d (%#x)\n", action[0], ret, ret);
We are seeing error message of "No response for request". Some cases happened while waiting for response and reset/suspend action was triggered. In this case, no response is not an error, active requests will be cancelled. This patch will handle this condition and change the error message into debug message. Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com> --- drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 24 ++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-)