
[V5,11/13] mmc: block: Add CQE support

Message ID 1502366898-23691-12-git-send-email-adrian.hunter@intel.com (mailing list archive)
State New, archived

Commit Message

Adrian Hunter Aug. 10, 2017, 12:08 p.m. UTC
Add CQE support to the block driver, including:
	- optionally using DCMD for flush requests
	- manually issuing discard requests
	- issuing read / write requests to the CQE
	- supporting block-layer timeouts
	- handling recovery
	- supporting re-tuning

CQE offers 25% - 50% better random multi-threaded I/O.  There is a slight
(e.g. 2%) drop in sequential read speed but no observable change to sequential
write.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 drivers/mmc/core/block.c | 195 ++++++++++++++++++++++++++++++++-
 drivers/mmc/core/block.h |   7 ++
 drivers/mmc/core/queue.c | 273 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/mmc/core/queue.h |  42 +++++++-
 4 files changed, 510 insertions(+), 7 deletions(-)

Comments

Linus Walleij Aug. 20, 2017, 12:13 p.m. UTC | #1
On Thu, Aug 10, 2017 at 2:08 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:

> Add CQE support to the block driver, including:
>         - optionally using DCMD for flush requests
>         - manually issuing discard requests

"Manually" has no place in computer code IMO since there is no
man there. But I may be especially grumpy. But I don't understand the
comment anyway.

>         - issuing read / write requests to the CQE
>         - supporting block-layer timeouts
>         - handling recovery
>         - supporting re-tuning
>
> CQE offers 25% - 50% better random multi-threaded I/O.  There is a slight
> (e.g. 2%) drop in sequential read speed but no observable change to sequential
> write.
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

This commit message seriously needs to detail the design of the
request handling thread/engine.

So adding a new (in effect) invasive block driver needs to at least
be CC:ed to the block maintainers so we don't sneak anything like
that in under the radar.

This duplicates the "thread that sucks out requests" from the
MMC core, and doubles the problem of replacing it with
something like blk-mq.

Especially these two snippets I would personally NOT merge
without the explicit consent of a block layer maintainer:

> +static void mmc_cqe_request_fn(struct request_queue *q)
> +{
> +       struct mmc_queue *mq = q->queuedata;
> +       struct request *req;
> +
> +       if (!mq) {
> +               while ((req = blk_fetch_request(q)) != NULL) {
> +                       req->rq_flags |= RQF_QUIET;
> +                       __blk_end_request_all(req, BLK_STS_IOERR);
> +               }
> +               return;
> +       }
> +
> +       if (mq->asleep && !mq->cqe_busy)
> +               wake_up_process(mq->thread);
> +}
(...)
> +static int mmc_cqe_thread(void *d)
> +{
> +       struct mmc_queue *mq = d;
> +       struct request_queue *q = mq->queue;
> +       struct mmc_card *card = mq->card;
> +       struct mmc_host *host = card->host;
> +       unsigned long flags;
> +       int get_put = 0;
> +
> +       current->flags |= PF_MEMALLOC;
> +
> +       down(&mq->thread_sem);
> +       spin_lock_irqsave(q->queue_lock, flags);
> +       while (1) {
> +               struct request *req = NULL;
> +               enum mmc_issue_type issue_type;
> +               bool retune_ok = false;
> +
> +               if (mq->cqe_recovery_needed) {
> +                       spin_unlock_irqrestore(q->queue_lock, flags);
> +                       mmc_blk_cqe_recovery(mq);
> +                       spin_lock_irqsave(q->queue_lock, flags);
> +                       mq->cqe_recovery_needed = false;
> +               }
> +
> +               set_current_state(TASK_INTERRUPTIBLE);
> +
> +               if (!kthread_should_stop())
> +                       req = blk_peek_request(q);

Why are you using blk_peek_request() instead of blk_fetch_request()
when the other thread just goes with fetch?

Am I right in assuming that this request queue also relies on the
implicit semantic that blk_peek_request(q) shall return NULL
if there are no requests in the queue, and that it is pulling out a
few NULLs to flush the request-handling machinery? Or did it
fix this? (Put that in the commit message in that case.)

> +
> +               if (req) {
> +                       issue_type = mmc_cqe_issue_type(host, req);
> +                       switch (issue_type) {
> +                       case MMC_ISSUE_DCMD:
> +                               if (mmc_cqe_dcmd_busy(mq)) {
> +                                       mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
> +                                       req = NULL;
> +                                       break;
> +                               }

I guess peek is used because you want to have the option to give it up here.

> +                               /* Fall through */
> +                       case MMC_ISSUE_ASYNC:
> +                               if (blk_queue_start_tag(q, req)) {
> +                                       mq->cqe_busy |= MMC_CQE_QUEUE_FULL;
> +                                       req = NULL;
> +                               }

Then you start it with blk_queue_start_tag(); now does "tag" in this
function call mean the same as the "tag" you just added to the
MMC request struct?

> +                               break;
> +                       default:
> +                               /*
> +                                * Timeouts are handled by mmc core, so set a
> +                                * large value to avoid races.
> +                                */
> +                               req->timeout = 600 * HZ;
> +                               blk_start_request(req);

And here it just starts.

> +                               break;
> +                       }
> +                       if (req) {
> +                               mq->cqe_in_flight[issue_type] += 1;
> +                               if (mmc_cqe_tot_in_flight(mq) == 1)
> +                                       get_put += 1;
> +                               if (mmc_cqe_qcnt(mq) == 1)
> +                                       retune_ok = true;
> +                       }
> +               }
> +
> +               mq->asleep = !req;
> +
> +               spin_unlock_irqrestore(q->queue_lock, flags);
> +
> +               if (req) {
> +                       enum mmc_issued issued;
> +
> +                       set_current_state(TASK_RUNNING);
> +
> +                       if (get_put) {
> +                               get_put = 0;
> +                               mmc_get_card(card);
> +                       }
> +
> +                       if (host->need_retune && retune_ok &&
> +                           !host->hold_retune)
> +                               host->retune_now = true;
> +                       else
> +                               host->retune_now = false;
> +
> +                       issued = mmc_blk_cqe_issue_rq(mq, req);
> +
> +                       cond_resched();
> +
> +                       spin_lock_irqsave(q->queue_lock, flags);
> +
> +                       switch (issued) {
> +                       case MMC_REQ_STARTED:
> +                               break;
> +                       case MMC_REQ_BUSY:
> +                               blk_requeue_request(q, req);

Here it is requeued.

> +                               goto finished;
> +                       case MMC_REQ_FAILED_TO_START:
> +                               __blk_end_request_all(req, BLK_STS_IOERR);

Or ended.

> +                               /* Fall through */
> +                       case MMC_REQ_FINISHED:
> +finished:
> +                               mq->cqe_in_flight[issue_type] -= 1;
> +                               if (mmc_cqe_tot_in_flight(mq) == 0)
> +                                       get_put = -1;
> +                       }
> +               } else {
> +                       if (get_put < 0) {
> +                               get_put = 0;
> +                               mmc_put_card(card);
> +                       }
> +                       /*
> +                        * Do not stop with requests in flight in case recovery
> +                        * is needed.
> +                        */
> +                       if (kthread_should_stop() &&
> +                           !mmc_cqe_tot_in_flight(mq)) {
> +                               set_current_state(TASK_RUNNING);
> +                               break;
> +                       }
> +                       up(&mq->thread_sem);
> +                       schedule();
> +                       down(&mq->thread_sem);
> +                       spin_lock_irqsave(q->queue_lock, flags);

And this semaphoring and threading is just as confusing as ever and now
we have two of them. (Sorry, I'm grumpy.)

I think I said in the beginning maybe not in response to this series but
another that I don't like the idea of making a second copy of the
whole request thread. I would much rather see that ONE request thread
is used for both regular requests and CQE.

At least I wanna see a serious rebuttal of why my claim that we should
keep this code down is such a pipe dream that we just have to go ahead
and make a second instance of all the core request mechanics.
And that this should be part of the commit message: "I'm duplicating
the queue thread because (...)" etc.

I'm sorry for being so conservative, but I am plain scared, I had serious
trouble refactoring this code already, and I got the feeling that several
people think the MMC stack has grown unapproachable for average
developers (my Googledoc where I tried to pry it apart got very popular
with developers I respect), and this is making that situation worse. Soon
only you and Ulf will understand this piece of code.

I do not know if I'm especially stupid when I feel the MMC stack code
is already hard to grasp; if that is the case I need to be told.

What we need is an MMC stack where it is clear where blocks come in
and out and how they are processed by the block layer, but now we
already have a scary Rube Goldberg-machine and it is not getting better.
If people have different feelings they can tell me off right now.

I have my hopes up that we can get the code lesser and more readable
with MQ, as I tried to illustrate in my attempts, which are indeed lame
because they don't work because of misc and SDIO use cases, but
I'm honestly doing my best. Currently with other clean-ups to get a
clean surface to do that.

As it stands, the MQ migration work size is in some spots doubled or
more than doubled after this commit :(

Yours,
Linus Walleij
Adrian Hunter Aug. 21, 2017, 9:27 a.m. UTC | #2
On 20/08/17 15:13, Linus Walleij wrote:
> On Thu, Aug 10, 2017 at 2:08 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> 
>> Add CQE support to the block driver, including:
>>         - optionally using DCMD for flush requests
>>         - manually issuing discard requests
> 
> "Manually" has no place in computer code IMO since there is no
> man there. But I may be especially grumpy. But I don't understand the
> comment anyway.

CQE automatically sends the commands to complete requests.  However it only
supports reads / writes and so-called "direct commands" (DCMD).  Furthermore
DCMD is limited to one command at a time, but discards require 3 commands.
That makes issuing discards through CQE very awkward, but some CQEs don't
support DCMD anyway.  So for discards, the existing non-CQE approach is
taken, where the mmc core code issues the 3 commands one at a time i.e.
mmc_erase().
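
To illustrate, here is a simplified sketch of that non-CQE discard sequence,
roughly what mmc_erase()/mmc_do_erase() boils down to.  The helper name is
hypothetical and the response flags are trimmed down; argument selection,
busy timeouts and full error handling are omitted:

#include <linux/mmc/card.h>
#include <linux/mmc/core.h>
#include <linux/mmc/host.h>
#include <linux/mmc/mmc.h>
#include <linux/string.h>

/* Simplified sketch only: the three commands behind a discard, issued one
 * at a time by the mmc core rather than through the CQE.
 */
static int example_mmc_discard(struct mmc_card *card, unsigned int from,
			       unsigned int to, unsigned int arg)
{
	struct mmc_command cmd = {};
	int err;

	cmd.opcode = MMC_ERASE_GROUP_START;	/* CMD35: first erase group */
	cmd.arg = from;
	cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
	err = mmc_wait_for_cmd(card->host, &cmd, 0);
	if (err)
		return err;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = MMC_ERASE_GROUP_END;	/* CMD36: last erase group */
	cmd.arg = to;
	cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
	err = mmc_wait_for_cmd(card->host, &cmd, 0);
	if (err)
		return err;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = MMC_ERASE;			/* CMD38: do the erase/trim */
	cmd.arg = arg;				/* e.g. MMC_TRIM_ARG */
	cmd.flags = MMC_RSP_R1B | MMC_CMD_AC;	/* R1b: card signals busy */
	return mmc_wait_for_cmd(card->host, &cmd, 0);
}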

> 
>>         - issuing read / write requests to the CQE
>>         - supporting block-layer timeouts
>>         - handling recovery
>>         - supporting re-tuning
>>
>> CQE offers 25% - 50% better random multi-threaded I/O.  There is a slight
>> (e.g. 2%) drop in sequential read speed but no observable change to sequential
>> write.
>>
>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
> This commit message seriously needs to detail the design of the
> request handling thread/engine.
> 
> So adding a new (in effect) invasive block driver needs to at least
> be CC:ed to the block maintainers so we don't sneak anything like
> that in under the radar.
> 
> This duplicates the "thread that sucks out requests" from the
> MMC core, and doubles the problem of replacing it with
> something like blk-mq.

We need to start with a legacy API because people want to backport CQ to
earlier kernels (we really need to get features upstream more quickly), but
blk-mq has been evolving a lot (e.g. elevator support), so backporters face
having either something quite different from upstream or trying to backport
great chunks of the block layer.

We also don't know how blk-mq will perform so it would be prudent to start
with support for both the legacy API and blk-mq (as scsi does) so that we
can find out first.

A loop to remove requests from the queue is normal, e.g. scsi_request_fn(),
and taking a consistent approach with the existing mmc thread does not
double the problem of a blk-mq implementation, since the same approach can
be used for both.
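
For reference, the general shape of such a legacy request_fn loop looks
roughly like the sketch below.  This is not scsi_request_fn() itself; all
example_* names are hypothetical:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

struct example_dev {
	bool busy;				/* hypothetical device state */
};

static bool example_dev_can_queue(struct example_dev *dev)
{
	return !dev->busy;
}

static void example_dev_submit(struct example_dev *dev, struct request *req)
{
	/* Hand the request to the hardware, or defer to a driver thread if
	 * submission may sleep - the latter is roughly what the mmc thread
	 * is for.
	 */
}

/* Legacy single-queue request_fn pattern: called with q->queue_lock held,
 * it drains requests in a loop and drops the lock around submission.
 */
static void example_request_fn(struct request_queue *q)
{
	struct example_dev *dev = q->queuedata;
	struct request *req;

	while ((req = blk_fetch_request(q)) != NULL) {
		if (!example_dev_can_queue(dev)) {
			/* Device busy: put the started request back. */
			blk_requeue_request(q, req);
			break;
		}
		spin_unlock_irq(q->queue_lock);
		example_dev_submit(dev, req);
		spin_lock_irq(q->queue_lock);
	}
}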

> 
> Especially these two snippets I would personally NOT merge
> without the explicit consent of a block layer maintainer:
> 
>> +static void mmc_cqe_request_fn(struct request_queue *q)
>> +{
>> +       struct mmc_queue *mq = q->queuedata;
>> +       struct request *req;
>> +
>> +       if (!mq) {
>> +               while ((req = blk_fetch_request(q)) != NULL) {
>> +                       req->rq_flags |= RQF_QUIET;
>> +                       __blk_end_request_all(req, BLK_STS_IOERR);
>> +               }
>> +               return;
>> +       }
>> +
>> +       if (mq->asleep && !mq->cqe_busy)
>> +               wake_up_process(mq->thread);
>> +}
> (...)
>> +static int mmc_cqe_thread(void *d)
>> +{
>> +       struct mmc_queue *mq = d;
>> +       struct request_queue *q = mq->queue;
>> +       struct mmc_card *card = mq->card;
>> +       struct mmc_host *host = card->host;
>> +       unsigned long flags;
>> +       int get_put = 0;
>> +
>> +       current->flags |= PF_MEMALLOC;
>> +
>> +       down(&mq->thread_sem);
>> +       spin_lock_irqsave(q->queue_lock, flags);
>> +       while (1) {
>> +               struct request *req = NULL;
>> +               enum mmc_issue_type issue_type;
>> +               bool retune_ok = false;
>> +
>> +               if (mq->cqe_recovery_needed) {
>> +                       spin_unlock_irqrestore(q->queue_lock, flags);
>> +                       mmc_blk_cqe_recovery(mq);
>> +                       spin_lock_irqsave(q->queue_lock, flags);
>> +                       mq->cqe_recovery_needed = false;
>> +               }
>> +
>> +               set_current_state(TASK_INTERRUPTIBLE);
>> +
>> +               if (!kthread_should_stop())
>> +                       req = blk_peek_request(q);
> 
> Why are you using blk_peek_request() instead of blk_fetch_request()
> when the other thread just goes with fetch?

Because we are not always able to start the request.
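
In the legacy block layer, blk_fetch_request() is essentially
blk_peek_request() followed by blk_start_request() (roughly the
block/blk-core.c implementation of this era, comment added), which is why
peeking leaves room to back out before committing to the request:

struct request *blk_fetch_request(struct request_queue *q)
{
	struct request *rq;

	rq = blk_peek_request(q);
	if (rq)
		blk_start_request(rq);	/* dequeue and start the request */
	return rq;
}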

> Am I right in assuming that this request queue also relies on the
> implicit semantic that blk_peek_request(q) shall return NULL
> if there are no requests in the queue, and that it is pulling out a
> few NULLs to flush the request-handling machinery? Or did it
> fix this? (Put that in the commit message in that case.)

CQ is different.  Because read / write requests are processed asynchronously
we can keep preparing and issuing more requests until the hardware queue is
full.  That is, we don't have to wait for the first request to complete
before preparing and issuing the second request, and so on.

However, the existing thread's need to issue a "null" is not because it is a
thread.  It is because of the convoluted design of mmc_start_areq().
However, you still need a way to issue the prepared request when the current
request completes.  Either wake up the thread to do it, or in the past I
have suggested that should be done in the completion path, but that needs
changes to the host controller API.

> 
>> +
>> +               if (req) {
>> +                       issue_type = mmc_cqe_issue_type(host, req);
>> +                       switch (issue_type) {
>> +                       case MMC_ISSUE_DCMD:
>> +                               if (mmc_cqe_dcmd_busy(mq)) {
>> +                                       mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
>> +                                       req = NULL;
>> +                                       break;
>> +                               }
> 
> I guess peek is used because you want to have the option to give it up here.

Yes

> 
>> +                               /* Fall through */
>> +                       case MMC_ISSUE_ASYNC:
>> +                               if (blk_queue_start_tag(q, req)) {
>> +                                       mq->cqe_busy |= MMC_CQE_QUEUE_FULL;
>> +                                       req = NULL;
>> +                               }
> 
> Then you start it with blk_queue_start_tag(); now does "tag" in this
> function call mean the same as the "tag" you just added to the
> MMC request struct?

Yes
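
For reference, the linkage is visible in the patch below:
blk_queue_init_tags() sizes the tag space from the CQE queue depth, and the
block-layer tag is copied straight into the mmc_request, e.g. for DCMD
(excerpt from the diff with comments added):

/* The tag assigned by blk_queue_start_tag() becomes the mmc_request tag,
 * which the CQE uses as its task ID.
 */
static struct mmc_request *mmc_blk_cqe_prep_dcmd(struct mmc_queue_req *mqrq,
						 struct request *req)
{
	struct mmc_blk_request *brq = &mqrq->brq;

	memset(brq, 0, sizeof(*brq));

	brq->mrq.cmd = &brq->cmd;
	brq->mrq.tag = req->tag;	/* same tag the block layer allocated */

	return &brq->mrq;
}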

> 
>> +                               break;
>> +                       default:
>> +                               /*
>> +                                * Timeouts are handled by mmc core, so set a
>> +                                * large value to avoid races.
>> +                                */
>> +                               req->timeout = 600 * HZ;
>> +                               blk_start_request(req);
> 
> And here it just starts.

This is the path for requests that the CQE cannot process.  They don't need
to be tagged because there is no command queue - no way to do more than one
request at a time.

> 
>> +                               break;
>> +                       }
>> +                       if (req) {
>> +                               mq->cqe_in_flight[issue_type] += 1;
>> +                               if (mmc_cqe_tot_in_flight(mq) == 1)
>> +                                       get_put += 1;
>> +                               if (mmc_cqe_qcnt(mq) == 1)
>> +                                       retune_ok = true;
>> +                       }
>> +               }
>> +
>> +               mq->asleep = !req;
>> +
>> +               spin_unlock_irqrestore(q->queue_lock, flags);
>> +
>> +               if (req) {
>> +                       enum mmc_issued issued;
>> +
>> +                       set_current_state(TASK_RUNNING);
>> +
>> +                       if (get_put) {
>> +                               get_put = 0;
>> +                               mmc_get_card(card);
>> +                       }
>> +
>> +                       if (host->need_retune && retune_ok &&
>> +                           !host->hold_retune)
>> +                               host->retune_now = true;
>> +                       else
>> +                               host->retune_now = false;
>> +
>> +                       issued = mmc_blk_cqe_issue_rq(mq, req);
>> +
>> +                       cond_resched();
>> +
>> +                       spin_lock_irqsave(q->queue_lock, flags);
>> +
>> +                       switch (issued) {
>> +                       case MMC_REQ_STARTED:
>> +                               break;
>> +                       case MMC_REQ_BUSY:
>> +                               blk_requeue_request(q, req);
> 
> Here it is requeued.

Any request that is started has to be either re-queued or completed.

> 
>> +                               goto finished;
>> +                       case MMC_REQ_FAILED_TO_START:
>> +                               __blk_end_request_all(req, BLK_STS_IOERR);
> 
> Or ended.

Any request that is started has to be either re-queued or completed.

> 
>> +                               /* Fall through */
>> +                       case MMC_REQ_FINISHED:
>> +finished:
>> +                               mq->cqe_in_flight[issue_type] -= 1;
>> +                               if (mmc_cqe_tot_in_flight(mq) == 0)
>> +                                       get_put = -1;
>> +                       }
>> +               } else {
>> +                       if (get_put < 0) {
>> +                               get_put = 0;
>> +                               mmc_put_card(card);
>> +                       }
>> +                       /*
>> +                        * Do not stop with requests in flight in case recovery
>> +                        * is needed.
>> +                        */
>> +                       if (kthread_should_stop() &&
>> +                           !mmc_cqe_tot_in_flight(mq)) {
>> +                               set_current_state(TASK_RUNNING);
>> +                               break;
>> +                       }
>> +                       up(&mq->thread_sem);
>> +                       schedule();
>> +                       down(&mq->thread_sem);
>> +                       spin_lock_irqsave(q->queue_lock, flags);
> 
> And this semaphoring and threading is just as confusing as ever and now
> we have two of them. (Sorry, I'm grumpy.)

Sure we have two, but they follow the same approach, i.e. if you can get rid
of one, then you can use the same method to get rid of the other.

> 
> I think I said in the beginning maybe not in response to this series but
> another that I don't like the idea of making a second copy of the
> whole request thread. I would much rather see that ONE request thread
> is used for both regular requests and CQE.

They issue requests in completely different ways.  There is essentially no
common code.

> 
> At least I wanna see a serious rebuttal of why my claim that we should
> keep this code down is such a pipe dream that we just have to go ahead
> and make a second instance of all the core request mechanics.
> And that this should be part of the commit message: "I'm duplicating
> the queue thread because (...)" etc.

As I wrote above, we need to start with the legacy API, because of
backporting and because we don't know how blk-mq will perform.

> 
> I'm sorry for being so conservative, but I am plain scared, I had serious
> trouble refactoring this code already, and I got the feeling that several
> people think the MMC stack has grown unapproachable for average
> developers (my Googledoc where I tried to pry it apart got very popular
> with developers I respect), and this is making that situation worse. Soon
> only you and Ulf will understand this piece of code.

The mmc block driver has problems, but the thread isn't one of them.

> 
> I do not know if I'm especially stupid when I feel the MMC stack code
> is already hard to grasp; if that is the case I need to be told.

mmc_start_areq() is very convoluted.  However, getting rid of
mmc_start_areq() is only the first step.  Step two: have the host controller
complete requests without the need of block driver polling i.e.
card_busy_detect() must not be called in the normal completion path.  Step
three: make it possible to issue the prepared request in the completion path
of the current request.  Steps two and three need host controller driver
changes.

> 
> What we need is an MMC stack where it is clear where blocks come in
> and out and how they are processed by the block layer, but now we
> already have a scary Rube Goldberg-machine and it is not getting better.
> If people have different feelings they can tell me off right now.
> 
> I have my hopes up that we can get the code lesser and more readable
> with MQ, as I tried to illustrate in my attempts, which are indeed lame
> because they don't work because of misc and SDIO use cases, but
> I'm honestly doing my best. Currently with other clean-ups to get a
> clean surface to do that.
> 
> As it stands, the MQ migration work size is in some spots doubled or
> more than doubled after this commit :(

Not true.  I have begun work on blk-mq for CQE and I will send an RFC in a
day or two.

Linus Walleij Aug. 31, 2017, 10:05 a.m. UTC | #3
On Mon, Aug 21, 2017 at 11:27 AM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 20/08/17 15:13, Linus Walleij wrote:
>> On Thu, Aug 10, 2017 at 2:08 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
>>
>>> Add CQE support to the block driver, including:
>>>         - optionally using DCMD for flush requests
>>>         - manually issuing discard requests
>>
>> "Manually" has no place in computer code IMO since there is no
>> man there. But I may be especially grumpy. But I don't understand the
>> comment anyway.
>
> CQE automatically sends the commands to complete requests.  However it only
> supports reads / writes and so-called "direct commands" (DCMD).  Furthermore
> DCMD is limited to one command at a time, but discards require 3 commands.
> That makes issuing discards through CQE very awkward, but some CQEs don't
> support DCMD anyway.  So for discards, the existing non-CQE approach is
> taken, where the mmc core code issues the 3 commands one at a time i.e.
> mmc_erase().

Ah, that's a great deal of good technical detail (I think I might have even
understood what is going on).  Can we please make sure to get this text into
the commit message?

>> So adding a new (in effect) invasive block driver needs to at least
>> be CC:ed to the block maintainers so we don't sneak anything like
>> that in under the radar.
>>
>> This duplicates the "thread that sucks out requests" from the
>> MMC core, and doubles the problem of replacing it with
>> something like blk-mq.
>
> We need to start with a legacy API because people want to backport CQ to
> earlier kernels (we really need to get features upstream more quickly), but
> blk-mq has been evolving a lot (e.g. elevator support), so backporters face
> having either something quite different from upstream or trying to backport
> great chunks of the block layer.

I hope I do not come across as rude to a senior kernel developer if I just
reference this:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/stable-api-nonsense.rst

My position is that any attempts to push around the community due
to internal product development fall under the "argument to moderation"
fallacy. Even for huge enterprises.
https://en.wikipedia.org/wiki/Argument_to_moderation

> We also don't know how blk-mq will perform so it would be prudent to start
> with support for both the legacy API and blk-mq (as scsi does) so that we
> can find out first.

As far as I have understood, that is not the position of the block layer
maintainers, who are quickly phasing out the old block layer in favor
of the new mq scheduler.

Example: they did not accept the new BFQ scheduling policy until it was
rewritten as a plugin to mq.

> A loop to remove requests from the queue is normal, e.g. scsi_request_fn(),
> and taking a consistent approach with the existing mmc thread does not
> double the problem of a blk-mq implementation, since the same approach can
> be used for both.

If the two threads are so similar, why do we need two threads?

My general feeling is that this part of the code is duct-taped on the side
of the old request poll loop instead of refactoring and integrating the new
code with the existing poll loop.

I need a clear argument as to why that is not possible if this solution
should persist.

>>> +               if (!kthread_should_stop())
>>> +                       req = blk_peek_request(q);
>>
>> Why are you using blk_peek_request() instead of blk_fetch_request()
>> when the other thread just goes with fetch?
>
> Because we are not always able to start the request.

OK I understand this now, sorry for my ignorance.

>> Am I right in assuming that this request queue also relies on the
>> implicit semantic that blk_peek_request(q) shall return NULL
>> if there are no requests in the queue, and that it is pulling out a
>> few NULLs to flush the request-handling machinery? Or did it
>> fix this? (Put that in the commit message in that case.)
>
> CQ is different.  Because read / write requests are processed asynchronously
> we can keep preparing and issuing more requests until the hardware queue is
> full.  That is, we don't have to wait for the first request to complete
> before preparing and issuing the second request, and so on.
>
> However, the existing thread's need to issue a "null" is not because it is a
> thread.  It is because of the convoluted design of mmc_start_areq().

Yeah that is true :(

> However, you still need a way to issue the prepared request when the current
> request completes.  Either wake up the thread to do it, or in the past I
> have suggested that should be done in the completion path, but that needs
> changes to the host controller API.

We might have to do that then? It is anyway necessary for MQ,
which (in my implementation) sets the queue depth to 2 and gets the
same issue with two outstanding requests.

>>> +                               break;
>>> +                       default:
>>> +                               /*
>>> +                                * Timeouts are handled by mmc core, so set a
>>> +                                * large value to avoid races.
>>> +                                */
>>> +                               req->timeout = 600 * HZ;
>>> +                               blk_start_request(req);
>>
>> And here it just starts.
>
> This is the path for requests that the CQE cannot process.  They don't need
> to be tagged because there is no command queue - no way to do more than one
> request at a time.

OK I get it.

>>> +                       switch (issued) {
>>> +                       case MMC_REQ_STARTED:
>>> +                               break;
>>> +                       case MMC_REQ_BUSY:
>>> +                               blk_requeue_request(q, req);
>>
>> Here it is requeued.
>
> Any request that is started has to be either re-queued or completed.

Yup I get it.

>>> +                               goto finished;
>>> +                       case MMC_REQ_FAILED_TO_START:
>>> +                               __blk_end_request_all(req, BLK_STS_IOERR);
>>
>> Or ended.
>
> Any request that is started has to be either re-queued or completed.

Yup.

So why can't we do all this in the existing thread instead of a new
thread?

>> I think I said in the beginning maybe not in response to this series but
>> another that I don't like the idea of making a second copy of the
>> whole request thread. I would much rather see that ONE request thread
>> is used for both regular requests and CQE.
>
> They issue requests in completely different ways.  There is essentially no
> common code.

That sounds a bit handwavy, sorry.

I need to know exactly why.

Will there also have to be two separate MQ paths when we migrate
to that?

>> At least I wanna see a serious rebuttal of why my claim that we should
>> keep this code down is such a pipe dream that we just have to go ahead
>> and make a second instance of all the core request mechanics.
>> And that this should be part of the commit message: "I'm duplicating
>> the queue thread because (...)" etc.
>
> As I wrote above, we need to start with the legacy API, because of
> backporting and because we don't know how blk-mq will perform.

Sorry I do not buy it. (See the above references.)

We need to agree to disagree and leave this to
the higher authority then.

>> I'm sorry for being so conservative, but I am plain scared, I had serious
>> trouble refactoring this code already, and I got the feeling that several
>> people think the MMC stack has grown unapproachable for average
>> developers (my Googledoc where I tried to pry it apart got very popular
>> with developers I respect), and this is making that situation worse. Soon
>> only you and Ulf will understand this piece of code.
>
> The mmc block driver has problems, but the thread isn't one of them.

The pulling out of NULLs from the end of the request queue to
flush the request pipeline is definately an abuse of the API and
that is a big problem.

> mmc_start_areq() is very convoluted.  However, getting rid of
> mmc_start_areq() is only the first step.

We are in violent agreement, at last.

> Step two: have the host controller
> complete requests without the need of block driver polling i.e.
> card_busy_detect() must not be called in the normal completion path.

I agree on this too.

> Step
> three: make it possible to issue the prepared request in the completion path
> of the current request.

I actually did patches to do that.

OK the patches suck for several reasons. But I have a proof of concept:
https://marc.info/?l=linux-mmc&m=148665460925587&w=2

Despite the convoluted subject this is what that patch does and
what the series tries to build up to.

> Steps two and three need host controller driver
> changes.

I don't know about that. I think I kind of pulled off three.

Yours,
Linus Walleij
Christoph Hellwig Aug. 31, 2017, 10:23 a.m. UTC | #4
On Sun, Aug 20, 2017 at 02:13:03PM +0200, Linus Walleij wrote:
> So adding a new (in effect) invasive block driver needs to at least
> be CC:ed to the block maintainers so we don't sneak anything like
> that in under the radar.

Yes.

> And this semaphoring and threading is just as confusing as ever and now
> we have two of them. (Sorry, I'm grumpy.)

But you are grumpy for a good reason.  The MMC driver is a pain
to understand even for a seasoned block layer developer.

> What we need is an MMC stack where it is clear where blocks come in
> and out and how they are processed by the block layer, but now we
> already have a scary Rube Goldberg-machine and it is not getting better.
> If people have different feelings they can tell me off right now.

Agreed.

> 
> I have my hopes up that we can get the code lesser and more readable
> with MQ, as I tried to illustrate in my attempts, which are indeed lame
> because they don't work because of misc and SDIO use cases, but
> I'm honestly doing my best. Currently with other clean-ups to get a
> clean surface to do that.
> 
> As it stands, the MQ migration work size is in some spots doubled or
> more than doubled after this commit :(

I don't think we should merge this.
Christoph Hellwig Aug. 31, 2017, 10:25 a.m. UTC | #5
On Mon, Aug 21, 2017 at 12:27:30PM +0300, Adrian Hunter wrote:
> We need to start with a legacy API because people want to backport CQ to
> earlier kernels (we really need to get features upstream more quickly), but
> blk-mq has been evolving a lot (e.g. elevator support), so backporters face
> having either something quite different from upstream or trying to backport
> great chunks of the block layer.

Hell no - the point of Linux mainline development is to move forward.
We'll need to do the right thing for long term Linux development FIRST
and then worry about backporting SECOND.

Stop thinking about all kernels while you wear your mainline kernel
developer hat.  Yes, occasionally we can do a small amount of reordering
to keep a small fix first for stable backports, but not to this extent
of using obsolete infrastructure and making an already messy driver
worse just to please backports to long obsolete kernels.

Adrian Hunter Aug. 31, 2017, noon UTC | #6
On 31/08/17 13:23, Christoph Hellwig wrote:
> On Sun, Aug 20, 2017 at 02:13:03PM +0200, Linus Walleij wrote:
>> So adding a new (in effect) invasive block driver needs to at least
>> be CC:ed to the block maintainers so we don't sneak anything like
>> that in under the radar.
> 
> Yes.
> 
>> And this semaphoring and threading is just as confusing as ever and now
>> we have two of them. (Sorry, I'm grumpy.)
> 
> But you are grumpy for a good reason.  The MMC driver is a pain
> to understand even for a seasoned block layer developer.
> 
>> What we need is an MMC stack where it is clear where blocks come in
>> and out and how they are processed by the block layer, but now we
>> already have a scary Rube Goldberg-machine and it is not getting better.
>> If people have different feelings they can tell me off right now.
> 
> Agreed.
> 
>>
>> I have my hopes up that we can get the code lesser and more readable
>> with MQ, as I tried to illustrate in my attempts, which are indeed lame
>> because they don't work because of misc and SDIO use cases, but
>> I'm honestly doing my best. Currently with other clean-ups to get a
>> clean surface to do that.
>>
>> As it stands, the MQ migration work size is in some spots doubled or
>> more than doubled after this commit :(
> 
> I don't think we should merge this.
> 

OK, let's merge the blk-mq version then.  Here it is in V7:

	https://marc.info/?l=linux-mmc&m=150418101630159&w=2

Patch

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index d8677b28af5c..f1fc9a3b8b53 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -108,6 +108,7 @@  struct mmc_blk_data {
 #define MMC_BLK_WRITE		BIT(1)
 #define MMC_BLK_DISCARD		BIT(2)
 #define MMC_BLK_SECDISCARD	BIT(3)
+#define MMC_BLK_CQE_RECOVERY	BIT(4)
 
 	/*
 	 * Only set in main mmc_blk_data associated
@@ -1610,6 +1611,198 @@  static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
 		*do_data_tag_p = do_data_tag;
 }
 
+#define MMC_CQE_RETRIES 2
+
+void mmc_blk_cqe_complete_rq(struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_request *mrq = &mqrq->brq.mrq;
+	struct request_queue *q = req->q;
+	struct mmc_queue *mq = q->queuedata;
+	struct mmc_host *host = mq->card->host;
+	unsigned long flags;
+	bool put_card;
+	int err;
+
+	mmc_cqe_post_req(host, mrq);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	mq->cqe_in_flight[mmc_cqe_issue_type(host, req)] -= 1;
+
+	put_card = mmc_cqe_tot_in_flight(mq) == 0;
+
+	if (mrq->cmd && mrq->cmd->error)
+		err = mrq->cmd->error;
+	else if (mrq->data && mrq->data->error)
+		err = mrq->data->error;
+	else
+		err = 0;
+
+	if (err) {
+		if (mqrq->retries++ < MMC_CQE_RETRIES)
+			blk_requeue_request(q, req);
+		else
+			__blk_end_request_all(req, BLK_STS_IOERR);
+	} else if (mrq->data) {
+		if (__blk_end_request(req, BLK_STS_OK, mrq->data->bytes_xfered))
+			blk_requeue_request(q, req);
+	} else {
+		__blk_end_request_all(req, BLK_STS_OK);
+	}
+
+	mmc_cqe_kick_queue(mq);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	if (put_card)
+		mmc_put_card(mq->card);
+}
+
+void mmc_blk_cqe_recovery(struct mmc_queue *mq)
+{
+	struct mmc_card *card = mq->card;
+	struct mmc_host *host = card->host;
+	int err;
+
+	mmc_get_card(card);
+
+	pr_debug("%s: CQE recovery start\n", mmc_hostname(host));
+
+	mq->cqe_in_recovery = true;
+
+	err = mmc_cqe_recovery(host);
+	if (err)
+		mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY);
+	else
+		mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY);
+
+	mq->cqe_in_recovery = false;
+
+	pr_debug("%s: CQE recovery done\n", mmc_hostname(host));
+
+	mmc_put_card(card);
+}
+
+static void mmc_blk_cqe_req_done(struct mmc_request *mrq)
+{
+	struct mmc_queue_req *mqrq = container_of(mrq, struct mmc_queue_req,
+						  brq.mrq);
+	struct request *req = mmc_queue_req_to_req(mqrq);
+	struct request_queue *q = req->q;
+	struct mmc_queue *mq = q->queuedata;
+
+	/*
+	 * Block layer timeouts race with completions which means the normal
+	 * completion path cannot be used during recovery.
+	 */
+	if (mq->cqe_in_recovery)
+		mmc_blk_cqe_complete_rq(req);
+	else
+		blk_complete_request(req);
+}
+
+static int mmc_blk_cqe_start_req(struct mmc_host *host, struct mmc_request *mrq)
+{
+	mrq->done = mmc_blk_cqe_req_done;
+	return mmc_cqe_start_req(host, mrq);
+}
+
+static struct mmc_request *mmc_blk_cqe_prep_dcmd(struct mmc_queue_req *mqrq,
+						 struct request *req)
+{
+	struct mmc_blk_request *brq = &mqrq->brq;
+
+	memset(brq, 0, sizeof(*brq));
+
+	brq->mrq.cmd = &brq->cmd;
+	brq->mrq.tag = req->tag;
+
+	return &brq->mrq;
+}
+
+static int mmc_blk_cqe_issue_flush(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_request *mrq = mmc_blk_cqe_prep_dcmd(mqrq, req);
+
+	mrq->cmd->opcode = MMC_SWITCH;
+	mrq->cmd->arg = (MMC_SWITCH_MODE_WRITE_BYTE << 24) |
+			(EXT_CSD_FLUSH_CACHE << 16) |
+			(1 << 8) |
+			EXT_CSD_CMD_SET_NORMAL;
+	mrq->cmd->flags = MMC_CMD_AC | MMC_RSP_R1B;
+
+	return mmc_blk_cqe_start_req(mq->card->host, mrq);
+}
+
+static int mmc_blk_cqe_issue_rw_rq(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+
+	mmc_blk_data_prep(mq, mqrq, 0, NULL, NULL);
+
+	return mmc_blk_cqe_start_req(mq->card->host, &mqrq->brq.mrq);
+}
+
+enum mmc_issued mmc_blk_cqe_issue_rq(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_blk_data *md = mq->blkdata;
+	struct mmc_card *card = md->queue.card;
+	struct mmc_host *host = card->host;
+	int ret;
+
+	ret = mmc_blk_part_switch(card, md);
+	if (ret)
+		return MMC_REQ_FAILED_TO_START;
+
+	switch (mmc_cqe_issue_type(host, req)) {
+	case MMC_ISSUE_SYNC:
+		ret = host->cqe_ops->cqe_wait_for_idle(host);
+		if (ret)
+			return MMC_REQ_BUSY;
+		switch (req_op(req)) {
+		case REQ_OP_DRV_IN:
+		case REQ_OP_DRV_OUT:
+			mmc_blk_issue_drv_op(mq, req);
+			break;
+		case REQ_OP_DISCARD:
+			mmc_blk_issue_discard_rq(mq, req);
+			break;
+		case REQ_OP_SECURE_ERASE:
+			mmc_blk_issue_secdiscard_rq(mq, req);
+			break;
+		case REQ_OP_FLUSH:
+			mmc_blk_issue_flush(mq, req);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return MMC_REQ_FAILED_TO_START;
+		}
+		return MMC_REQ_FINISHED;
+	case MMC_ISSUE_DCMD:
+	case MMC_ISSUE_ASYNC:
+		switch (req_op(req)) {
+		case REQ_OP_FLUSH:
+			ret = mmc_blk_cqe_issue_flush(mq, req);
+			break;
+		case REQ_OP_READ:
+		case REQ_OP_WRITE:
+			ret = mmc_blk_cqe_issue_rw_rq(mq, req);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			ret = -EINVAL;
+		}
+		if (!ret)
+			return MMC_REQ_STARTED;
+		return ret == -EBUSY ? MMC_REQ_BUSY : MMC_REQ_FAILED_TO_START;
+	default:
+		WARN_ON_ONCE(1);
+		return MMC_REQ_FAILED_TO_START;
+	}
+}
+
 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
 			       struct mmc_card *card,
 			       int disable_multi,
@@ -2033,7 +2226,7 @@  static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 	INIT_LIST_HEAD(&md->part);
 	md->usage = 1;
 
-	ret = mmc_init_queue(&md->queue, card, &md->lock, subname);
+	ret = mmc_init_queue(&md->queue, card, &md->lock, subname, area_type);
 	if (ret)
 		goto err_putdisk;
 
diff --git a/drivers/mmc/core/block.h b/drivers/mmc/core/block.h
index 860ca7c8df86..d7b3d7008b00 100644
--- a/drivers/mmc/core/block.h
+++ b/drivers/mmc/core/block.h
@@ -6,4 +6,11 @@ 
 
 void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req);
 
+enum mmc_issued;
+
+enum mmc_issued mmc_blk_cqe_issue_rq(struct mmc_queue *mq,
+				     struct request *req);
+void mmc_blk_cqe_complete_rq(struct request *rq);
+void mmc_blk_cqe_recovery(struct mmc_queue *mq);
+
 #endif
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index affa7370ba82..0cb7b0e8ee58 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -36,10 +36,254 @@  static int mmc_prep_request(struct request_queue *q, struct request *req)
 		return BLKPREP_KILL;
 
 	req->rq_flags |= RQF_DONTPREP;
+	req_to_mmc_queue_req(req)->retries = 0;
 
 	return BLKPREP_OK;
 }
 
+static void mmc_cqe_request_fn(struct request_queue *q)
+{
+	struct mmc_queue *mq = q->queuedata;
+	struct request *req;
+
+	if (!mq) {
+		while ((req = blk_fetch_request(q)) != NULL) {
+			req->rq_flags |= RQF_QUIET;
+			__blk_end_request_all(req, BLK_STS_IOERR);
+		}
+		return;
+	}
+
+	if (mq->asleep && !mq->cqe_busy)
+		wake_up_process(mq->thread);
+}
+
+static inline bool mmc_cqe_dcmd_busy(struct mmc_queue *mq)
+{
+	/* Allow only 1 DCMD at a time */
+	return mq->cqe_in_flight[MMC_ISSUE_DCMD];
+}
+
+void mmc_cqe_kick_queue(struct mmc_queue *mq)
+{
+	if ((mq->cqe_busy & MMC_CQE_DCMD_BUSY) && !mmc_cqe_dcmd_busy(mq))
+		mq->cqe_busy &= ~MMC_CQE_DCMD_BUSY;
+
+	mq->cqe_busy &= ~MMC_CQE_QUEUE_FULL;
+
+	if (mq->asleep && !mq->cqe_busy)
+		__blk_run_queue(mq->queue);
+}
+
+static inline bool mmc_cqe_can_dcmd(struct mmc_host *host)
+{
+	return host->caps2 & MMC_CAP2_CQE_DCMD;
+}
+
+enum mmc_issue_type mmc_cqe_issue_type(struct mmc_host *host,
+				       struct request *req)
+{
+	switch (req_op(req)) {
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+		return MMC_ISSUE_SYNC;
+	case REQ_OP_FLUSH:
+		return mmc_cqe_can_dcmd(host) ? MMC_ISSUE_DCMD : MMC_ISSUE_SYNC;
+	default:
+		return MMC_ISSUE_ASYNC;
+	}
+}
+
+static void __mmc_cqe_recovery_notifier(struct mmc_queue *mq)
+{
+	if (!mq->cqe_recovery_needed) {
+		mq->cqe_recovery_needed = true;
+		wake_up_process(mq->thread);
+	}
+}
+
+static void mmc_cqe_recovery_notifier(struct mmc_host *host,
+				      struct mmc_request *mrq)
+{
+	struct mmc_queue_req *mqrq = container_of(mrq, struct mmc_queue_req,
+						  brq.mrq);
+	struct request *req = mmc_queue_req_to_req(mqrq);
+	struct request_queue *q = req->q;
+	struct mmc_queue *mq = q->queuedata;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__mmc_cqe_recovery_notifier(mq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static int mmc_cqe_thread(void *d)
+{
+	struct mmc_queue *mq = d;
+	struct request_queue *q = mq->queue;
+	struct mmc_card *card = mq->card;
+	struct mmc_host *host = card->host;
+	unsigned long flags;
+	int get_put = 0;
+
+	current->flags |= PF_MEMALLOC;
+
+	down(&mq->thread_sem);
+	spin_lock_irqsave(q->queue_lock, flags);
+	while (1) {
+		struct request *req = NULL;
+		enum mmc_issue_type issue_type;
+		bool retune_ok = false;
+
+		if (mq->cqe_recovery_needed) {
+			spin_unlock_irqrestore(q->queue_lock, flags);
+			mmc_blk_cqe_recovery(mq);
+			spin_lock_irqsave(q->queue_lock, flags);
+			mq->cqe_recovery_needed = false;
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (!kthread_should_stop())
+			req = blk_peek_request(q);
+
+		if (req) {
+			issue_type = mmc_cqe_issue_type(host, req);
+			switch (issue_type) {
+			case MMC_ISSUE_DCMD:
+				if (mmc_cqe_dcmd_busy(mq)) {
+					mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
+					req = NULL;
+					break;
+				}
+				/* Fall through */
+			case MMC_ISSUE_ASYNC:
+				if (blk_queue_start_tag(q, req)) {
+					mq->cqe_busy |= MMC_CQE_QUEUE_FULL;
+					req = NULL;
+				}
+				break;
+			default:
+				/*
+				 * Timeouts are handled by mmc core, so set a
+				 * large value to avoid races.
+				 */
+				req->timeout = 600 * HZ;
+				blk_start_request(req);
+				break;
+			}
+			if (req) {
+				mq->cqe_in_flight[issue_type] += 1;
+				if (mmc_cqe_tot_in_flight(mq) == 1)
+					get_put += 1;
+				if (mmc_cqe_qcnt(mq) == 1)
+					retune_ok = true;
+			}
+		}
+
+		mq->asleep = !req;
+
+		spin_unlock_irqrestore(q->queue_lock, flags);
+
+		if (req) {
+			enum mmc_issued issued;
+
+			set_current_state(TASK_RUNNING);
+
+			if (get_put) {
+				get_put = 0;
+				mmc_get_card(card);
+			}
+
+			if (host->need_retune && retune_ok &&
+			    !host->hold_retune)
+				host->retune_now = true;
+			else
+				host->retune_now = false;
+
+			issued = mmc_blk_cqe_issue_rq(mq, req);
+
+			cond_resched();
+
+			spin_lock_irqsave(q->queue_lock, flags);
+
+			switch (issued) {
+			case MMC_REQ_STARTED:
+				break;
+			case MMC_REQ_BUSY:
+				blk_requeue_request(q, req);
+				goto finished;
+			case MMC_REQ_FAILED_TO_START:
+				__blk_end_request_all(req, BLK_STS_IOERR);
+				/* Fall through */
+			case MMC_REQ_FINISHED:
+finished:
+				mq->cqe_in_flight[issue_type] -= 1;
+				if (mmc_cqe_tot_in_flight(mq) == 0)
+					get_put = -1;
+			}
+		} else {
+			if (get_put < 0) {
+				get_put = 0;
+				mmc_put_card(card);
+			}
+			/*
+			 * Do not stop with requests in flight in case recovery
+			 * is needed.
+			 */
+			if (kthread_should_stop() &&
+			    !mmc_cqe_tot_in_flight(mq)) {
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+			up(&mq->thread_sem);
+			schedule();
+			down(&mq->thread_sem);
+			spin_lock_irqsave(q->queue_lock, flags);
+		}
+	} /* loop */
+	up(&mq->thread_sem);
+
+	return 0;
+}
+
+static enum blk_eh_timer_return __mmc_cqe_timed_out(struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_request *mrq = &mqrq->brq.mrq;
+	struct mmc_queue *mq = req->q->queuedata;
+	struct mmc_host *host = mq->card->host;
+	enum mmc_issue_type issue_type = mmc_cqe_issue_type(host, req);
+	bool recovery_needed = false;
+
+	switch (issue_type) {
+	case MMC_ISSUE_ASYNC:
+	case MMC_ISSUE_DCMD:
+		if (host->cqe_ops->cqe_timeout(host, mrq, &recovery_needed)) {
+			if (recovery_needed)
+				__mmc_cqe_recovery_notifier(mq);
+			return BLK_EH_RESET_TIMER;
+		}
+		/* No timeout */
+		return BLK_EH_HANDLED;
+	default:
+		/* Timeout is handled by mmc core */
+		return BLK_EH_RESET_TIMER;
+	}
+}
+
+static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
+{
+	struct mmc_queue *mq = req->q->queuedata;
+
+	if (mq->cqe_recovery_needed)
+		return BLK_EH_RESET_TIMER;
+
+	return __mmc_cqe_timed_out(req);
+}
+
 static int mmc_queue_thread(void *d)
 {
 	struct mmc_queue *mq = d;
@@ -233,11 +477,12 @@  static void mmc_exit_request(struct request_queue *q, struct request *req)
  * Initialise a MMC card request queue.
  */
 int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
-		   spinlock_t *lock, const char *subname)
+		   spinlock_t *lock, const char *subname, int area_type)
 {
 	struct mmc_host *host = card->host;
 	u64 limit = BLK_BOUNCE_HIGH;
 	int ret = -ENOMEM;
+	bool use_cqe = host->cqe_enabled && area_type != MMC_BLK_DATA_AREA_RPMB;
 
 	if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
 		limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
@@ -247,7 +492,7 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	if (!mq->queue)
 		return -ENOMEM;
 	mq->queue->queue_lock = lock;
-	mq->queue->request_fn = mmc_request_fn;
+	mq->queue->request_fn = use_cqe ? mmc_cqe_request_fn : mmc_request_fn;
 	mq->queue->init_rq_fn = mmc_init_request;
 	mq->queue->exit_rq_fn = mmc_exit_request;
 	mq->queue->cmd_size = sizeof(struct mmc_queue_req);
@@ -259,6 +504,24 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 		return ret;
 	}
 
+	if (use_cqe) {
+		int q_depth = card->ext_csd.cmdq_depth;
+
+		if (q_depth > host->cqe_qdepth)
+			q_depth = host->cqe_qdepth;
+
+		ret = blk_queue_init_tags(mq->queue, q_depth, NULL,
+					  BLK_TAG_ALLOC_FIFO);
+		if (ret)
+			goto cleanup_queue;
+
+		blk_queue_softirq_done(mq->queue, mmc_blk_cqe_complete_rq);
+		blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
+		blk_queue_rq_timeout(mq->queue, 60 * HZ);
+
+		host->cqe_recovery_notifier = mmc_cqe_recovery_notifier;
+	}
+
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue);
@@ -280,9 +543,9 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 
 	sema_init(&mq->thread_sem, 1);
 
-	mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd/%d%s",
-		host->index, subname ? subname : "");
-
+	mq->thread = kthread_run(use_cqe ? mmc_cqe_thread : mmc_queue_thread,
+				 mq, "mmcqd/%d%s", host->index,
+				 subname ? subname : "");
 	if (IS_ERR(mq->thread)) {
 		ret = PTR_ERR(mq->thread);
 		goto cleanup_queue;
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 361b46408e0f..8e9273d977c0 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -7,6 +7,20 @@ 
 #include <linux/mmc/core.h>
 #include <linux/mmc/host.h>
 
+enum mmc_issued {
+	MMC_REQ_STARTED,
+	MMC_REQ_BUSY,
+	MMC_REQ_FAILED_TO_START,
+	MMC_REQ_FINISHED,
+};
+
+enum mmc_issue_type {
+	MMC_ISSUE_SYNC,
+	MMC_ISSUE_DCMD,
+	MMC_ISSUE_ASYNC,
+	MMC_ISSUE_MAX,
+};
+
 static inline struct mmc_queue_req *req_to_mmc_queue_req(struct request *rq)
 {
 	return blk_mq_rq_to_pdu(rq);
@@ -53,6 +67,7 @@  struct mmc_queue_req {
 	int			drv_op_result;
 	struct mmc_blk_ioc_data	**idata;
 	unsigned int		ioc_count;
+	int			retries;
 };
 
 struct mmc_queue {
@@ -70,10 +85,17 @@  struct mmc_queue {
 	 * associated mmc_queue_req data.
 	 */
 	int			qcnt;
+	/* Following are defined for a Command Queue Engine */
+	int			cqe_in_flight[MMC_ISSUE_MAX];
+	unsigned int		cqe_busy;
+	bool			cqe_recovery_needed;
+	bool			cqe_in_recovery;
+#define MMC_CQE_DCMD_BUSY	BIT(0)
+#define MMC_CQE_QUEUE_FULL	BIT(1)
 };
 
 extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
-			  const char *);
+			  const char *, int);
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);
@@ -85,4 +107,22 @@  extern unsigned int mmc_queue_map_sg(struct mmc_queue *,
 
 extern int mmc_access_rpmb(struct mmc_queue *);
 
+void mmc_cqe_kick_queue(struct mmc_queue *mq);
+
+enum mmc_issue_type mmc_cqe_issue_type(struct mmc_host *host,
+				       struct request *req);
+
+static inline int mmc_cqe_tot_in_flight(struct mmc_queue *mq)
+{
+	return mq->cqe_in_flight[MMC_ISSUE_SYNC] +
+	       mq->cqe_in_flight[MMC_ISSUE_DCMD] +
+	       mq->cqe_in_flight[MMC_ISSUE_ASYNC];
+}
+
+static inline int mmc_cqe_qcnt(struct mmc_queue *mq)
+{
+	return mq->cqe_in_flight[MMC_ISSUE_DCMD] +
+	       mq->cqe_in_flight[MMC_ISSUE_ASYNC];
+}
+
 #endif