Message ID | 20211117033807.185715-5-axboe@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Series | Add support for list issue |
On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.
>
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
>
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.

The concept looks sensible, but the loop in nvme_queue_rqs is a complete
mess to follow. What about something like this (untested) on top?

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 13722cc400c2c..555a7609580c7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
         nvmeq->sq_tail = 0;
 }
 
-/**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
- * @nvmeq: The queue to use
- * @cmd: The command to send
- * @write_sq: whether to write to the SQ doorbell
- */
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
-                            bool write_sq)
-{
-        spin_lock(&nvmeq->sq_lock);
-        nvme_copy_cmd(nvmeq, cmd);
-        nvme_write_sq_db(nvmeq, write_sq);
-        spin_unlock(&nvmeq->sq_lock);
-}
-
 static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 {
         struct nvme_queue *nvmeq = hctx->driver_data;
@@ -918,8 +903,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
         return BLK_STS_OK;
 }
 
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
-                struct request *req, struct nvme_command *cmnd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
         blk_status_t ret;
@@ -928,18 +912,18 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
         iod->npages = -1;
         iod->nents = 0;
 
-        ret = nvme_setup_cmd(ns, req);
+        ret = nvme_setup_cmd(req->q->queuedata, req);
         if (ret)
                 return ret;
 
         if (blk_rq_nr_phys_segments(req)) {
-                ret = nvme_map_data(dev, req, cmnd);
+                ret = nvme_map_data(dev, req, &iod->cmd);
                 if (ret)
                         goto out_free_cmd;
         }
 
         if (blk_integrity_rq(req)) {
-                ret = nvme_map_metadata(dev, req, cmnd);
+                ret = nvme_map_metadata(dev, req, &iod->cmd);
                 if (ret)
                         goto out_unmap_data;
         }
@@ -959,7 +943,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
                          const struct blk_mq_queue_data *bd)
 {
-        struct nvme_ns *ns = hctx->queue->queuedata;
         struct nvme_queue *nvmeq = hctx->driver_data;
         struct nvme_dev *dev = nvmeq->dev;
         struct request *req = bd->rq;
@@ -976,12 +959,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
         if (!nvme_check_ready(&dev->ctrl, req, true))
                 return nvme_fail_nonready_command(&dev->ctrl, req);
 
-        ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
-        if (ret == BLK_STS_OK) {
-                nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
-                return BLK_STS_OK;
-        }
-        return ret;
+        ret = nvme_prep_rq(dev, req);
+        if (ret)
+                return ret;
+
+        spin_lock(&nvmeq->sq_lock);
+        nvme_copy_cmd(nvmeq, &iod->cmd);
+        nvme_write_sq_db(nvmeq, bd->last);
+        spin_unlock(&nvmeq->sq_lock);
+        return BLK_STS_OK;
 }
 
 static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
@@ -997,56 +983,47 @@ static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
         spin_unlock(&nvmeq->sq_lock);
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 {
-        struct request *requeue_list = NULL, *req, *prev = NULL;
-        struct blk_mq_hw_ctx *hctx;
-        struct nvme_queue *nvmeq;
-        struct nvme_ns *ns;
-
-restart:
-        req = rq_list_peek(rqlist);
-        hctx = req->mq_hctx;
-        nvmeq = hctx->driver_data;
-        ns = hctx->queue->queuedata;
-
         /*
          * We should not need to do this, but we're still using this to
          * ensure we can drain requests on a dying queue.
          */
         if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-                return;
+                return false;
+        if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+                return false;
 
-        rq_list_for_each(rqlist, req) {
-                struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-                blk_status_t ret;
+        req->mq_hctx->tags->rqs[req->tag] = req;
+        return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
 
-                if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
-                        goto requeue;
+static void nvme_queue_rqs(struct request **rqlist)
+{
+        struct request *req = rq_list_peek(rqlist), *prev = NULL;
+        struct request *requeue_list = NULL;
+
+        do {
+                struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+                if (!nvme_prep_rq_batch(nvmeq, req)) {
+                        /* detach 'req' and add to remainder list */
+                        if (prev)
+                                prev->rq_next = req->rq_next;
+                        rq_list_add(&requeue_list, req);
+                } else {
+                        prev = req;
+                }
 
-                if (req->mq_hctx != hctx) {
+                req = rq_list_next(req);
+                if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
                         /* detach rest of list, and submit */
                         prev->rq_next = NULL;
                         nvme_submit_cmds(nvmeq, rqlist);
-                        /* req now start of new list for this hw queue */
                         *rqlist = req;
-                        goto restart;
-                }
-
-                hctx->tags->rqs[req->tag] = req;
-                ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
-                if (ret == BLK_STS_OK) {
-                        prev = req;
-                        continue;
                 }
-requeue:
-                /* detach 'req' and add to remainder list */
-                if (prev)
-                        prev->rq_next = req->rq_next;
-                rq_list_add(&requeue_list, req);
-        }
+        } while (req);
 
-        nvme_submit_cmds(nvmeq, rqlist);
         *rqlist = requeue_list;
 }
 
@@ -1224,7 +1201,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
         c.common.opcode = nvme_admin_async_event;
         c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
 
-        nvme_submit_cmd(nvmeq, &c, true);
+
+        spin_lock(&nvmeq->sq_lock);
+        nvme_copy_cmd(nvmeq, &c);
+        nvme_write_sq_db(nvmeq, true);
+        spin_unlock(&nvmeq->sq_lock);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
On 11/17/21 1:39 AM, Christoph Hellwig wrote:
> On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
>> This enables the block layer to send us a full plug list of requests
>> that need submitting. The block layer guarantees that they all belong
>> to the same queue, but we do have to check the hardware queue mapping
>> for each request.
>>
>> If errors are encountered, leave them in the passed in list. Then the
>> block layer will handle them individually.
>>
>> This is good for about a 4% improvement in peak performance, taking us
>> from 9.6M to 10M IOPS/core.
>
> The concept looks sensible, but the loop in nvme_queue_rqs is a complete
> mess to follow. What about something like this (untested) on top?

Let me take a closer look.

> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 13722cc400c2c..555a7609580c7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
>          nvmeq->sq_tail = 0;
>  }
>  
> -/**
> - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
> - * @nvmeq: The queue to use
> - * @cmd: The command to send
> - * @write_sq: whether to write to the SQ doorbell
> - */
> -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
> -                            bool write_sq)
> -{
> -        spin_lock(&nvmeq->sq_lock);
> -        nvme_copy_cmd(nvmeq, cmd);
> -        nvme_write_sq_db(nvmeq, write_sq);
> -        spin_unlock(&nvmeq->sq_lock);
> -}

You really don't like helpers? Code generation wise it doesn't matter,
but without this and the copy helper we do end up having some trivial
duplicated code...
On 11/17/21 8:55 AM, Jens Axboe wrote:
> On 11/17/21 1:39 AM, Christoph Hellwig wrote:
>> On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
>>> This enables the block layer to send us a full plug list of requests
>>> that need submitting. The block layer guarantees that they all belong
>>> to the same queue, but we do have to check the hardware queue mapping
>>> for each request.
>>>
>>> If errors are encountered, leave them in the passed in list. Then the
>>> block layer will handle them individually.
>>>
>>> This is good for about a 4% improvement in peak performance, taking us
>>> from 9.6M to 10M IOPS/core.
>>
>> The concept looks sensible, but the loop in nvme_queue_rqs is a complete
>> mess to follow. What about something like this (untested) on top?
>
> Let me take a closer look.

Something changed, efficiency is way down:

    2.26%  +4.34%  [nvme]  [k]  nvme_queue_rqs
[adding linux-nvme]

On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.

So this means that the nvme namespace will always be the same for all
the requests in the list, but the rqlist may contain requests allocated
from different CPUs?

> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
>
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 67 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d2b654fc3603..2eedd04b1f90 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1004,6 +1004,72 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>          return ret;
>  }
>  
> +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
> +{
> +        spin_lock(&nvmeq->sq_lock);
> +        while (!rq_list_empty(*rqlist)) {
> +                struct request *req = rq_list_pop(rqlist);
> +                struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +                nvme_copy_cmd(nvmeq, absolute_pointer(&iod->cmd));
> +        }
> +        nvme_write_sq_db(nvmeq, true);
> +        spin_unlock(&nvmeq->sq_lock);
> +}
> +
> +static void nvme_queue_rqs(struct request **rqlist)
> +{
> +        struct request *requeue_list = NULL, *req, *prev = NULL;
> +        struct blk_mq_hw_ctx *hctx;
> +        struct nvme_queue *nvmeq;
> +        struct nvme_ns *ns;
> +
> +restart:
> +        req = rq_list_peek(rqlist);
> +        hctx = req->mq_hctx;
> +        nvmeq = hctx->driver_data;
> +        ns = hctx->queue->queuedata;
> +
> +        /*
> +         * We should not need to do this, but we're still using this to
> +         * ensure we can drain requests on a dying queue.
> +         */
> +        if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
> +                return;
> +
> +        rq_list_for_each(rqlist, req) {
> +                struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +                blk_status_t ret;
> +
> +                if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
> +                        goto requeue;
> +
> +                if (req->mq_hctx != hctx) {
> +                        /* detach rest of list, and submit */
> +                        prev->rq_next = NULL;
> +                        nvme_submit_cmds(nvmeq, rqlist);
> +                        /* req now start of new list for this hw queue */
> +                        *rqlist = req;
> +                        goto restart;
> +                }
> +
> +                hctx->tags->rqs[req->tag] = req;

After checking how this is handled previously, it appears this new
.queue_rqs() skips incrementing active requests, and bypasses the
hctx_lock(). Won't that break quiesce?

> +                ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
> +                if (ret == BLK_STS_OK) {
> +                        prev = req;
> +                        continue;
> +                }
> +requeue:
> +                /* detach 'req' and add to remainder list */
> +                if (prev)
> +                        prev->rq_next = req->rq_next;
> +                rq_list_add(&requeue_list, req);
> +        }
> +
> +        nvme_submit_cmds(nvmeq, rqlist);
> +        *rqlist = requeue_list;
> +}
> +
>  static __always_inline void nvme_pci_unmap_rq(struct request *req)
>  {
>          struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -1741,6 +1807,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
>  
>  static const struct blk_mq_ops nvme_mq_ops = {
>          .queue_rq       = nvme_queue_rq,
> +        .queue_rqs      = nvme_queue_rqs,
>          .complete       = nvme_pci_complete_rq,
>          .commit_rqs     = nvme_commit_rqs,
>          .init_hctx      = nvme_init_hctx,
> --
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d2b654fc3603..2eedd04b1f90 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1004,6 +1004,72 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
         return ret;
 }
 
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+{
+        spin_lock(&nvmeq->sq_lock);
+        while (!rq_list_empty(*rqlist)) {
+                struct request *req = rq_list_pop(rqlist);
+                struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+                nvme_copy_cmd(nvmeq, absolute_pointer(&iod->cmd));
+        }
+        nvme_write_sq_db(nvmeq, true);
+        spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_queue_rqs(struct request **rqlist)
+{
+        struct request *requeue_list = NULL, *req, *prev = NULL;
+        struct blk_mq_hw_ctx *hctx;
+        struct nvme_queue *nvmeq;
+        struct nvme_ns *ns;
+
+restart:
+        req = rq_list_peek(rqlist);
+        hctx = req->mq_hctx;
+        nvmeq = hctx->driver_data;
+        ns = hctx->queue->queuedata;
+
+        /*
+         * We should not need to do this, but we're still using this to
+         * ensure we can drain requests on a dying queue.
+         */
+        if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+                return;
+
+        rq_list_for_each(rqlist, req) {
+                struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+                blk_status_t ret;
+
+                if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+                        goto requeue;
+
+                if (req->mq_hctx != hctx) {
+                        /* detach rest of list, and submit */
+                        prev->rq_next = NULL;
+                        nvme_submit_cmds(nvmeq, rqlist);
+                        /* req now start of new list for this hw queue */
+                        *rqlist = req;
+                        goto restart;
+                }
+
+                hctx->tags->rqs[req->tag] = req;
+                ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
+                if (ret == BLK_STS_OK) {
+                        prev = req;
+                        continue;
+                }
+requeue:
+                /* detach 'req' and add to remainder list */
+                if (prev)
+                        prev->rq_next = req->rq_next;
+                rq_list_add(&requeue_list, req);
+        }
+
+        nvme_submit_cmds(nvmeq, rqlist);
+        *rqlist = requeue_list;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1741,6 +1807,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 
 static const struct blk_mq_ops nvme_mq_ops = {
         .queue_rq       = nvme_queue_rq,
+        .queue_rqs      = nvme_queue_rqs,
         .complete       = nvme_pci_complete_rq,
         .commit_rqs     = nvme_commit_rqs,
         .init_hctx      = nvme_init_hctx,
This enables the block layer to send us a full plug list of requests
that need submitting. The block layer guarantees that they all belong
to the same queue, but we do have to check the hardware queue mapping
for each request.

If errors are encountered, leave them in the passed in list. Then the
block layer will handle them individually.

This is good for about a 4% improvement in peak performance, taking us
from 9.6M to 10M IOPS/core.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 67 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
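To make the list handling described above easier to follow, here is a small
stand-alone user-space sketch of the splitting logic nvme_queue_rqs() performs
on the singly linked plug list: requests that fail preparation are detached
onto a requeue list for the block layer to handle individually, and each
contiguous run of requests mapping to the same hardware queue is submitted as
one batch. The struct fields and helper names below (struct req, prep_ok,
submit_run, queue_rqs) are simplified stand-ins invented for illustration,
not the kernel's request/rq_list API, and the NVMEQ_ENABLED, nvme_check_ready()
and tags->rqs[] handling discussed in the review is deliberately left out.

/* Toy model of the plug-list splitting done by nvme_queue_rqs(). */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct req {
        int tag;
        int hctx;               /* hardware queue this request maps to */
        bool prep_ok;           /* stand-in for nvme_prep_rq() succeeding */
        struct req *next;       /* models rq_next */
};

/* Submit one contiguous run of requests that share a hardware queue. */
static void submit_run(struct req *head)
{
        for (struct req *r = head; r; r = r->next)
                printf("submit tag %d on hctx %d\n", r->tag, r->hctx);
}

/* Walk the plug list: failed requests are detached onto *requeue, and each
 * same-hctx run is cut off the front of the list and submitted as a batch. */
static void queue_rqs(struct req **list, struct req **requeue)
{
        struct req *req = *list, *prev = NULL;

        do {
                struct req *next = req->next;

                if (!req->prep_ok) {
                        /* detach 'req' and push it onto the requeue list */
                        if (prev)
                                prev->next = next;
                        else
                                *list = next;
                        req->next = *requeue;
                        *requeue = req;
                } else {
                        prev = req;
                }

                /* End of list, or the next request maps to a different
                 * hardware queue: submit what has been batched so far. */
                if (!next || (prev && next->hctx != prev->hctx)) {
                        if (prev) {
                                prev->next = NULL;
                                submit_run(*list);
                        }
                        *list = next;
                        prev = NULL;
                }
                req = next;
        } while (req);
}

int main(void)
{
        struct req r3 = { 3, 1, true,  NULL };
        struct req r2 = { 2, 0, false, &r3 };
        struct req r1 = { 1, 0, true,  &r2 };
        struct req *list = &r1, *requeue = NULL;

        queue_rqs(&list, &requeue);
        for (struct req *r = requeue; r; r = r->next)
                printf("requeue tag %d\n", r->tag);
        return 0;
}

Running this submits tag 1 (hctx 0) and tag 3 (hctx 1) in separate batches and
leaves tag 2 on the requeue list, which mirrors how the patch leaves failed
requests in the passed-in list for the block layer to retry individually.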