Message ID | 20211012181742.672391-7-axboe@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Series | Batched completions |
On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
> Take advantage of struct io_batch, if passed in to the nvme poll handler.
> If it's set, rather than complete each request individually inline, store
> them in the io_batch list. We only do so for requests that will complete
> successfully, anything else will be completed inline as before.
>
> Add an mq_ops->complete_batch() handler to do the post-processing of
> the io_batch list once polling is complete.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 63 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>  			rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>  	if (blk_rq_nr_phys_segments(req))
>  		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>  	nvme_complete_rq(req);
>  }
>
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);
> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +		    req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);

I hate all this open coding. All the common logic needs to be in a
common helper. Also please add a for_each macro for the request list
iteration. I already thought about that for the batch allocation in
for-next, but with ever more callers this becomes essential.

> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
> +		enum nvme_disposition ret;
> +
> +		ret = nvme_decide_disposition(req);
> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
> +			nvme_pci_complete_rq(req);

This actually is the likely case as only polling ever does the batch
completion. In doubt I'd prefer if we can avoid these likely/unlikely
annotations as much as possible.

> +		} else {
> +			req->rq_next = ib->req_list;
> +			ib->req_list = req;
> +		}

And all this list manipulation should use proper helper.

> +	}

Also - can you look into turning this logic into an inline function with
a callback for the driver? I think in general gcc will avoid the
indirect call for function pointers passed directly. That way we can
keep a nice code structure but also avoid the indirections.

Same for nvme_pci_complete_batch.
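For readers following the thread, a minimal sketch of the kind of iteration macro being asked for here, assuming the rq_list_peek()/rq_list_next() helpers that show up in the revised patches later in the thread; the helper actually added by the prep patch may differ:

```c
/*
 * Sketch only: walk a singly linked request list of the kind struct
 * io_batch carries. Assumes rq_list_peek()/rq_list_next() as used later
 * in this thread; the real helper may differ.
 */
#define rq_list_for_each(listptr, pos)					\
	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
```

With something like this, the open-coded "while (req) { ...; req = req->rq_next; }" loops above reduce to "rq_list_for_each(&ib->req_list, req) { ... }".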
On 12/10/2021 19:17, Jens Axboe wrote:
> Signed-off-by: Jens Axboe<axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 63 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>  			rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>  	if (blk_rq_nr_phys_segments(req))
>  		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>  	nvme_complete_rq(req);
>  }
>
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);

This will do the DMA SG unmap per request. Often this is a performance
bottle neck when we have an IOMMU enabled in strict mode. So since we
complete in batches, could we combine all the SGs in the batch to do one
big DMA unmap SG, and not one-by-one?

Just a thought.

> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +		    req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);
> +}
On 10/13/21 3:09 AM, John Garry wrote:
> On 12/10/2021 19:17, Jens Axboe wrote:
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
>
> This will do the DMA SG unmap per request. Often this is a performance
> bottle neck when we have an IOMMU enabled in strict mode. So since we
> complete in batches, could we combine all the SGs in the batch to do one
> big DMA unmap SG, and not one-by-one?

It is indeed, I actually have a patch for persistent maps as well. But
even without that, it would make sense to handle these unmaps a bit
smarter. That requires some iommu work though which I'm not that
interested in right now, could be done on top of this one for someone
motivated enough.
On 10/13/21 1:08 AM, Christoph Hellwig wrote:
> On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
>> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
>> +			nvme_cleanup_cmd(req);
>> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> +		    req_op(req) == REQ_OP_ZONE_APPEND)
>> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
>> +					le64_to_cpu(nvme_req(req)->result.u64));
>> +		req->status = nvme_error_status(nvme_req(req)->status);
>> +		req = req->rq_next;
>> +	}
>> +
>> +	blk_mq_end_request_batch(ib);
>
> I hate all this open coding. All the common logic needs to be in a
> common helper.

I'll see if I can unify this a bit.

> Also please add a for_each macro for the request list
> iteration. I already thought about that for the batch allocation in
> for-next, but with ever more callers this becomes essential.

Added a prep patch with list helpers, current version is using those now.

>> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
>> +		enum nvme_disposition ret;
>> +
>> +		ret = nvme_decide_disposition(req);
>> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
>> +			nvme_pci_complete_rq(req);
>
> This actually is the likely case as only polling ever does the batch
> completion. In doubt I'd prefer if we can avoid these likely/unlikely
> annotations as much as possible.

If you look at the end of the series, IRQ is wired up for it too. But I
do agree with the unlikely, I generally dislike them too. I'll just kill
this one.

>> +		} else {
>> +			req->rq_next = ib->req_list;
>> +			ib->req_list = req;
>> +		}
>
> And all this list manipulation should use proper helper.

Done

>> +	}
>
> Also - can you look into turning this logic into an inline function with
> a callback for the driver? I think in general gcc will avoid the
> indirect call for function pointers passed directly. That way we can
> keep a nice code structure but also avoid the indirections.
>
> Same for nvme_pci_complete_batch.

Not sure I follow. It's hard to do a generic callback for this, as the
batch can live outside the block layer through the plug. That's why
it's passed the way it is in terms of completion hooks.
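The prep patch with the list helpers is not itself quoted in this thread. Purely as an illustration of the kind of helpers being referred to, here is a sketch built around the req->rq_next link used by the original patch; the names mirror the rq_list_peek()/rq_list_next() calls that appear in the revised patches below, and the actual prep patch may look different:

```c
/*
 * Illustrative sketch only: minimal helpers for the singly linked request
 * list that struct io_batch carries via req->rq_next. The real helpers
 * added by the prep patch may differ in naming and detail.
 */
#define rq_list_next(rq)	((rq)->rq_next)

static inline struct request *rq_list_peek(struct request **listptr)
{
	return *listptr;
}

/* Push at the head, replacing the open-coded list manipulation above. */
static inline void rq_list_add(struct request **listptr, struct request *rq)
{
	rq->rq_next = *listptr;
	*listptr = rq;
}

static inline struct request *rq_list_pop(struct request **listptr)
{
	struct request *rq = *listptr;

	if (rq)
		*listptr = rq_list_next(rq);
	return rq;
}
```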
On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
> > Also - can you look into turning this logic into an inline function with
> > a callback for the driver? I think in general gcc will avoid the
> > indirect call for function pointers passed directly. That way we can
> > keep a nice code structure but also avoid the indirections.
> >
> > Same for nvme_pci_complete_batch.
>
> Not sure I follow. It's hard to do a generic callback for this, as the
> batch can live outside the block layer through the plug. That's why
> it's passed the way it is in terms of completion hooks.

Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
with nvme_pci_unmap_rq passed as a function pointer that gets inlined.
On 10/13/21 9:16 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>> Also - can you look into turning this logic into an inline function with
>>> a callback for the driver? I think in general gcc will avoid the
>>> indirect call for function pointers passed directly. That way we can
>>> keep a nice code structure but also avoid the indirections.
>>>
>>> Same for nvme_pci_complete_batch.
>>
>> Not sure I follow. It's hard to do a generic callback for this, as the
>> batch can live outside the block layer through the plug. That's why
>> it's passed the way it is in terms of completion hooks.
>
> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.

Something like this?

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0ac7bad405ef..1aff0ca37063 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b8dbee47fced..e79c0f0268b3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -992,22 +992,7 @@ static void nvme_pci_complete_rq(struct request *req)
 
 static void nvme_pci_complete_batch(struct io_batch *iob)
 {
-	struct request *req;
-
-	req = rq_list_peek(&iob->req_list);
-	while (req) {
-		nvme_pci_unmap_rq(req);
-		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
-			nvme_cleanup_cmd(req);
-		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-		    req_op(req) == REQ_OP_ZONE_APPEND)
-			req->__sector = nvme_lba_to_sect(req->q->queuedata,
-					le64_to_cpu(nvme_req(req)->result.u64));
-		req->status = BLK_STS_OK;
-		req = rq_list_next(req);
-	}
-
-	blk_mq_end_request_batch(iob);
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
 }
 
 /* We read the CQE phase first to check if the rest of the entry is valid */
On 10/13/21 9:42 AM, Jens Axboe wrote:
> On 10/13/21 9:16 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>>> Also - can you look into turning this logic into an inline function with
>>>> a callback for the driver? I think in general gcc will avoid the
>>>> indirect call for function pointers passed directly. That way we can
>>>> keep a nice code structure but also avoid the indirections.
>>>>
>>>> Same for nvme_pci_complete_batch.
>>>
>>> Not sure I follow. It's hard to do a generic callback for this, as the
>>> batch can live outside the block layer through the plug. That's why
>>> it's passed the way it is in terms of completion hooks.
>>
>> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
>> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.
>
> Something like this?

Full patch below for reference, might be easier to read. Got rid of the
prep patch for nvme as well, so this is everything.


commit 002fcc4dd9869cd0fc8684b37ede8e20bdca16a4
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO

    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.

    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.

    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..26328fd26ff0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..b3e686404adf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> Something like this?
Something like that. Although without making the new function inline
this will generate an indirect call.
On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>> Something like this?
>
> Something like that. Although without making the new function inline
> this will generate an indirect call.

It will, but I don't see how we can have it both ways...
On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> > On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> >> Something like this?
> >
> > Something like that. Although without making the new function inline
> > this will generate an indirect call.
>
> It will, but I don't see how we can have it both ways...

Last time I played with these optimization gcc did inline function
pointers passed to __always_inline function into the calling
function. That is you can keep the source level abstraction but get the
code generation as if it was open coded.
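For context, this is the compiler behavior being relied on. A minimal standalone sketch (userspace C, not kernel code, with made-up names) of the pattern: a forced-inline helper that takes a callback, where each call site passes the callback as a compile-time constant so the compiler can typically turn it into a direct call or inline it entirely.

```c
/*
 * Illustration only: a helper forced inline via the GCC/Clang
 * always_inline attribute, taking a function pointer. Because the
 * pointer is a constant at each call site, optimizing builds can
 * usually resolve fn() to a direct call instead of an indirect one.
 */
#include <stdio.h>

static inline __attribute__((always_inline)) void
for_each_val(const int *vals, int n, void (*fn)(int))
{
	int i;

	for (i = 0; i < n; i++)
		fn(vals[i]);	/* fn is a known constant in each caller */
}

static void print_val(int v)
{
	printf("%d\n", v);
}

int main(void)
{
	static const int vals[] = { 1, 2, 3 };

	for_each_val(vals, 3, print_val);
	return 0;
}
```

This is the same shape as putting nvme_complete_batch() in a header as an always-inline helper and handing it nvme_pci_unmap_rq, which is what the next messages end up doing.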
On 10/13/21 10:13 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>> Something like this?
>>>
>>> Something like that. Although without making the new function inline
>>> this will generate an indirect call.
>>
>> It will, but I don't see how we can have it both ways...
>
> Last time I played with these optimization gcc did inline function
> pointers passed to __always_inline function into the calling
> function. That is you can keep the source level abstraction but get the
> code generation as if it was open coded.

Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
work, let me give it a whirl.
On 10/13/21 10:33 AM, Jens Axboe wrote:
> On 10/13/21 10:13 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>>> Something like this?
>>>>
>>>> Something like that. Although without making the new function inline
>>>> this will generate an indirect call.
>>>
>>> It will, but I don't see how we can have it both ways...
>>
>> Last time I played with these optimization gcc did inline function
>> pointers passed to __always_inline function into the calling
>> function. That is you can keep the source level abstraction but get the
>> code generation as if it was open coded.
>
> Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
> work, let me give it a whirl.

Looks like this, and it is indeed not an indirect call. I'll re-test the
series and send out a v2.


commit 70ed26ee000d626105b0d807545af42e6d95a323
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO

    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.

    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.

    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..4b14258a3bac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+	nvme_cleanup_cmd(req);
+	nvme_end_req_zoned(req);
+	req->status = BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..e0c079f704cf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,23 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_batch *iob,
+						void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_complete_batch_req(req);
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..ae253f6f5c80 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4ad63bb9f415..4713da708cd4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
 			rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *ib)
+{
+	struct request *req;
+
+	req = ib->req_list;
+	while (req) {
+		nvme_pci_unmap_rq(req);
+		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+			nvme_cleanup_cmd(req);
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+		    req_op(req) == REQ_OP_ZONE_APPEND)
+			req->__sector = nvme_lba_to_sect(req->q->queuedata,
+					le64_to_cpu(nvme_req(req)->result.u64));
+		req->status = nvme_error_status(nvme_req(req)->status);
+		req = req->rq_next;
+	}
+
+	blk_mq_end_request_batch(ib);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1021,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *ib, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1049,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		enum nvme_disposition ret;
+
+		ret = nvme_decide_disposition(req);
+		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
+			nvme_pci_complete_rq(req);
+		} else {
+			req->rq_next = ib->req_list;
+			ib->req_list = req;
+		}
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1085,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1127,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *ib)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, ib, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1157,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, ib);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1695,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
Take advantage of struct io_batch, if passed in to the nvme poll handler.
If it's set, rather than complete each request individually inline, store
them in the io_batch list. We only do so for requests that will complete
successfully, anything else will be completed inline as before.

Add an mq_ops->complete_batch() handler to do the post-processing of
the io_batch list once polling is complete.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 6 deletions(-)