
[6/9] nvme: add support for batched completion of polled IO

Message ID: 20211012181742.672391-7-axboe@kernel.dk (mailing list archive)
State: New, archived
Series: Batched completions

Commit Message

Jens Axboe Oct. 12, 2021, 6:17 p.m. UTC
Take advantage of struct io_batch, if passed in to the nvme poll handler.
If it's set, rather than complete each request individually inline, store
them in the io_batch list. We only do so for requests that will complete
successfully, anything else will be completed inline as before.

Add an mq_ops->complete_batch() handler to do the post-processing of
the io_batch list once polling is complete.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 6 deletions(-)
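
For orientation, the completion-side decision this patch adds boils down to
the condensed sketch below (names taken from the patch at the bottom of this
page; the CQE-to-request lookup, tracing and doorbell handling are unchanged
and omitted here):

static void nvme_complete_one_sketch(struct request *req,
				     struct nvme_completion *cqe,
				     struct io_batch *ib)
{
	/* completion is handled elsewhere; nothing more to do here */
	if (nvme_try_complete_req(req, cqe->status, cqe->result))
		return;

	if (!ib || req->end_io || nvme_decide_disposition(req) != COMPLETE) {
		/* not batchable: complete inline, exactly as before */
		nvme_pci_complete_rq(req);
	} else {
		/*
		 * Batchable: push onto the io_batch list.  The list is later
		 * finished in one go by mq_ops->complete_batch(), which ends
		 * in blk_mq_end_request_batch().
		 */
		req->rq_next = ib->req_list;
		ib->req_list = req;
	}
}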

Comments

Christoph Hellwig Oct. 13, 2021, 7:08 a.m. UTC | #1
On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
> Take advantage of struct io_batch, if passed in to the nvme poll handler.
> If it's set, rather than complete each request individually inline, store
> them in the io_batch list. We only do so for requests that will complete
> successfully, anything else will be completed inline as before.
> 
> Add an mq_ops->complete_batch() handler to do the post-processing of
> the io_batch list once polling is complete.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>  
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>  			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>  	if (blk_rq_nr_phys_segments(req))
>  		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>  	nvme_complete_rq(req);
>  }
>  
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);
> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +				req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);

I hate all this open coding.  All the common logic needs to be in a
common helper.  Also please add a for_each macro for the request list
iteration.  I already thought about that for the batch allocation in
for-next, but with ever more callers this becomes essential.
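
For reference, such an iteration macro could be as small as the following
sketch (hypothetical name, assuming the list stays singly linked through
rq_next and is NULL-terminated):

/* hypothetical: walk a singly linked, NULL-terminated request list */
#define rq_list_for_each(listptr, pos) \
	for (pos = *(listptr); pos; pos = (pos)->rq_next)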

> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
> +		enum nvme_disposition ret;
> +
> +		ret = nvme_decide_disposition(req);
> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
> +			nvme_pci_complete_rq(req);

This is actually the likely case, as only polling ever does the batch
completion.  When in doubt, I'd prefer that we avoid these likely/unlikely
annotations as much as possible.

> +		} else {
> +			req->rq_next = ib->req_list;
> +			ib->req_list = req;
> +		}

And all this list manipulation should use a proper helper.
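
Such a helper might look like this sketch (hypothetical; it keeps the O(1)
insert-at-head that the open-coded version has):

/* hypothetical: O(1) push of a request onto the head of a singly linked list */
static inline void rq_list_add(struct request **listptr, struct request *rq)
{
	rq->rq_next = *listptr;
	*listptr = rq;
}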

> +	}

Also - can you look into turning this logic into an inline function with
a callback for the driver?  I think in general gcc will avoid the
indirect call for function pointers passed directly.  That way we can
keep a nice code structure but also avoid the indirections.

Same for nvme_pci_complete_batch.
John Garry Oct. 13, 2021, 9:09 a.m. UTC | #2
On 12/10/2021 19:17, Jens Axboe wrote:
> Signed-off-by: Jens Axboe<axboe@kernel.dk>
> ---
>   drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>   1 file changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>   	return ret;
>   }
>   
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>   {
>   	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>   	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>   			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>   	if (blk_rq_nr_phys_segments(req))
>   		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>   	nvme_complete_rq(req);
>   }
>   
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);

This will do the DMA SG unmap per request. Often this is a performance
bottleneck when we have an IOMMU enabled in strict mode. So since we
complete in batches, could we combine all the SGs in the batch and do one
big DMA SG unmap, instead of one-by-one?

Just a thought.

> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +				req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);
> +}
Jens Axboe Oct. 13, 2021, 3:07 p.m. UTC | #3
On 10/13/21 3:09 AM, John Garry wrote:
> On 12/10/2021 19:17, Jens Axboe wrote:
>> Signed-off-by: Jens Axboe<axboe@kernel.dk>
>> ---
>>   drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>>   1 file changed, 63 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4ad63bb9f415..4713da708cd4 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>   	return ret;
>>   }
>>   
>> -static void nvme_pci_complete_rq(struct request *req)
>> +static void nvme_pci_unmap_rq(struct request *req)
>>   {
>>   	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>   	struct nvme_dev *dev = iod->nvmeq->dev;
>> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>>   			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>>   	if (blk_rq_nr_phys_segments(req))
>>   		nvme_unmap_data(dev, req);
>> +}
>> +
>> +static void nvme_pci_complete_rq(struct request *req)
>> +{
>> +	nvme_pci_unmap_rq(req);
>>   	nvme_complete_rq(req);
>>   }
>>   
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
> 
> This will do the DMA SG unmap per request. Often this is a performance
> bottleneck when we have an IOMMU enabled in strict mode. So since we
> complete in batches, could we combine all the SGs in the batch and do one
> big DMA SG unmap, instead of one-by-one?

It is indeed; I actually have a patch for persistent maps as well. But even
without that, it would make sense to handle these unmaps a bit smarter. That
requires some iommu work, though, which I'm not that interested in right now;
it could be done on top of this one by someone motivated enough.
Jens Axboe Oct. 13, 2021, 3:10 p.m. UTC | #4
On 10/13/21 1:08 AM, Christoph Hellwig wrote:
> On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
>> Take advantage of struct io_batch, if passed in to the nvme poll handler.
>> If it's set, rather than complete each request individually inline, store
>> them in the io_batch list. We only do so for requests that will complete
>> successfully, anything else will be completed inline as before.
>>
>> Add an mq_ops->complete_batch() handler to do the post-processing of
>> the io_batch list once polling is complete.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 63 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4ad63bb9f415..4713da708cd4 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  	return ret;
>>  }
>>  
>> -static void nvme_pci_complete_rq(struct request *req)
>> +static void nvme_pci_unmap_rq(struct request *req)
>>  {
>>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>  	struct nvme_dev *dev = iod->nvmeq->dev;
>> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>>  			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>>  	if (blk_rq_nr_phys_segments(req))
>>  		nvme_unmap_data(dev, req);
>> +}
>> +
>> +static void nvme_pci_complete_rq(struct request *req)
>> +{
>> +	nvme_pci_unmap_rq(req);
>>  	nvme_complete_rq(req);
>>  }
>>  
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
>> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
>> +			nvme_cleanup_cmd(req);
>> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> +				req_op(req) == REQ_OP_ZONE_APPEND)
>> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
>> +					le64_to_cpu(nvme_req(req)->result.u64));
>> +		req->status = nvme_error_status(nvme_req(req)->status);
>> +		req = req->rq_next;
>> +	}
>> +
>> +	blk_mq_end_request_batch(ib);
> 
> I hate all this open coding.  All the common logic needs to be in a
> common helper.

I'll see if I can unify this a bit.

> Also please add a for_each macro for the request list
> iteration.  I already thought about that for the batch allocation in
> for-next, but with ever more callers this becomes essential.

Added a prep patch with list helpers; the current version is using those
now.
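
(The later versions in this thread use rq_list_peek(), rq_list_next() and
rq_list_add_tail(); roughly, such helpers could look like the sketch below,
assuming the same rq_next linkage.  This is only an illustration, not the
actual prep patch, and a real tail-add would presumably track a tail pointer
to stay O(1).)

/* illustration only: singly linked request-list accessors */
static inline struct request *rq_list_peek(struct request **listptr)
{
	return *listptr;
}

static inline struct request *rq_list_next(struct request *rq)
{
	return rq->rq_next;
}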

>> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
>> +		enum nvme_disposition ret;
>> +
>> +		ret = nvme_decide_disposition(req);
>> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
>> +			nvme_pci_complete_rq(req);
> 
> This is actually the likely case, as only polling ever does the batch
> completion.  When in doubt, I'd prefer that we avoid these likely/unlikely
> annotations as much as possible.

If you look at the end of the series, the IRQ path is wired up for it too.
But I do agree about the unlikely; I generally dislike them too. I'll just
kill this one.

>> +		} else {
>> +			req->rq_next = ib->req_list;
>> +			ib->req_list = req;
>> +		}
> 
> And all this list manipulation should use a proper helper.

Done

>> +	}
> 
> Also - can you look into turning this logic into an inline function with
> a callback for the driver?  I think in general gcc will avoid the
> indirect call for function pointers passed directly.  That way we can
> keep a nice code structure but also avoid the indirections.
> 
> Same for nvme_pci_complete_batch.

Not sure I follow. It's hard to do a generic callback for this, as the
batch can live outside the block layer through the plug. That's why
it's passed the way it is in terms of completion hooks.
Christoph Hellwig Oct. 13, 2021, 3:16 p.m. UTC | #5
On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
> > Also - can you look into turning this logic into an inline function with
> > a callback for the driver?  I think in general gcc will avoid the
> > indirect call for function pointers passed directly.  That way we can
> > keep a nice code structure but also avoid the indirections.
> > 
> > Same for nvme_pci_complete_batch.
> 
> Not sure I follow. It's hard to do a generic callback for this, as the
> batch can live outside the block layer through the plug. That's why
> it's passed the way it is in terms of completion hooks.

Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
with nvme_pci_unmap_rq passed as a function pointer that gets inlined.
Jens Axboe Oct. 13, 2021, 3:42 p.m. UTC | #6
On 10/13/21 9:16 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>> Also - can you look into turning this logic into an inline function with
>>> a callback for the driver?  I think in general gcc will avoid the
>>> indirect call for function pointers passed directly.  That way we can
>>> keep a nice code structure but also avoid the indirections.
>>>
>>> Same for nvme_pci_complete_batch.
>>
>> Not sure I follow. It's hard to do a generic callback for this, as the
>> batch can live outside the block layer through the plug. That's why
>> it's passed the way it is in terms of completion hooks.
> 
> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.

Something like this?


diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0ac7bad405ef..1aff0ca37063 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b8dbee47fced..e79c0f0268b3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -992,22 +992,7 @@ static void nvme_pci_complete_rq(struct request *req)
 
 static void nvme_pci_complete_batch(struct io_batch *iob)
 {
-	struct request *req;
-
-	req = rq_list_peek(&iob->req_list);
-	while (req) {
-		nvme_pci_unmap_rq(req);
-		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
-			nvme_cleanup_cmd(req);
-		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-				req_op(req) == REQ_OP_ZONE_APPEND)
-			req->__sector = nvme_lba_to_sect(req->q->queuedata,
-					le64_to_cpu(nvme_req(req)->result.u64));
-		req->status = BLK_STS_OK;
-		req = rq_list_next(req);
-	}
-
-	blk_mq_end_request_batch(iob);
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
 }
 
 /* We read the CQE phase first to check if the rest of the entry is valid */
Jens Axboe Oct. 13, 2021, 3:49 p.m. UTC | #7
On 10/13/21 9:42 AM, Jens Axboe wrote:
> On 10/13/21 9:16 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>>> Also - can you look into turning this logic into an inline function with
>>>> a callback for the driver?  I think in general gcc will avoid the
>>>> indirect call for function pointers passed directly.  That way we can
>>>> keep a nice code structure but also avoid the indirections.
>>>>
>>>> Same for nvme_pci_complete_batch.
>>>
>>> Not sure I follow. It's hard to do a generic callback for this, as the
>>> batch can live outside the block layer through the plug. That's why
>>> it's passed the way it is in terms of completion hooks.
>>
>> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
>> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.
> 
> Something like this?

Full patch below for reference; it might be easier to read. I got rid of
the prep patch for nvme as well, so this is everything.


commit 002fcc4dd9869cd0fc8684b37ede8e20bdca16a4
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO
    
    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.
    
    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..26328fd26ff0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..b3e686404adf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
Christoph Hellwig Oct. 13, 2021, 3:50 p.m. UTC | #8
On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> Something like this?

Something like that.  Although without making the new function inline
this will generate an indirect call.
Jens Axboe Oct. 13, 2021, 4:04 p.m. UTC | #9
On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>> Something like this?
> 
> Something like that.  Although without making the new function inline
> this will generate an indirect call.

It will, but I don't see how we can have it both ways...
Christoph Hellwig Oct. 13, 2021, 4:13 p.m. UTC | #10
On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> > On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> >> Something like this?
> > 
> > Something like that.  Although without making the new function inline
> > this will generate an indirect call.
> 
> It will, but I don't see how we can have it both ways...

Last time I played with these optimizations, gcc did inline function
pointers passed to an __always_inline function into the calling
function.  That is, you can keep the source-level abstraction but get the
code generation as if it were open coded.
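
As a generic illustration of that pattern (not nvme code): when the callback
is passed directly into an __always_inline helper, the pointer is a
compile-time constant at each call site, so gcc can usually turn it back into
a direct, often inlined, call.

/* illustration only; __always_inline is the kernel's forced-inline attribute */
static __always_inline void apply_to_all(int *vals, int n, void (*fn)(int *))
{
	int i;

	for (i = 0; i < n; i++)
		fn(&vals[i]);	/* fn is known per call site */
}

static void double_val(int *v)
{
	*v *= 2;
}

static void double_all(int *vals, int n)
{
	/* typically compiled as a direct (or fully inlined) call to double_val() */
	apply_to_all(vals, n, double_val);
}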
Jens Axboe Oct. 13, 2021, 4:33 p.m. UTC | #11
On 10/13/21 10:13 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>> Something like this?
>>>
>>> Something like that.  Although without making the new function inline
>>> this will generate an indirect call.
>>
>> It will, but I don't see how we can have it both ways...
> 
> Last time I played with these optimizations, gcc did inline function
> pointers passed to an __always_inline function into the calling
> function.  That is, you can keep the source-level abstraction but get the
> code generation as if it were open coded.

Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
work; let me give it a whirl.
Jens Axboe Oct. 13, 2021, 4:45 p.m. UTC | #12
On 10/13/21 10:33 AM, Jens Axboe wrote:
> On 10/13/21 10:13 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>>> Something like this?
>>>>
>>>> Something like that.  Although without making the new function inline
>>>> this will generate an indirect call.
>>>
>>> It will, but I don't see how we can have it both ways...
>>
>> Last time I played with these optimizations, gcc did inline function
>> pointers passed to an __always_inline function into the calling
>> function.  That is, you can keep the source-level abstraction but get the
>> code generation as if it were open coded.
> 
> Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
> work; let me give it a whirl.

Looks like this, and it is indeed not an indirect call. I'll re-test the
series and send out a v2.


commit 70ed26ee000d626105b0d807545af42e6d95a323
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO
    
    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.
    
    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..4b14258a3bac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+	nvme_cleanup_cmd(req);
+	nvme_end_req_zoned(req);
+	req->status = BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..e0c079f704cf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,23 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_batch *iob,
+						void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_complete_batch_req(req);
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..ae253f6f5c80 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4ad63bb9f415..4713da708cd4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@  static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,34 @@  static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *ib)
+{
+	struct request *req;
+
+	req = ib->req_list;
+	while (req) {
+		nvme_pci_unmap_rq(req);
+		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+			nvme_cleanup_cmd(req);
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+				req_op(req) == REQ_OP_ZONE_APPEND)
+			req->__sector = nvme_lba_to_sect(req->q->queuedata,
+					le64_to_cpu(nvme_req(req)->result.u64));
+		req->status = nvme_error_status(nvme_req(req)->status);
+		req = req->rq_next;
+	}
+
+	blk_mq_end_request_batch(ib);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1021,8 @@  static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *ib, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1049,17 @@  static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		enum nvme_disposition ret;
+
+		ret = nvme_decide_disposition(req);
+		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
+			nvme_pci_complete_rq(req);
+		} else {
+			req->rq_next = ib->req_list;
+			ib->req_list = req;
+		}
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1085,7 @@  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1127,27 @@  static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *ib)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, ib, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1157,7 @@  static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, ib);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1695,7 @@  static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,