[RFC,7/7] nvme: Enable swiotlb throttling for NVMe PCI devices

Message ID: 20240822183718.1234-8-mhklinux@outlook.com (mailing list archive)
State: Not Applicable
Series: Introduce swiotlb throttling

Commit Message

Michael Kelley Aug. 22, 2024, 6:37 p.m. UTC
From: Michael Kelley <mhklinux@outlook.com>

In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
because DMA cannot be done to private (encrypted) portions of VM
memory. The bounce buffer memory is marked shared (decrypted) at
boot time, so I/O is done to/from the bounce buffer memory and then
copied by the CPU to/from the final target memory (i.e., "bounced").
Storage devices can be large consumers of bounce buffer memory because
it is possible to have large numbers of I/Os in flight across multiple
devices. Bounce buffer memory must be pre-allocated at boot time, and
it is difficult to know how much memory to allocate to handle peak
storage I/O loads. Consequently, bounce buffer memory is typically
over-provisioned, which wastes memory, and may still not avoid a peak
that exhausts bounce buffer memory and causes storage I/O errors.

For CoCo VMs running with NVMe PCI devices, update the driver to
permit bounce buffer throttling. Gate the throttling behavior
on a DMA layer check indicating that throttling is useful, so that
no change occurs in a non-CoCo VM. If throttling is useful, enable
the BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
into dma_map_bvec() and dma_map_sgtable() calls. With these options in
place, DMA map requests block when necessary, reducing the
likelihood that usage peaks caused by the NVMe driver exhaust
bounce buffer memory and generate errors.
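
In outline, the change boils down to the following pattern (a condensed
sketch, not the complete diff; dma_recommend_may_block() and
DMA_ATTR_MAY_BLOCK are provided by earlier patches in this series):

	/*
	 * At controller allocation: opt in only when the DMA layer
	 * recommends throttling, i.e. when swiotlb must be used as in
	 * a CoCo VM. dev->dma_attrs stays 0 otherwise, so non-CoCo VMs
	 * see no behavior change.
	 */
	if (dma_recommend_may_block(dev->dev)) {
		dev->ctrl.blocking = true;	/* I/O queues get BLK_MQ_F_BLOCKING */
		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
	}

	/* Each DMA mapping call then forwards the stored attributes: */
	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
					dev->dma_attrs);

	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
			     dev->dma_attrs | DMA_ATTR_NO_WARN);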

Signed-off-by: Michael Kelley <mhklinux@outlook.com>
---
 drivers/nvme/host/pci.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

Comments

Petr Tesařík Aug. 23, 2024, 8:26 a.m. UTC | #1
On Thu, 22 Aug 2024 11:37:18 -0700
mhkelley58@gmail.com wrote:

> From: Michael Kelley <mhklinux@outlook.com>
> 
> In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
> because DMA cannot be done to private (encrypted) portions of VM
> memory. The bounce buffer memory is marked shared (decrypted) at
> boot time, so I/O is done to/from the bounce buffer memory and then
> copied by the CPU to/from the final target memory (i.e., "bounced").
> Storage devices can be large consumers of bounce buffer memory because
> it is possible to have large numbers of I/Os in flight across multiple
> devices. Bounce buffer memory must be pre-allocated at boot time, and
> it is difficult to know how much memory to allocate to handle peak
> storage I/O loads. Consequently, bounce buffer memory is typically
> over-provisioned, which wastes memory, and may still not avoid a peak
> that exhausts bounce buffer memory and causes storage I/O errors.
> 
> For CoCo VMs running with NVMe PCI devices, update the driver to
> permit bounce buffer throttling. Gate the throttling behavior
> on a DMA layer check indicating that throttling is useful, so that
> no change occurs in a non-CoCo VM. If throttling is useful, enable
> the BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
> into dma_map_bvec() and dma_map_sgtable() calls. With these options in
> place, DMA map requests block when necessary, reducing the
> likelihood that usage peaks caused by the NVMe driver exhaust
> bounce buffer memory and generate errors.
> 
> Signed-off-by: Michael Kelley <mhklinux@outlook.com>

LGTM.

Reviewed-by: Petr Tesarik <ptesarik@suse.com>

Petr T

> ---
>  drivers/nvme/host/pci.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6cd9395ba9ec..2c39943a87f8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -156,6 +156,7 @@ struct nvme_dev {
>  	dma_addr_t host_mem_descs_dma;
>  	struct nvme_host_mem_buf_desc *host_mem_descs;
>  	void **host_mem_desc_bufs;
> +	unsigned long dma_attrs;
>  	unsigned int nr_allocated_queues;
>  	unsigned int nr_write_queues;
>  	unsigned int nr_poll_queues;
> @@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
>  	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
>  	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
>  
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +					dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +					dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
>  		goto out_free_sg;
>  
>  	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
> -			     DMA_ATTR_NO_WARN);
> +			     dev->dma_attrs | DMA_ATTR_NO_WARN);
>  	if (rc) {
>  		if (rc == -EREMOTEIO)
>  			ret = BLK_STS_TARGET;
> @@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct bio_vec bv = rq_integrity_vec(req);
>  
> -	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> +	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
> +					dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->meta_dma))
>  		return BLK_STS_IOERR;
>  	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
> @@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
>  	 * a single integrity segment for the separate metadata pointer.
>  	 */
>  	dev->ctrl.max_integrity_segments = 1;
> +
> +	if (dma_recommend_may_block(dev->dev)) {
> +		dev->ctrl.blocking = true;
> +		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
> +	}
> +
>  	return dev;
>  
>  out_put_device:

Patch

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6cd9395ba9ec..2c39943a87f8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -156,6 +156,7 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+	unsigned long dma_attrs;
 	unsigned int nr_allocated_queues;
 	unsigned int nr_write_queues;
 	unsigned int nr_poll_queues;
@@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
 	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
 	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
 
-	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->first_dma))
 		return BLK_STS_RESOURCE;
 	iod->dma_len = bv->bv_len;
@@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
-	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->first_dma))
 		return BLK_STS_RESOURCE;
 	iod->dma_len = bv->bv_len;
@@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		goto out_free_sg;
 
 	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
-			     DMA_ATTR_NO_WARN);
+			     dev->dma_attrs | DMA_ATTR_NO_WARN);
 	if (rc) {
 		if (rc == -EREMOTEIO)
 			ret = BLK_STS_TARGET;
@@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct bio_vec bv = rq_integrity_vec(req);
 
-	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->meta_dma))
 		return BLK_STS_IOERR;
 	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
@@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
 	 * a single integrity segment for the separate metadata pointer.
 	 */
 	dev->ctrl.max_integrity_segments = 1;
+
+	if (dma_recommend_may_block(dev->dev)) {
+		dev->ctrl.blocking = true;
+		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
+	}
+
 	return dev;
 
 out_put_device: