Message ID | 20240822183718.1234-8-mhklinux@outlook.com (mailing list archive) |
---|---|
State | Not Applicable |
Series | Introduce swiotlb throttling |
On Thu, 22 Aug 2024 11:37:18 -0700, mhkelley58@gmail.com wrote:

> From: Michael Kelley <mhklinux@outlook.com>
>
> In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
> because DMA cannot be done to private (encrypted) portions of VM
> memory. The bounce buffer memory is marked shared (decrypted) at
> boot time, so I/O is done to/from the bounce buffer memory and then
> copied by the CPU to/from the final target memory (i.e., "bounced").
> Storage devices can be large consumers of bounce buffer memory because
> it is possible to have large numbers of I/Os in flight across multiple
> devices. Bounce buffer memory must be pre-allocated at boot time, and
> it is difficult to know how much memory to allocate to handle peak
> storage I/O loads. Consequently, bounce buffer memory is typically
> over-provisioned, which wastes memory, and may still not avoid a peak
> that exhausts bounce buffer memory and causes storage I/O errors.
>
> For CoCo VMs running with NVMe PCI devices, update the driver to
> permit bounce buffer throttling. Gate the throttling behavior
> on a DMA layer check indicating that throttling is useful, so that
> no change occurs in a non-CoCo VM. If throttling is useful, enable
> the BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
> into dma_map_bvec() and dma_map_sgtable() calls. With these options in
> place, DMA map requests are pended when necessary to reduce the
> likelihood of usage peaks caused by the NVMe driver that could exhaust
> bounce buffer memory and generate errors.
>
> Signed-off-by: Michael Kelley <mhklinux@outlook.com>

LGTM.

Reviewed-by: Petr Tesarik <ptesarik@suse.com>

Petr T

> ---
>  drivers/nvme/host/pci.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6cd9395ba9ec..2c39943a87f8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -156,6 +156,7 @@ struct nvme_dev {
>  	dma_addr_t host_mem_descs_dma;
>  	struct nvme_host_mem_buf_desc *host_mem_descs;
>  	void **host_mem_desc_bufs;
> +	unsigned long dma_attrs;
>  	unsigned int nr_allocated_queues;
>  	unsigned int nr_write_queues;
>  	unsigned int nr_poll_queues;
> @@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
>  	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
>  	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
>
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +				      dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +				      dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
>  		goto out_free_sg;
>
>  	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
> -			     DMA_ATTR_NO_WARN);
> +			     dev->dma_attrs | DMA_ATTR_NO_WARN);
>  	if (rc) {
>  		if (rc == -EREMOTEIO)
>  			ret = BLK_STS_TARGET;
> @@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct bio_vec bv = rq_integrity_vec(req);
>
> -	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> +	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
> +				     dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->meta_dma))
>  		return BLK_STS_IOERR;
>  	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
> @@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
>  	 * a single integrity segment for the separate metadata pointer.
>  	 */
>  	dev->ctrl.max_integrity_segments = 1;
> +
> +	if (dma_recommend_may_block(dev->dev)) {
> +		dev->ctrl.blocking = true;
> +		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
> +	}
> +
>  	return dev;
>
>  out_put_device:
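[Editor's note] The pattern the patch follows can be sketched in driver-agnostic form. The sketch below is illustrative only: the `mydrv_*` names are hypothetical, and dma_recommend_may_block() and DMA_ATTR_MAY_BLOCK come from earlier patches in this series rather than being existing mainline APIs.

```c
/*
 * Sketch of the gating pattern from this patch, transplanted into a
 * hypothetical driver ("mydrv" is made up for illustration).
 * dma_recommend_may_block() and DMA_ATTR_MAY_BLOCK are introduced
 * earlier in this series; they do not exist in mainline kernels.
 */
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

struct mydrv_dev {
	struct device *dev;
	unsigned long dma_attrs;	/* 0, or DMA_ATTR_MAY_BLOCK */
};

/* Probe time: opt in to throttling only when the DMA layer recommends it. */
static void mydrv_init_dma_attrs(struct mydrv_dev *md)
{
	/*
	 * In a non-CoCo VM this leaves dma_attrs at 0, so the mapping
	 * calls below behave exactly as they did before the change.
	 */
	if (dma_recommend_may_block(md->dev))
		md->dma_attrs = DMA_ATTR_MAY_BLOCK;
}

/* I/O path: pass the saved attrs into every DMA mapping call. */
static int mydrv_map_sg(struct mydrv_dev *md, struct scatterlist *sgl,
			int nents, enum dma_data_direction dir)
{
	/*
	 * With DMA_ATTR_MAY_BLOCK set, the mapping may sleep while
	 * swiotlb throttles, so callers must be in process context --
	 * which is why the NVMe patch also arranges for
	 * BLK_MQ_F_BLOCKING (via dev->ctrl.blocking).
	 */
	return dma_map_sg_attrs(md->dev, sgl, nents, dir, md->dma_attrs);
}
```

Since DMA attributes are a bitmask, the saved value composes with other attributes, which is why the nvme_map_data() hunk can simply OR dev->dma_attrs with DMA_ATTR_NO_WARN.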