Message ID: 20231101131611.775299-5-mnissler@rivosinc.com
State: New, archived
Series: Support message-based DMA in vfio-user server
> On Nov 1, 2023, at 9:16 AM, Mattias Nissler <mnissler@rivosinc.com> wrote:
>
> Wire up support for DMA for the case where the vfio-user client does not
> provide mmap()-able file descriptors, but DMA requests must be performed
> via the VFIO-user protocol. This installs an indirect memory region,
> which already works for pci_dma_{read,write}, and pci_dma_map works
> thanks to the existing DMA bounce buffering support.
>
> Note that while simple scenarios work with this patch, there's a known
> race condition in libvfio-user that will mess up the communication
> channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> details as well as a proposed fix.
>
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>

Reviewed-by: Jagannathan Raman <jag.raman@oracle.com>

> ---
>  hw/remote/trace-events    |   2 +
>  hw/remote/vfio-user-obj.c | 100 ++++++++++++++++++++++++++++++++------
>  2 files changed, 87 insertions(+), 15 deletions(-)
>
> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> index 0d1b7d56a5..358a68fb34 100644
> --- a/hw/remote/trace-events
> +++ b/hw/remote/trace-events
> @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
>  vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
>  vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
>  vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
> +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
>  vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
>  vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
>  vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 8b10c32a3c..9f5e385668 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
>      return count;
>  }
>
> +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> +                                unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_read(region->addr + addr, size);
> +
> +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> +        vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *val = ldn_he_p(buf, size);
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> +                                 unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_write(region->addr + addr, size);
> +
> +    stn_he_p(buf, size, val);
> +
> +    g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> +        vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
> +        return MEMTX_ERROR;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps vfu_dma_ops = {
> +    .read_with_attrs = vfu_dma_read,
> +    .write_with_attrs = vfu_dma_write,
> +    .endianness = DEVICE_HOST_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
>  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  {
>      VfuObject *o = vfu_get_private(vfu_ctx);
> @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>      g_autofree char *name = NULL;
>      struct iovec *iov = &info->iova;
>
> -    if (!info->vaddr) {
> -        return;
> -    }
> -
>      name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> -                           (uint64_t)info->vaddr);
> +                           (uint64_t)iov->iov_base);
>
>      subregion = g_new0(MemoryRegion, 1);
>
> -    memory_region_init_ram_ptr(subregion, NULL, name,
> -                               iov->iov_len, info->vaddr);
> +    if (info->vaddr) {
> +        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> +                                   iov->iov_len, info->vaddr);
> +    } else {
> +        /*
> +         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
> +         * bytes at a time, and larger accesses are broken down. However,
> +         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
> +         * handle large DMA accesses just fine, thus this size restriction
> +         * unnecessarily hurts performance, in particular given that each
> +         * access causes a round trip on the VFIO-user socket.
> +         *
> +         * TODO: Investigate how to plumb larger accesses through memory
> +         * regions, possibly by amending MemoryRegionOps or by creating a new
> +         * memory region type.
> +         */
> +        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
> +                              name, iov->iov_len);
> +    }
>
>      dma_as = pci_device_iommu_address_space(o->pci_dev);
>
> @@ -330,20 +400,20 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  {
>      VfuObject *o = vfu_get_private(vfu_ctx);
> +    MemoryRegionSection mr_section;
>      AddressSpace *dma_as = NULL;
> -    MemoryRegion *mr = NULL;
> -    ram_addr_t offset;
>
> -    mr = memory_region_from_host(info->vaddr, &offset);
> -    if (!mr) {
> +    dma_as = pci_device_iommu_address_space(o->pci_dev);
> +
> +    mr_section =
> +        memory_region_find(dma_as->root, (hwaddr)info->iova.iov_base, 1);
> +    if (!mr_section.mr) {
>          return;
>      }
>
> -    dma_as = pci_device_iommu_address_space(o->pci_dev);
> -
> -    memory_region_del_subregion(dma_as->root, mr);
> +    memory_region_del_subregion(dma_as->root, mr_section.mr);
>
> -    object_unparent((OBJECT(mr)));
> +    object_unparent((OBJECT(mr_section.mr)));
>
>      trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
>  }
> --
> 2.34.1
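
For reference, here is how the new path gets exercised from a device model's point of view. A minimal sketch, assuming a hypothetical device that fetches a 32-bit descriptor from guest memory; pci_dma_read() is the existing QEMU API, and with this patch it is served by vfu_dma_read() above whenever the vfio-user client did not supply an mmap()-able region:

    /* Illustrative sketch, not part of the patch. */
    static int fetch_descriptor(PCIDevice *dev, dma_addr_t gpa, uint32_t *desc)
    {
        /* Routed through the indirect memory region installed by dma_register(). */
        if (pci_dma_read(dev, gpa, desc, sizeof(*desc)) != MEMTX_OK) {
            return -1; /* propagates MEMTX_ERROR from vfu_dma_read() */
        }
        *desc = le32_to_cpu(*desc); /* assuming little-endian descriptors */
        return 0;
    }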
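
The performance caveat in the patch comment can be made concrete: QEMU's memory core splits every access through a MemoryRegionOps-backed region into chunks of at most .impl.max_access_size bytes (the splitting lives in access_with_adjusted_size(), which also handles alignment), and with this patch each chunk costs one vfio-user socket round trip. A rough standalone model of the decomposition, not QEMU's actual implementation:

    #include <inttypes.h>
    #include <stdio.h>

    /* Rough model only: a 4 KiB DMA decomposes into 512 8-byte accesses,
     * i.e. 512 round trips on the vfio-user socket. */
    static void split_access(uint64_t addr, uint64_t len)
    {
        while (len > 0) {
            unsigned chunk = len < 8 ? (unsigned)len : 8;
            /* one ops->read_with_attrs()/write_with_attrs() call per chunk */
            printf("chunk at 0x%" PRIx64 ", %u bytes\n", addr, chunk);
            addr += chunk;
            len -= chunk;
        }
    }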
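
Similarly, the pci_dma_map() case mentioned in the commit message works because address_space_map() falls back to a bounce buffer when no direct host pointer is available, which is exactly the situation with an indirect region. A hedged sketch of the usual map/unmap pattern; consume_bytes() is a hypothetical device-side helper:

    /* Illustrative sketch, not part of the patch. */
    static void consume_bytes(const void *buf, dma_addr_t len)
    {
        (void)buf; (void)len; /* hypothetical consumer of the mapped bytes */
    }

    static void process_buffer(PCIDevice *dev, dma_addr_t gpa, dma_addr_t len)
    {
        dma_addr_t mapped_len = len;
        /* With an indirect region, this transparently uses the DMA bounce buffer. */
        void *ptr = pci_dma_map(dev, gpa, &mapped_len, DMA_DIRECTION_TO_DEVICE);
        if (!ptr) {
            return; /* mapping failed, e.g. the bounce buffer is already in use */
        }
        consume_bytes(ptr, mapped_len);
        pci_dma_unmap(dev, ptr, mapped_len, DMA_DIRECTION_TO_DEVICE, 0);
    }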