diff mbox series

[RFC,v2,13/16] vfio-user: dma map/unmap operations

Message ID 8960370c1ce02089099702878ed6fb6c3d42ad06.1629131628.git.elena.ufimtseva@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user implementation | expand

Commit Message

Elena Ufimtseva Aug. 16, 2021, 4:42 p.m. UTC
From: John Johnson <john.g.johnson@oracle.com>

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
---
 hw/vfio/user-protocol.h       |  32 +++++++++
 hw/vfio/user.h                |   6 ++
 include/hw/vfio/vfio-common.h |   1 +
 hw/vfio/common.c              |  71 ++++++++++++++++---
 hw/vfio/user.c                | 124 ++++++++++++++++++++++++++++++++++
 5 files changed, 226 insertions(+), 8 deletions(-)

Comments

Stefan Hajnoczi Sept. 8, 2021, 9:16 a.m. UTC | #1
On Mon, Aug 16, 2021 at 09:42:46AM -0700, Elena Ufimtseva wrote:
> +void vfio_user_drain_reqs(VFIOProxy *proxy)
> +{
> +    VFIOUserReply *reply;
> +    bool iolock = 0;
> +
> +    /*
> +     * Any DMA map/unmap requests sent in the middle
> +     * of a memory region transaction were sent async.
> +     * Wait for them here.
> +     */
> +    QEMU_LOCK_GUARD(&proxy->lock);
> +    if (proxy->last_nowait != NULL) {
> +        iolock = qemu_mutex_iothread_locked();
> +        if (iolock) {
> +            qemu_mutex_unlock_iothread();
> +        }
> +
> +        reply = proxy->last_nowait;
> +        reply->nowait = 0;
> +        while (reply->complete == 0) {
> +            if (!qemu_cond_timedwait(&reply->cv, &proxy->lock, wait_time)) {
> +                error_printf("vfio_drain_reqs - timed out\n");
> +                break;
> +            }
> +        }
> +
> +        if (reply->msg->flags & VFIO_USER_ERROR) {
> +            error_printf("vfio_user_rcv error reply on async request ");
> +            error_printf("command %x error %s\n", reply->msg->command,
> +                         strerror(reply->msg->error_reply));
> +        }
> +        proxy->last_nowait = NULL;
> +        g_free(reply->msg);
> +        QTAILQ_INSERT_HEAD(&proxy->free, reply, next);
> +    }
> +
> +    if (iolock) {
> +        qemu_mutex_lock_iothread();
> +    }

Not sure this lock ordering is correct. Above we acquire proxy->lock
while holding the BQL and here we acquire the BQL while holding
proxy->lock. If another thread (e.g. a vCPU thread) does something
similar this is the ABBA lock ordering problem.

The more obviously correct way to write this is:

  WITH_QEMU_LOCK_GUARD(&proxy->lock) {
      ...
  }

  if (iolock) {
      qemu_mutex_lock_iothread();
  }

> +}
> +
>  static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
>                                    uint32_t size, uint32_t flags)
>  {
> @@ -715,6 +756,89 @@ int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp)
>      return 0;
>  }
>  
> +int vfio_user_dma_map(VFIOProxy *proxy, struct vfio_iommu_type1_dma_map *map,
> +                      VFIOUserFDs *fds, bool will_commit)
> +{
> +    VFIOUserDMAMap *msgp = g_malloc(sizeof(*msgp));

Is this zero-initialized anywhere to guarantee that QEMU memory isn't
exposed to the VFIO device emulation program?

> +    int ret, flags;
> +
> +    /* commit will wait, so send async without dropping BQL */
> +    flags = will_commit ? (NOIOLOCK | NOWAIT) : 0;

Why is this distinction between will_commit and !will_commit necessary?
I get a sense that the network communications code drops the BQL and
that's undesirable here for some reason. I wonder why the code doesn't
take the NOWAIT code path all the time?
diff mbox series

Patch

diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h
index 5614efa0a4..ca53fce5f4 100644
--- a/hw/vfio/user-protocol.h
+++ b/hw/vfio/user-protocol.h
@@ -82,6 +82,31 @@  typedef struct {
 #define VFIO_USER_MAX_MAX_XFER  (64 * 1024 * 1024)
 
 
+/*
+ * VFIO_USER_DMA_MAP
+ * imported from struct vfio_iommu_type1_dma_map
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint32_t argsz;
+    uint32_t flags;
+    uint64_t offset;    /* FD offset */
+    uint64_t iova;
+    uint64_t size;
+} VFIOUserDMAMap;
+
+/*
+ * VFIO_USER_DMA_UNMAP
+ * imported from struct vfio_iommu_type1_dma_unmap
+ */
+typedef struct {
+    VFIOUserHdr hdr;
+    uint32_t argsz;
+    uint32_t flags;
+    uint64_t iova;
+    uint64_t size;
+} VFIOUserDMAUnmap;
+
 /*
  * VFIO_USER_DEVICE_GET_INFO
  * imported from struct_device_info
@@ -146,4 +171,11 @@  typedef struct {
     char data[];
 } VFIOUserRegionRW;
 
+/*imported from struct vfio_bitmap */
+typedef struct {
+    uint64_t pgsize;
+    uint64_t size;
+    char data[];
+} VFIOUserBitmap;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index 248ad80943..7786ab57c5 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -71,6 +71,11 @@  void vfio_user_set_reqhandler(VFIODevice *vbasdev,
                                              void *reqarg);
 void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
 int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp);
+int vfio_user_dma_map(VFIOProxy *proxy, struct vfio_iommu_type1_dma_map *map,
+                      VFIOUserFDs *fds, bool will_commit);
+int vfio_user_dma_unmap(VFIOProxy *proxy,
+                        struct vfio_iommu_type1_dma_unmap *unmap,
+                        struct vfio_bitmap *bitmap, bool will_commit);
 int vfio_user_get_info(VFIODevice *vbasedev);
 int vfio_user_get_region_info(VFIODevice *vbasedev, int index,
                               struct vfio_region_info *info, VFIOUserFDs *fds);
@@ -80,5 +85,6 @@  int vfio_user_region_read(VFIODevice *vbasedev, uint32_t index, uint64_t offset,
                           uint32_t count, void *data);
 int vfio_user_region_write(VFIODevice *vbasedev, uint32_t index,
                            uint64_t offset, uint32_t count, void *data);
+void vfio_user_drain_reqs(VFIOProxy *proxy);
 
 #endif /* VFIO_USER_H */
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 688660c28d..13d1d14c3b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -87,6 +87,7 @@  typedef struct VFIOContainer {
     Error *error;
     bool initialized;
     bool dirty_pages_supported;
+    bool will_commit;
     uint64_t dirty_pgsizes;
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 57b9e111e6..a532e52bcf 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -427,6 +427,7 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
     struct vfio_iommu_type1_dma_unmap *unmap;
     struct vfio_bitmap *bitmap;
     uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
+    bool will_commit = container->will_commit;
     int ret;
 
     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
@@ -460,7 +461,11 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
         goto unmap_exit;
     }
 
-    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+    if (container->proxy != NULL) {
+        ret = vfio_user_dma_unmap(container->proxy, unmap, bitmap, will_commit);
+    } else {
+        ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+    }
     if (!ret) {
         cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
                 iotlb->translated_addr, pages);
@@ -487,12 +492,17 @@  static int vfio_dma_unmap(VFIOContainer *container,
         .iova = iova,
         .size = size,
     };
+    bool will_commit = container->will_commit;
 
     if (iotlb && container->dirty_pages_supported &&
         vfio_devices_all_running_and_saving(container)) {
         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
     }
 
+    if (container->proxy != NULL) {
+        return vfio_user_dma_unmap(container->proxy, &unmap, NULL, will_commit);
+    }
+
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -519,7 +529,7 @@  static int vfio_dma_unmap(VFIOContainer *container,
     return 0;
 }
 
-static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+static int vfio_dma_map(VFIOContainer *container, MemoryRegion *mr, hwaddr iova,
                         ram_addr_t size, void *vaddr, bool readonly)
 {
     struct vfio_iommu_type1_dma_map map = {
@@ -529,11 +539,30 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
+    bool will_commit = container->will_commit;
 
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
     }
 
+    if (container->proxy != NULL) {
+        VFIOUserFDs fds;
+        int fd;
+
+        fd = memory_region_get_fd(mr);
+        if (fd != -1 && !(container->proxy->flags & VFIO_PROXY_SECURE)) {
+            fds.send_fds = 1;
+            fds.recv_fds = 0;
+            fds.fds = &fd;
+            map.vaddr = qemu_ram_block_host_offset(mr->ram_block, vaddr);
+
+            return vfio_user_dma_map(container->proxy, &map, &fds, will_commit);
+        } else {
+            map.vaddr = 0;
+            return vfio_user_dma_map(container->proxy, &map, NULL, will_commit);
+        }
+    }
+
     /*
      * Try the mapping, if it fails with EBUSY, unmap the region and try
      * again.  This shouldn't be necessary, but we sometimes see it in
@@ -602,7 +631,8 @@  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 
 /* Called with rcu_read_lock held.  */
 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
-                               ram_addr_t *ram_addr, bool *read_only)
+                               ram_addr_t *ram_addr, bool *read_only,
+                               MemoryRegion **mrp)
 {
     MemoryRegion *mr;
     hwaddr xlat;
@@ -683,6 +713,10 @@  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
         *read_only = !writable || mr->readonly;
     }
 
+    if (mrp != NULL) {
+        *mrp = mr;
+    }
+
     return true;
 }
 
@@ -690,6 +724,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 {
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
+    MemoryRegion *mr;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
     void *vaddr;
     int ret;
@@ -708,7 +743,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
         bool read_only;
 
-        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
+        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mr)) {
             goto out;
         }
         /*
@@ -718,7 +753,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
          * of vaddr will always be there, even if the memory object is
          * destroyed and its backing memory munmap-ed.
          */
-        ret = vfio_dma_map(container, iova,
+        ret = vfio_dma_map(container, mr, iova,
                            iotlb->addr_mask + 1, vaddr,
                            read_only);
         if (ret) {
@@ -780,7 +815,7 @@  static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                section->offset_within_address_space;
         vaddr = memory_region_get_ram_ptr(section->mr) + start;
 
-        ret = vfio_dma_map(vrdl->container, iova, next - start,
+        ret = vfio_dma_map(vrdl->container, section->mr, iova, next - start,
                            vaddr, section->readonly);
         if (ret) {
             /* Rollback */
@@ -888,6 +923,24 @@  static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
     g_free(vrdl);
 }
 
+static void vfio_listener_begin(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    container->will_commit = 1;
+}
+
+static void vfio_listener_commit(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    /* wait for any async requests sent during the transaction */
+    if (container->proxy != NULL) {
+        vfio_user_drain_reqs(container->proxy);
+    }
+    container->will_commit = 0;
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -1080,7 +1133,7 @@  static void vfio_listener_region_add(MemoryListener *listener,
         }
     }
 
-    ret = vfio_dma_map(container, iova, int128_get64(llsize),
+    ret = vfio_dma_map(container, section->mr, iova, int128_get64(llsize),
                        vaddr, section->readonly);
     if (ret) {
         error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
@@ -1346,7 +1399,7 @@  static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     }
 
     rcu_read_lock();
-    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
+    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL)) {
         int ret;
 
         ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
@@ -1463,6 +1516,8 @@  static void vfio_listener_log_sync(MemoryListener *listener,
 }
 
 static const MemoryListener vfio_memory_listener = {
+    .begin = vfio_listener_begin,
+    .commit = vfio_listener_commit,
     .region_add = vfio_listener_region_add,
     .region_del = vfio_listener_region_del,
     .log_global_start = vfio_listener_log_global_start,
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index b68ca1279d..06bcd46e60 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -408,6 +408,47 @@  static void vfio_user_send_recv(VFIOProxy *proxy, VFIOUserHdr *msg,
     }
 }
 
+void vfio_user_drain_reqs(VFIOProxy *proxy)
+{
+    VFIOUserReply *reply;
+    bool iolock = 0;
+
+    /*
+     * Any DMA map/unmap requests sent in the middle
+     * of a memory region transaction were sent async.
+     * Wait for them here.
+     */
+    QEMU_LOCK_GUARD(&proxy->lock);
+    if (proxy->last_nowait != NULL) {
+        iolock = qemu_mutex_iothread_locked();
+        if (iolock) {
+            qemu_mutex_unlock_iothread();
+        }
+
+        reply = proxy->last_nowait;
+        reply->nowait = 0;
+        while (reply->complete == 0) {
+            if (!qemu_cond_timedwait(&reply->cv, &proxy->lock, wait_time)) {
+                error_printf("vfio_drain_reqs - timed out\n");
+                break;
+            }
+        }
+
+        if (reply->msg->flags & VFIO_USER_ERROR) {
+            error_printf("vfio_user_rcv error reply on async request ");
+            error_printf("command %x error %s\n", reply->msg->command,
+                         strerror(reply->msg->error_reply));
+        }
+        proxy->last_nowait = NULL;
+        g_free(reply->msg);
+        QTAILQ_INSERT_HEAD(&proxy->free, reply, next);
+    }
+
+    if (iolock) {
+        qemu_mutex_lock_iothread();
+    }
+}
+
 static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
                                   uint32_t size, uint32_t flags)
 {
@@ -715,6 +756,89 @@  int vfio_user_validate_version(VFIODevice *vbasedev, Error **errp)
     return 0;
 }
 
+int vfio_user_dma_map(VFIOProxy *proxy, struct vfio_iommu_type1_dma_map *map,
+                      VFIOUserFDs *fds, bool will_commit)
+{
+    VFIOUserDMAMap *msgp = g_malloc(sizeof(*msgp));
+    int ret, flags;
+
+    /* commit will wait, so send async without dropping BQL */
+    flags = will_commit ? (NOIOLOCK | NOWAIT) : 0;
+
+    vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_MAP, sizeof(*msgp), 0);
+    msgp->argsz = map->argsz;
+    msgp->flags = map->flags;
+    msgp->offset = map->vaddr;
+    msgp->iova = map->iova;
+    msgp->size = map->size;
+
+    vfio_user_send_recv(proxy, &msgp->hdr, fds, 0, flags);
+    ret = (msgp->hdr.flags & VFIO_USER_ERROR) ? -msgp->hdr.error_reply : 0;
+
+    if (!(flags & NOWAIT)) {
+        g_free(msgp);
+    }
+    return ret;
+}
+
+int vfio_user_dma_unmap(VFIOProxy *proxy,
+                        struct vfio_iommu_type1_dma_unmap *unmap,
+                        struct vfio_bitmap *bitmap, bool will_commit)
+{
+    struct {
+        VFIOUserDMAUnmap msg;
+        VFIOUserBitmap bitmap;
+    } *msgp = NULL;
+    int msize, rsize, flags;
+
+    if (bitmap == NULL && (unmap->flags &
+                           VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)) {
+        error_printf("vfio_user_dma_unmap mismatched flags and bitmap\n");
+        return -EINVAL;
+    }
+
+    /* can't drop BQL until commit */
+    flags = will_commit ? NOIOLOCK : 0;
+
+    /*
+     * If a dirty bitmap is returned, allocate extra space for it
+     * otherwise, just send the unmap request
+     */
+    if (bitmap != NULL) {
+        msize = sizeof(*msgp);
+        rsize = msize + bitmap->size;
+        msgp = g_malloc0(rsize);
+        msgp->bitmap.pgsize = bitmap->pgsize;
+        msgp->bitmap.size = bitmap->size;
+    } else {
+        /* can only send async if no bitmap returned */
+        flags |= will_commit ? NOWAIT : 0;
+        msize = rsize = sizeof(VFIOUserDMAUnmap);
+        msgp = g_malloc0(rsize);
+    }
+
+    vfio_user_request_msg(&msgp->msg.hdr, VFIO_USER_DMA_UNMAP, msize, flags);
+    msgp->msg.argsz = unmap->argsz;
+    msgp->msg.flags = unmap->flags;
+    msgp->msg.iova = unmap->iova;
+    msgp->msg.size = unmap->size;
+
+    vfio_user_send_recv(proxy, &msgp->msg.hdr, NULL, rsize, flags);
+    if (msgp->msg.hdr.flags & VFIO_USER_ERROR) {
+        g_free(msgp);
+        return -msgp->msg.hdr.error_reply;
+    }
+
+    if (bitmap != NULL) {
+        memcpy(bitmap->data, &msgp->bitmap.data, bitmap->size);
+    }
+    if (!(flags & NOWAIT)) {
+        g_free(msgp);
+    }
+
+    return 0;
+}
+
 int vfio_user_get_info(VFIODevice *vbasedev)
 {
     VFIOUserDeviceInfo msg;