diff mbox series

[v7,4/7] libvhost-user: Support tracking inflight I/O in shared memory

Message ID 20190228085355.9614-5-xieyongji@baidu.com (mailing list archive)
State New, archived
Headers show
Series vhost-user-blk: Add support for backend reconnecting | expand

Commit Message

Yongji Xie Feb. 28, 2019, 8:53 a.m. UTC
From: Xie Yongji <xieyongji@baidu.com>

This patch adds support for the VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD messages to set/get a shared buffer
to/from QEMU. The backend can then track inflight I/O in this buffer.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
---
 Makefile                              |   2 +-
 contrib/libvhost-user/libvhost-user.c | 349 ++++++++++++++++++++++++--
 contrib/libvhost-user/libvhost-user.h |  70 ++++++
 3 files changed, 400 insertions(+), 21 deletions(-)

Comments

Marc-André Lureau Nov. 16, 2019, 5:42 p.m. UTC | #1
On Wed, Mar 13, 2019 at 6:59 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> From: Xie Yongji <xieyongji@baidu.com>
>
> This patch adds support for the VHOST_USER_GET_INFLIGHT_FD and
> VHOST_USER_SET_INFLIGHT_FD messages to set/get a shared buffer
> to/from QEMU. The backend can then track inflight I/O in this buffer.
>
> Signed-off-by: Xie Yongji <xieyongji@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> Message-Id: <20190228085355.9614-5-xieyongji@baidu.com>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  Makefile                              |   2 +-
>  contrib/libvhost-user/libvhost-user.h |  70 ++++++
>  contrib/libvhost-user/libvhost-user.c | 349 ++++++++++++++++++++++++--
>  3 files changed, 400 insertions(+), 21 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 6ccb8639b0..abd78a9826 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -497,7 +497,7 @@ Makefile: $(version-obj-y)
>  # Build libraries
>
>  libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> -libvhost-user.a: $(libvhost-user-obj-y)
> +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
>
>  ######################################################################
>
> diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> index 3de8414898..414ceb0a2f 100644
> --- a/contrib/libvhost-user/libvhost-user.h
> +++ b/contrib/libvhost-user/libvhost-user.h
> @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
>      VHOST_USER_PROTOCOL_F_CONFIG = 9,
>      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
>
>      VHOST_USER_PROTOCOL_F_MAX
>  };
> @@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
>      VHOST_USER_POSTCOPY_ADVISE  = 28,
>      VHOST_USER_POSTCOPY_LISTEN  = 29,
>      VHOST_USER_POSTCOPY_END     = 30,
> +    VHOST_USER_GET_INFLIGHT_FD = 31,
> +    VHOST_USER_SET_INFLIGHT_FD = 32,
>      VHOST_USER_MAX
>  } VhostUserRequest;
>
> @@ -138,6 +141,13 @@ typedef struct VhostUserVringArea {
>      uint64_t offset;
>  } VhostUserVringArea;
>
> +typedef struct VhostUserInflight {
> +    uint64_t mmap_size;
> +    uint64_t mmap_offset;
> +    uint16_t num_queues;
> +    uint16_t queue_size;
> +} VhostUserInflight;
> +
>  #if defined(_WIN32)
>  # define VU_PACKED __attribute__((gcc_struct, packed))
>  #else
> @@ -163,6 +173,7 @@ typedef struct VhostUserMsg {
>          VhostUserLog log;
>          VhostUserConfig config;
>          VhostUserVringArea area;
> +        VhostUserInflight inflight;
>      } payload;
>
>      int fds[VHOST_MEMORY_MAX_NREGIONS];
> @@ -234,9 +245,61 @@ typedef struct VuRing {
>      uint32_t flags;
>  } VuRing;
>
> +typedef struct VuDescStateSplit {
> +    /* Indicate whether this descriptor is inflight or not.
> +     * Only available for head-descriptor. */
> +    uint8_t inflight;
> +
> +    /* Padding */
> +    uint8_t padding[5];
> +
> +    /* Maintain a list for the last batch of used descriptors.
> +     * Only available when batching is used for submitting */
> +    uint16_t next;
> +
> +    /* Used to preserve the order of fetching available descriptors.
> +     * Only available for head-descriptor. */
> +    uint64_t counter;
> +} VuDescStateSplit;
> +
> +typedef struct VuVirtqInflight {
> +    /* The feature flags of this region. Now it's initialized to 0. */
> +    uint64_t features;
> +
> +    /* The version of this region. It's 1 currently.
> +     * Zero value indicates a vm reset happened. */
> +    uint16_t version;
> +
> +    /* The size of VuDescStateSplit array. It's equal to the virtqueue
> +     * size. Slave could get it from queue size field of VhostUserInflight. */
> +    uint16_t desc_num;
> +
> +    /* The head of list that track the last batch of used descriptors. */
> +    uint16_t last_batch_head;
> +
> +    /* Storing the idx value of used ring */
> +    uint16_t used_idx;
> +
> +    /* Used to track the state of each descriptor in descriptor table */
> +    VuDescStateSplit desc[0];
> +} VuVirtqInflight;
> +
> +typedef struct VuVirtqInflightDesc {
> +    uint16_t index;
> +    uint64_t counter;
> +} VuVirtqInflightDesc;
> +
>  typedef struct VuVirtq {
>      VuRing vring;
>
> +    VuVirtqInflight *inflight;
> +
> +    VuVirtqInflightDesc *resubmit_list;
> +
> +    uint16_t resubmit_num;
> +
> +    uint64_t counter;
> +
>      /* Next head to pop */
>      uint16_t last_avail_idx;
>
> @@ -279,11 +342,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
>                                   vu_watch_cb cb, void *data);
>  typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
>
> +typedef struct VuDevInflightInfo {
> +    int fd;
> +    void *addr;
> +    uint64_t size;
> +} VuDevInflightInfo;
> +
>  struct VuDev {
>      int sock;
>      uint32_t nregions;
>      VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
>      VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
> +    VuDevInflightInfo inflight_info;
>      int log_call_fd;
>      int slave_fd;
>      uint64_t log_size;
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index ddd15d79cf..e08d6c7b97 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -41,6 +41,8 @@
>  #endif
>
>  #include "qemu/atomic.h"
> +#include "qemu/osdep.h"
> +#include "qemu/memfd.h"
>
>  #include "libvhost-user.h"
>
> @@ -53,6 +55,18 @@
>              _min1 < _min2 ? _min1 : _min2; })
>  #endif
>
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +/* Align each region to cache line size in inflight buffer */
> +#define INFLIGHT_ALIGNMENT 64
> +
> +/* The version of inflight buffer */
> +#define INFLIGHT_VERSION 1
> +
>  #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
>
>  /* The version of the protocol we support */
> @@ -66,6 +80,20 @@
>          }                                       \
>      } while (0)
>
> +static inline
> +bool has_feature(uint64_t features, unsigned int fbit)
> +{
> +    assert(fbit < 64);
> +    return !!(features & (1ULL << fbit));
> +}
> +
> +static inline
> +bool vu_has_feature(VuDev *dev,
> +                    unsigned int fbit)
> +{
> +    return has_feature(dev->features, fbit);
> +}
> +
>  static const char *
>  vu_request_to_string(unsigned int req)
>  {
> @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
>          REQ(VHOST_USER_POSTCOPY_ADVISE),
>          REQ(VHOST_USER_POSTCOPY_LISTEN),
>          REQ(VHOST_USER_POSTCOPY_END),
> +        REQ(VHOST_USER_GET_INFLIGHT_FD),
> +        REQ(VHOST_USER_SET_INFLIGHT_FD),
>          REQ(VHOST_USER_MAX),
>      };
>  #undef REQ
> @@ -890,6 +920,91 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
>      return true;
>  }
>
> +static int
> +inflight_desc_compare(const void *a, const void *b)
> +{
> +    VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
> +                        *desc1 = (VuVirtqInflightDesc *)b;
> +
> +    if (desc1->counter > desc0->counter &&
> +        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
> +        return 1;
> +    }
> +
> +    return -1;
> +}
> +
> +static int
> +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> +{
> +    int i = 0;
> +
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    if (unlikely(!vq->inflight->version)) {
> +        /* initialize the buffer */
> +        vq->inflight->version = INFLIGHT_VERSION;
> +        return 0;
> +    }
> +
> +    vq->used_idx = vq->vring.used->idx;
> +    vq->resubmit_num = 0;
> +    vq->resubmit_list = NULL;
> +    vq->counter = 0;
> +
> +    if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
> +        vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
> +
> +        barrier();
> +
> +        vq->inflight->used_idx = vq->used_idx;
> +    }
> +
> +    for (i = 0; i < vq->inflight->desc_num; i++) {
> +        if (vq->inflight->desc[i].inflight == 1) {
> +            vq->inuse++;
> +        }
> +    }
> +
> +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> +
> +    if (vq->inuse) {
> +        vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
> +        if (!vq->resubmit_list) {
> +            return -1;
> +        }
> +
> +        for (i = 0; i < vq->inflight->desc_num; i++) {
> +            if (vq->inflight->desc[i].inflight) {
> +                vq->resubmit_list[vq->resubmit_num].index = i;
> +                vq->resubmit_list[vq->resubmit_num].counter =
> +                                        vq->inflight->desc[i].counter;
> +                vq->resubmit_num++;
> +            }
> +        }
> +
> +        if (vq->resubmit_num > 1) {
> +            qsort(vq->resubmit_list, vq->resubmit_num,
> +                  sizeof(VuVirtqInflightDesc), inflight_desc_compare);
> +        }
> +        vq->counter = vq->resubmit_list[0].counter + 1;

scan-build reports that vq->resubmit_list[0].counter may be a garbage
value if it is not initialized in the loop above.
Xie, could you provide a fix?

> +    }
> +
> +    /* in case of I/O hang after reconnecting */
> +    if (eventfd_write(vq->kick_fd, 1)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  static bool
>  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>  {
> @@ -923,6 +1038,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>                 dev->vq[index].kick_fd, index);
>      }
>
> +    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> +        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> +    }
> +
>      return false;
>  }
>
> @@ -995,6 +1114,11 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
>
>      dev->vq[index].call_fd = vmsg->fds[0];
>
> +    /* in case of I/O hang after reconnecting */
> +    if (eventfd_write(vmsg->fds[0], 1)) {
> +        return -1;
> +    }
> +
>      DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
>
>      return false;
> @@ -1209,6 +1333,116 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
>      return true;
>  }
>
> +static inline uint64_t
> +vu_inflight_queue_size(uint16_t queue_size)
> +{
> +    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
> +           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
> +}
> +
> +static bool
> +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd;
> +    void *addr;
> +    uint64_t mmap_size;
> +    uint16_t num_queues, queue_size;
> +
> +    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    num_queues = vmsg->payload.inflight.num_queues;
> +    queue_size = vmsg->payload.inflight.queue_size;
> +
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> +    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> +
> +    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
> +
> +    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> +                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> +                            &fd, NULL);
> +
> +    if (!addr) {
> +        vu_panic(dev, "Failed to alloc vhost inflight area");
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    memset(addr, 0, mmap_size);
> +
> +    dev->inflight_info.addr = addr;
> +    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> +    dev->inflight_info.fd = vmsg->fds[0] = fd;
> +    vmsg->fd_num = 1;
> +    vmsg->payload.inflight.mmap_offset = 0;
> +
> +    DPRINT("send inflight mmap_size: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_size);
> +    DPRINT("send inflight mmap offset: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_offset);
> +
> +    return true;
> +}
> +
> +static bool
> +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd, i;
> +    uint64_t mmap_size, mmap_offset;
> +    uint16_t num_queues, queue_size;
> +    void *rc;
> +
> +    if (vmsg->fd_num != 1 ||
> +        vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> +                 vmsg->size, vmsg->fd_num);
> +        return false;
> +    }
> +
> +    fd = vmsg->fds[0];
> +    mmap_size = vmsg->payload.inflight.mmap_size;
> +    mmap_offset = vmsg->payload.inflight.mmap_offset;
> +    num_queues = vmsg->payload.inflight.num_queues;
> +    queue_size = vmsg->payload.inflight.queue_size;
> +
> +    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> +    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> +    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> +
> +    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +              fd, mmap_offset);
> +
> +    if (rc == MAP_FAILED) {
> +        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> +        return false;
> +    }
> +
> +    if (dev->inflight_info.fd) {
> +        close(dev->inflight_info.fd);
> +    }
> +
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +    }
> +
> +    dev->inflight_info.fd = fd;
> +    dev->inflight_info.addr = rc;
> +    dev->inflight_info.size = mmap_size;
> +
> +    for (i = 0; i < num_queues; i++) {
> +        dev->vq[i].inflight = (VuVirtqInflight *)rc;
> +        dev->vq[i].inflight->desc_num = queue_size;
> +        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
> +    }
> +
> +    return false;
> +}
> +
>  static bool
>  vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>  {
> @@ -1287,6 +1521,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>          return vu_set_postcopy_listen(dev, vmsg);
>      case VHOST_USER_POSTCOPY_END:
>          return vu_set_postcopy_end(dev, vmsg);
> +    case VHOST_USER_GET_INFLIGHT_FD:
> +        return vu_get_inflight_fd(dev, vmsg);
> +    case VHOST_USER_SET_INFLIGHT_FD:
> +        return vu_set_inflight_fd(dev, vmsg);
>      default:
>          vmsg_close_fds(vmsg);
>          vu_panic(dev, "Unhandled request: %d", vmsg->request);
> @@ -1354,8 +1592,24 @@ vu_deinit(VuDev *dev)
>              close(vq->err_fd);
>              vq->err_fd = -1;
>          }
> +
> +        if (vq->resubmit_list) {
> +            free(vq->resubmit_list);
> +            vq->resubmit_list = NULL;
> +        }
> +
> +        vq->inflight = NULL;
>      }
>
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +        dev->inflight_info.addr = NULL;
> +    }
> +
> +    if (dev->inflight_info.fd > 0) {
> +        close(dev->inflight_info.fd);
> +        dev->inflight_info.fd = -1;
> +    }
>
>      vu_close_log(dev);
>      if (dev->slave_fd != -1) {
> @@ -1682,20 +1936,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
>      return vring_avail_idx(vq) == vq->last_avail_idx;
>  }
>
> -static inline
> -bool has_feature(uint64_t features, unsigned int fbit)
> -{
> -    assert(fbit < 64);
> -    return !!(features & (1ULL << fbit));
> -}
> -
> -static inline
> -bool vu_has_feature(VuDev *dev,
> -                    unsigned int fbit)
> -{
> -    return has_feature(dev->features, fbit);
> -}
> -
>  static bool
>  vring_notify(VuDev *dev, VuVirtq *vq)
>  {
> @@ -1824,12 +2064,6 @@ virtqueue_map_desc(VuDev *dev,
>      *p_num_sg = num_sg;
>  }
>
> -/* Round number down to multiple */
> -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> -
> -/* Round number up to multiple */
> -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> -
>  static void *
>  virtqueue_alloc_element(size_t sz,
>                                       unsigned out_num, unsigned in_num)
> @@ -1930,9 +2164,68 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
>      return elem;
>  }
>
> +static int
> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->inflight->desc[desc_idx].counter = vq->counter++;
> +    vq->inflight->desc[desc_idx].inflight = 1;
> +
> +    return 0;
> +}
> +
> +static int
> +vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->inflight->last_batch_head = desc_idx;
> +
> +    return 0;
> +}
> +
> +static int
> +vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    barrier();
> +
> +    vq->inflight->desc[desc_idx].inflight = 0;
> +
> +    barrier();
> +
> +    vq->inflight->used_idx = vq->used_idx;
> +
> +    return 0;
> +}
> +
>  void *
>  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>  {
> +    int i;
>      unsigned int head;
>      VuVirtqElement *elem;
>
> @@ -1941,6 +2234,18 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>          return NULL;
>      }
>
> +    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
> +        i = (--vq->resubmit_num);
> +        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
> +
> +        if (!vq->resubmit_num) {
> +            free(vq->resubmit_list);
> +            vq->resubmit_list = NULL;
> +        }
> +
> +        return elem;
> +    }
> +
>      if (vu_queue_empty(dev, vq)) {
>          return NULL;
>      }
> @@ -1971,6 +2276,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>
>      vq->inuse++;
>
> +    vu_queue_inflight_get(dev, vq, head);
> +
>      return elem;
>  }
>
> @@ -2131,5 +2438,7 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
>                const VuVirtqElement *elem, unsigned int len)
>  {
>      vu_queue_fill(dev, vq, elem, len, 0);
> +    vu_queue_inflight_pre_put(dev, vq, elem->index);
>      vu_queue_flush(dev, vq, 1);
> +    vu_queue_inflight_post_put(dev, vq, elem->index);
>  }
> --
> MST
>
>
Yongji Xie Nov. 18, 2019, 9:15 a.m. UTC | #2
On Sun, 17 Nov 2019 at 01:43, Marc-André Lureau
<marcandre.lureau@gmail.com> wrote:
>
> On Wed, Mar 13, 2019 at 6:59 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > From: Xie Yongji <xieyongji@baidu.com>
> >
> > This patch adds support for the VHOST_USER_GET_INFLIGHT_FD and
> > VHOST_USER_SET_INFLIGHT_FD messages to set/get a shared buffer
> > to/from QEMU. The backend can then track inflight I/O in this buffer.
> >
> > Signed-off-by: Xie Yongji <xieyongji@baidu.com>
> > Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> > Message-Id: <20190228085355.9614-5-xieyongji@baidu.com>
> > Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  Makefile                              |   2 +-
> >  contrib/libvhost-user/libvhost-user.h |  70 ++++++
> >  contrib/libvhost-user/libvhost-user.c | 349 ++++++++++++++++++++++++--
> >  3 files changed, 400 insertions(+), 21 deletions(-)
> >
> > diff --git a/Makefile b/Makefile
> > index 6ccb8639b0..abd78a9826 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -497,7 +497,7 @@ Makefile: $(version-obj-y)
> >  # Build libraries
> >
> >  libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> > -libvhost-user.a: $(libvhost-user-obj-y)
> > +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
> >
> >  ######################################################################
> >
> > diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> > index 3de8414898..414ceb0a2f 100644
> > --- a/contrib/libvhost-user/libvhost-user.h
> > +++ b/contrib/libvhost-user/libvhost-user.h
> > @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
> >      VHOST_USER_PROTOCOL_F_CONFIG = 9,
> >      VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
> >      VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> > +    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
> >
> >      VHOST_USER_PROTOCOL_F_MAX
> >  };
> > @@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
> >      VHOST_USER_POSTCOPY_ADVISE  = 28,
> >      VHOST_USER_POSTCOPY_LISTEN  = 29,
> >      VHOST_USER_POSTCOPY_END     = 30,
> > +    VHOST_USER_GET_INFLIGHT_FD = 31,
> > +    VHOST_USER_SET_INFLIGHT_FD = 32,
> >      VHOST_USER_MAX
> >  } VhostUserRequest;
> >
> > @@ -138,6 +141,13 @@ typedef struct VhostUserVringArea {
> >      uint64_t offset;
> >  } VhostUserVringArea;
> >
> > +typedef struct VhostUserInflight {
> > +    uint64_t mmap_size;
> > +    uint64_t mmap_offset;
> > +    uint16_t num_queues;
> > +    uint16_t queue_size;
> > +} VhostUserInflight;
> > +
> >  #if defined(_WIN32)
> >  # define VU_PACKED __attribute__((gcc_struct, packed))
> >  #else
> > @@ -163,6 +173,7 @@ typedef struct VhostUserMsg {
> >          VhostUserLog log;
> >          VhostUserConfig config;
> >          VhostUserVringArea area;
> > +        VhostUserInflight inflight;
> >      } payload;
> >
> >      int fds[VHOST_MEMORY_MAX_NREGIONS];
> > @@ -234,9 +245,61 @@ typedef struct VuRing {
> >      uint32_t flags;
> >  } VuRing;
> >
> > +typedef struct VuDescStateSplit {
> > +    /* Indicate whether this descriptor is inflight or not.
> > +     * Only available for head-descriptor. */
> > +    uint8_t inflight;
> > +
> > +    /* Padding */
> > +    uint8_t padding[5];
> > +
> > +    /* Maintain a list for the last batch of used descriptors.
> > +     * Only available when batching is used for submitting */
> > +    uint16_t next;
> > +
> > +    /* Used to preserve the order of fetching available descriptors.
> > +     * Only available for head-descriptor. */
> > +    uint64_t counter;
> > +} VuDescStateSplit;
> > +
> > +typedef struct VuVirtqInflight {
> > +    /* The feature flags of this region. Now it's initialized to 0. */
> > +    uint64_t features;
> > +
> > +    /* The version of this region. It's 1 currently.
> > +     * Zero value indicates a vm reset happened. */
> > +    uint16_t version;
> > +
> > +    /* The size of VuDescStateSplit array. It's equal to the virtqueue
> > +     * size. Slave could get it from queue size field of VhostUserInflight. */
> > +    uint16_t desc_num;
> > +
> > +    /* The head of list that track the last batch of used descriptors. */
> > +    uint16_t last_batch_head;
> > +
> > +    /* Storing the idx value of used ring */
> > +    uint16_t used_idx;
> > +
> > +    /* Used to track the state of each descriptor in descriptor table */
> > +    VuDescStateSplit desc[0];
> > +} VuVirtqInflight;
> > +
> > +typedef struct VuVirtqInflightDesc {
> > +    uint16_t index;
> > +    uint64_t counter;
> > +} VuVirtqInflightDesc;
> > +
> >  typedef struct VuVirtq {
> >      VuRing vring;
> >
> > +    VuVirtqInflight *inflight;
> > +
> > +    VuVirtqInflightDesc *resubmit_list;
> > +
> > +    uint16_t resubmit_num;
> > +
> > +    uint64_t counter;
> > +
> >      /* Next head to pop */
> >      uint16_t last_avail_idx;
> >
> > @@ -279,11 +342,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
> >                                   vu_watch_cb cb, void *data);
> >  typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
> >
> > +typedef struct VuDevInflightInfo {
> > +    int fd;
> > +    void *addr;
> > +    uint64_t size;
> > +} VuDevInflightInfo;
> > +
> >  struct VuDev {
> >      int sock;
> >      uint32_t nregions;
> >      VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> >      VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
> > +    VuDevInflightInfo inflight_info;
> >      int log_call_fd;
> >      int slave_fd;
> >      uint64_t log_size;
> > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> > index ddd15d79cf..e08d6c7b97 100644
> > --- a/contrib/libvhost-user/libvhost-user.c
> > +++ b/contrib/libvhost-user/libvhost-user.c
> > @@ -41,6 +41,8 @@
> >  #endif
> >
> >  #include "qemu/atomic.h"
> > +#include "qemu/osdep.h"
> > +#include "qemu/memfd.h"
> >
> >  #include "libvhost-user.h"
> >
> > @@ -53,6 +55,18 @@
> >              _min1 < _min2 ? _min1 : _min2; })
> >  #endif
> >
> > +/* Round number down to multiple */
> > +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > +
> > +/* Round number up to multiple */
> > +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > +
> > +/* Align each region to cache line size in inflight buffer */
> > +#define INFLIGHT_ALIGNMENT 64
> > +
> > +/* The version of inflight buffer */
> > +#define INFLIGHT_VERSION 1
> > +
> >  #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
> >
> >  /* The version of the protocol we support */
> > @@ -66,6 +80,20 @@
> >          }                                       \
> >      } while (0)
> >
> > +static inline
> > +bool has_feature(uint64_t features, unsigned int fbit)
> > +{
> > +    assert(fbit < 64);
> > +    return !!(features & (1ULL << fbit));
> > +}
> > +
> > +static inline
> > +bool vu_has_feature(VuDev *dev,
> > +                    unsigned int fbit)
> > +{
> > +    return has_feature(dev->features, fbit);
> > +}
> > +
> >  static const char *
> >  vu_request_to_string(unsigned int req)
> >  {
> > @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
> >          REQ(VHOST_USER_POSTCOPY_ADVISE),
> >          REQ(VHOST_USER_POSTCOPY_LISTEN),
> >          REQ(VHOST_USER_POSTCOPY_END),
> > +        REQ(VHOST_USER_GET_INFLIGHT_FD),
> > +        REQ(VHOST_USER_SET_INFLIGHT_FD),
> >          REQ(VHOST_USER_MAX),
> >      };
> >  #undef REQ
> > @@ -890,6 +920,91 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
> >      return true;
> >  }
> >
> > +static int
> > +inflight_desc_compare(const void *a, const void *b)
> > +{
> > +    VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
> > +                        *desc1 = (VuVirtqInflightDesc *)b;
> > +
> > +    if (desc1->counter > desc0->counter &&
> > +        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
> > +        return 1;
> > +    }
> > +
> > +    return -1;
> > +}
> > +
> > +static int
> > +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> > +{
> > +    int i = 0;
> > +
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight->version)) {
> > +        /* initialize the buffer */
> > +        vq->inflight->version = INFLIGHT_VERSION;
> > +        return 0;
> > +    }
> > +
> > +    vq->used_idx = vq->vring.used->idx;
> > +    vq->resubmit_num = 0;
> > +    vq->resubmit_list = NULL;
> > +    vq->counter = 0;
> > +
> > +    if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
> > +        vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
> > +
> > +        barrier();
> > +
> > +        vq->inflight->used_idx = vq->used_idx;
> > +    }
> > +
> > +    for (i = 0; i < vq->inflight->desc_num; i++) {
> > +        if (vq->inflight->desc[i].inflight == 1) {
> > +            vq->inuse++;
> > +        }
> > +    }
> > +
> > +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> > +
> > +    if (vq->inuse) {
> > +        vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
> > +        if (!vq->resubmit_list) {
> > +            return -1;
> > +        }
> > +
> > +        for (i = 0; i < vq->inflight->desc_num; i++) {
> > +            if (vq->inflight->desc[i].inflight) {
> > +                vq->resubmit_list[vq->resubmit_num].index = i;
> > +                vq->resubmit_list[vq->resubmit_num].counter =
> > +                                        vq->inflight->desc[i].counter;
> > +                vq->resubmit_num++;
> > +            }
> > +        }
> > +
> > +        if (vq->resubmit_num > 1) {
> > +            qsort(vq->resubmit_list, vq->resubmit_num,
> > +                  sizeof(VuVirtqInflightDesc), inflight_desc_compare);
> > +        }
> > +        vq->counter = vq->resubmit_list[0].counter + 1;
>
> scan-build reports that vq->resubmit_list[0].counter may be a garbage
> value if it is not initialized in the loop above.
> Xie, could you provide a fix?
>

OK, will fix it soon. Thank you!

Thanks,
Yongji
> > +    }
> > +
> > +    /* in case of I/O hang after reconnecting */
> > +    if (eventfd_write(vq->kick_fd, 1)) {
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static bool
> >  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> >  {
> > @@ -923,6 +1038,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
> >                 dev->vq[index].kick_fd, index);
> >      }
> >
> > +    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> > +        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> > +    }
> > +
> >      return false;
> >  }
> >
> > @@ -995,6 +1114,11 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
> >
> >      dev->vq[index].call_fd = vmsg->fds[0];
> >
> > +    /* in case of I/O hang after reconnecting */
> > +    if (eventfd_write(vmsg->fds[0], 1)) {
> > +        return -1;
> > +    }
> > +
> >      DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
> >
> >      return false;
> > @@ -1209,6 +1333,116 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
> >      return true;
> >  }
> >
> > +static inline uint64_t
> > +vu_inflight_queue_size(uint16_t queue_size)
> > +{
> > +    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
> > +           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
> > +}
> > +
> > +static bool
> > +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > +    int fd;
> > +    void *addr;
> > +    uint64_t mmap_size;
> > +    uint16_t num_queues, queue_size;
> > +
> > +    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> > +        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> > +        vmsg->payload.inflight.mmap_size = 0;
> > +        return true;
> > +    }
> > +
> > +    num_queues = vmsg->payload.inflight.num_queues;
> > +    queue_size = vmsg->payload.inflight.queue_size;
> > +
> > +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> > +    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> > +
> > +    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
> > +
> > +    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> > +                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> > +                            &fd, NULL);
> > +
> > +    if (!addr) {
> > +        vu_panic(dev, "Failed to alloc vhost inflight area");
> > +        vmsg->payload.inflight.mmap_size = 0;
> > +        return true;
> > +    }
> > +
> > +    memset(addr, 0, mmap_size);
> > +
> > +    dev->inflight_info.addr = addr;
> > +    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> > +    dev->inflight_info.fd = vmsg->fds[0] = fd;
> > +    vmsg->fd_num = 1;
> > +    vmsg->payload.inflight.mmap_offset = 0;
> > +
> > +    DPRINT("send inflight mmap_size: %"PRId64"\n",
> > +           vmsg->payload.inflight.mmap_size);
> > +    DPRINT("send inflight mmap offset: %"PRId64"\n",
> > +           vmsg->payload.inflight.mmap_offset);
> > +
> > +    return true;
> > +}
> > +
> > +static bool
> > +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> > +{
> > +    int fd, i;
> > +    uint64_t mmap_size, mmap_offset;
> > +    uint16_t num_queues, queue_size;
> > +    void *rc;
> > +
> > +    if (vmsg->fd_num != 1 ||
> > +        vmsg->size != sizeof(vmsg->payload.inflight)) {
> > +        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> > +                 vmsg->size, vmsg->fd_num);
> > +        return false;
> > +    }
> > +
> > +    fd = vmsg->fds[0];
> > +    mmap_size = vmsg->payload.inflight.mmap_size;
> > +    mmap_offset = vmsg->payload.inflight.mmap_offset;
> > +    num_queues = vmsg->payload.inflight.num_queues;
> > +    queue_size = vmsg->payload.inflight.queue_size;
> > +
> > +    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> > +    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> > +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> > +    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
> > +
> > +    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> > +              fd, mmap_offset);
> > +
> > +    if (rc == MAP_FAILED) {
> > +        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> > +        return false;
> > +    }
> > +
> > +    if (dev->inflight_info.fd) {
> > +        close(dev->inflight_info.fd);
> > +    }
> > +
> > +    if (dev->inflight_info.addr) {
> > +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +    }
> > +
> > +    dev->inflight_info.fd = fd;
> > +    dev->inflight_info.addr = rc;
> > +    dev->inflight_info.size = mmap_size;
> > +
> > +    for (i = 0; i < num_queues; i++) {
> > +        dev->vq[i].inflight = (VuVirtqInflight *)rc;
> > +        dev->vq[i].inflight->desc_num = queue_size;
> > +        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
> > +    }
> > +
> > +    return false;
> > +}
> > +
> >  static bool
> >  vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> >  {
> > @@ -1287,6 +1521,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
> >          return vu_set_postcopy_listen(dev, vmsg);
> >      case VHOST_USER_POSTCOPY_END:
> >          return vu_set_postcopy_end(dev, vmsg);
> > +    case VHOST_USER_GET_INFLIGHT_FD:
> > +        return vu_get_inflight_fd(dev, vmsg);
> > +    case VHOST_USER_SET_INFLIGHT_FD:
> > +        return vu_set_inflight_fd(dev, vmsg);
> >      default:
> >          vmsg_close_fds(vmsg);
> >          vu_panic(dev, "Unhandled request: %d", vmsg->request);
> > @@ -1354,8 +1592,24 @@ vu_deinit(VuDev *dev)
> >              close(vq->err_fd);
> >              vq->err_fd = -1;
> >          }
> > +
> > +        if (vq->resubmit_list) {
> > +            free(vq->resubmit_list);
> > +            vq->resubmit_list = NULL;
> > +        }
> > +
> > +        vq->inflight = NULL;
> >      }
> >
> > +    if (dev->inflight_info.addr) {
> > +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> > +        dev->inflight_info.addr = NULL;
> > +    }
> > +
> > +    if (dev->inflight_info.fd > 0) {
> > +        close(dev->inflight_info.fd);
> > +        dev->inflight_info.fd = -1;
> > +    }
> >
> >      vu_close_log(dev);
> >      if (dev->slave_fd != -1) {
> > @@ -1682,20 +1936,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
> >      return vring_avail_idx(vq) == vq->last_avail_idx;
> >  }
> >
> > -static inline
> > -bool has_feature(uint64_t features, unsigned int fbit)
> > -{
> > -    assert(fbit < 64);
> > -    return !!(features & (1ULL << fbit));
> > -}
> > -
> > -static inline
> > -bool vu_has_feature(VuDev *dev,
> > -                    unsigned int fbit)
> > -{
> > -    return has_feature(dev->features, fbit);
> > -}
> > -
> >  static bool
> >  vring_notify(VuDev *dev, VuVirtq *vq)
> >  {
> > @@ -1824,12 +2064,6 @@ virtqueue_map_desc(VuDev *dev,
> >      *p_num_sg = num_sg;
> >  }
> >
> > -/* Round number down to multiple */
> > -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> > -
> > -/* Round number up to multiple */
> > -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> > -
> >  static void *
> >  virtqueue_alloc_element(size_t sz,
> >                                       unsigned out_num, unsigned in_num)
> > @@ -1930,9 +2164,68 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
> >      return elem;
> >  }
> >
> > +static int
> > +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
> > +    vq->inflight->desc[desc_idx].counter = vq->counter++;
> > +    vq->inflight->desc[desc_idx].inflight = 1;
> > +
> > +    return 0;
> > +}
> > +
> > +static int
> > +vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
> > +    vq->inflight->last_batch_head = desc_idx;
> > +
> > +    return 0;
> > +}
> > +
> > +static int
> > +vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
> > +    barrier();
> > +
> > +    vq->inflight->desc[desc_idx].inflight = 0;
> > +
> > +    barrier();
> > +
> > +    vq->inflight->used_idx = vq->used_idx;
> > +
> > +    return 0;
> > +}
> > +
> >  void *
> >  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
> >  {
> > +    int i;
> >      unsigned int head;
> >      VuVirtqElement *elem;
> >
> > @@ -1941,6 +2234,18 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
> >          return NULL;
> >      }
> >
> > +    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
> > +        i = (--vq->resubmit_num);
> > +        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
> > +
> > +        if (!vq->resubmit_num) {
> > +            free(vq->resubmit_list);
> > +            vq->resubmit_list = NULL;
> > +        }
> > +
> > +        return elem;
> > +    }
> > +
> >      if (vu_queue_empty(dev, vq)) {
> >          return NULL;
> >      }
> > @@ -1971,6 +2276,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
> >
> >      vq->inuse++;
> >
> > +    vu_queue_inflight_get(dev, vq, head);
> > +
> >      return elem;
> >  }
> >
> > @@ -2131,5 +2438,7 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
> >                const VuVirtqElement *elem, unsigned int len)
> >  {
> >      vu_queue_fill(dev, vq, elem, len, 0);
> > +    vu_queue_inflight_pre_put(dev, vq, elem->index);
> >      vu_queue_flush(dev, vq, 1);
> > +    vu_queue_inflight_post_put(dev, vq, elem->index);
> >  }
> > --
> > MST
> >
> >
>
>
> --
> Marc-André Lureau
>
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index 7fa04e0821..3cf34bf8b3 100644
--- a/Makefile
+++ b/Makefile
@@ -479,7 +479,7 @@  Makefile: $(version-obj-y)
 # Build libraries
 
 libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
-libvhost-user.a: $(libvhost-user-obj-y)
+libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
 
 ######################################################################
 
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index ea0f414b6d..065ab60924 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -41,6 +41,8 @@ 
 #endif
 
 #include "qemu/atomic.h"
+#include "qemu/osdep.h"
+#include "qemu/memfd.h"
 
 #include "libvhost-user.h"
 
@@ -53,6 +55,18 @@ 
             _min1 < _min2 ? _min1 : _min2; })
 #endif
 
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/* Align each region to cache line size in inflight buffer */
+#define INFLIGHT_ALIGNMENT 64
+
+/* The version of inflight buffer */
+#define INFLIGHT_VERSION 1
+
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 
 /* The version of the protocol we support */
@@ -66,6 +80,20 @@ 
         }                                       \
     } while (0)
 
+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline
+bool vu_has_feature(VuDev *dev,
+                    unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
 static const char *
 vu_request_to_string(unsigned int req)
 {
@@ -100,6 +128,8 @@  vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_POSTCOPY_ADVISE),
         REQ(VHOST_USER_POSTCOPY_LISTEN),
         REQ(VHOST_USER_POSTCOPY_END),
+        REQ(VHOST_USER_GET_INFLIGHT_FD),
+        REQ(VHOST_USER_SET_INFLIGHT_FD),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -890,6 +920,91 @@  vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static int
+inflight_desc_compare(const void *a, const void *b)
+{
+    VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
+                        *desc1 = (VuVirtqInflightDesc *)b;
+
+    if (desc1->counter > desc0->counter &&
+        (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
+        return 1;
+    }
+
+    return -1;
+}
+
+static int
+vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
+{
+    int i = 0;
+
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    if (unlikely(!vq->inflight->version)) {
+        /* initialize the buffer */
+        vq->inflight->version = INFLIGHT_VERSION;
+        return 0;
+    }
+
+    vq->used_idx = vq->vring.used->idx;
+    vq->resubmit_num = 0;
+    vq->resubmit_list = NULL;
+    vq->counter = 0;
+
+    if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
+        vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
+
+        barrier();
+
+        vq->inflight->used_idx = vq->used_idx;
+    }
+
+    for (i = 0; i < vq->inflight->desc_num; i++) {
+        if (vq->inflight->desc[i].inflight == 1) {
+            vq->inuse++;
+        }
+    }
+
+    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+    if (vq->inuse) {
+        vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
+        if (!vq->resubmit_list) {
+            return -1;
+        }
+
+        for (i = 0; i < vq->inflight->desc_num; i++) {
+            if (vq->inflight->desc[i].inflight) {
+                vq->resubmit_list[vq->resubmit_num].index = i;
+                vq->resubmit_list[vq->resubmit_num].counter =
+                                        vq->inflight->desc[i].counter;
+                vq->resubmit_num++;
+            }
+        }
+
+        if (vq->resubmit_num > 1) {
+            qsort(vq->resubmit_list, vq->resubmit_num,
+                  sizeof(VuVirtqInflightDesc), inflight_desc_compare);
+        }
+        vq->counter = vq->resubmit_list[0].counter + 1;
+    }
+
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vq->kick_fd, 1)) {
+        return -1;
+    }
+
+    return 0;
+}
+
 static bool
 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -923,6 +1038,10 @@  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
                dev->vq[index].kick_fd, index);
     }
 
+    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
+        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
+    }
+
     return false;
 }
 
@@ -995,6 +1114,11 @@  vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
 
     dev->vq[index].call_fd = vmsg->fds[0];
 
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vmsg->fds[0], 1)) {
+        return -1;
+    }
+
     DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
 
     return false;
@@ -1209,6 +1333,116 @@  vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static inline uint64_t
+vu_inflight_queue_size(uint16_t queue_size)
+{
+    return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
+           sizeof(uint16_t), INFLIGHT_ALIGNMENT);
+}
+
+static bool
+vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd;
+    void *addr;
+    uint64_t mmap_size;
+    uint16_t num_queues, queue_size;
+
+    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    num_queues = vmsg->payload.inflight.num_queues;
+    queue_size = vmsg->payload.inflight.queue_size;
+
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+    mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
+
+    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
+                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                            &fd, NULL);
+
+    if (!addr) {
+        vu_panic(dev, "Failed to alloc vhost inflight area");
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    memset(addr, 0, mmap_size);
+
+    dev->inflight_info.addr = addr;
+    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
+    dev->inflight_info.fd = vmsg->fds[0] = fd;
+    vmsg->fd_num = 1;
+    vmsg->payload.inflight.mmap_offset = 0;
+
+    DPRINT("send inflight mmap_size: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_size);
+    DPRINT("send inflight mmap offset: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_offset);
+
+    return true;
+}
+
+static bool
+vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd, i;
+    uint64_t mmap_size, mmap_offset;
+    uint16_t num_queues, queue_size;
+    void *rc;
+
+    if (vmsg->fd_num != 1 ||
+        vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
+                 vmsg->size, vmsg->fd_num);
+        return false;
+    }
+
+    fd = vmsg->fds[0];
+    mmap_size = vmsg->payload.inflight.mmap_size;
+    mmap_offset = vmsg->payload.inflight.mmap_offset;
+    num_queues = vmsg->payload.inflight.num_queues;
+    queue_size = vmsg->payload.inflight.queue_size;
+
+    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
+    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+              fd, mmap_offset);
+
+    if (rc == MAP_FAILED) {
+        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
+        return false;
+    }
+
+    if (dev->inflight_info.fd) {
+        close(dev->inflight_info.fd);
+    }
+
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+    }
+
+    dev->inflight_info.fd = fd;
+    dev->inflight_info.addr = rc;
+    dev->inflight_info.size = mmap_size;
+
+    for (i = 0; i < num_queues; i++) {
+        dev->vq[i].inflight = (VuVirtqInflight *)rc;
+        dev->vq[i].inflight->desc_num = queue_size;
+        rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
+    }
+
+    return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1286,6 +1520,10 @@  vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_set_postcopy_listen(dev, vmsg);
     case VHOST_USER_POSTCOPY_END:
         return vu_set_postcopy_end(dev, vmsg);
+    case VHOST_USER_GET_INFLIGHT_FD:
+        return vu_get_inflight_fd(dev, vmsg);
+    case VHOST_USER_SET_INFLIGHT_FD:
+        return vu_set_inflight_fd(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -1353,8 +1591,24 @@  vu_deinit(VuDev *dev)
             close(vq->err_fd);
             vq->err_fd = -1;
         }
+
+        if (vq->resubmit_list) {
+            free(vq->resubmit_list);
+            vq->resubmit_list = NULL;
+        }
+
+        vq->inflight = NULL;
+    }
+
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+        dev->inflight_info.addr = NULL;
     }
 
+    if (dev->inflight_info.fd > 0) {
+        close(dev->inflight_info.fd);
+        dev->inflight_info.fd = -1;
+    }
 
     vu_close_log(dev);
     if (dev->slave_fd != -1) {
@@ -1681,20 +1935,6 @@  vu_queue_empty(VuDev *dev, VuVirtq *vq)
     return vring_avail_idx(vq) == vq->last_avail_idx;
 }
 
-static inline
-bool has_feature(uint64_t features, unsigned int fbit)
-{
-    assert(fbit < 64);
-    return !!(features & (1ULL << fbit));
-}
-
-static inline
-bool vu_has_feature(VuDev *dev,
-                    unsigned int fbit)
-{
-    return has_feature(dev->features, fbit);
-}
-
 static bool
 vring_notify(VuDev *dev, VuVirtq *vq)
 {
@@ -1823,12 +2063,6 @@  virtqueue_map_desc(VuDev *dev,
     *p_num_sg = num_sg;
 }
 
-/* Round number down to multiple */
-#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
-
-/* Round number up to multiple */
-#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
-
 static void *
 virtqueue_alloc_element(size_t sz,
                                      unsigned out_num, unsigned in_num)
@@ -1929,9 +2163,68 @@  vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
     return elem;
 }
 
+static int
+vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->desc[desc_idx].counter = vq->counter++;
+    vq->inflight->desc[desc_idx].inflight = 1;
+
+    return 0;
+}
+
+static int
+vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->last_batch_head = desc_idx;
+
+    return 0;
+}
+
+static int
+vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    barrier();
+
+    vq->inflight->desc[desc_idx].inflight = 0;
+
+    barrier();
+
+    vq->inflight->used_idx = vq->used_idx;
+
+    return 0;
+}
+
 void *
 vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 {
+    int i;
     unsigned int head;
     VuVirtqElement *elem;
 
@@ -1940,6 +2233,18 @@  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
         return NULL;
     }
 
+    if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
+        i = (--vq->resubmit_num);
+        elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
+
+        if (!vq->resubmit_num) {
+            free(vq->resubmit_list);
+            vq->resubmit_list = NULL;
+        }
+
+        return elem;
+    }
+
     if (vu_queue_empty(dev, vq)) {
         return NULL;
     }
@@ -1970,6 +2275,8 @@  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 
     vq->inuse++;
 
+    vu_queue_inflight_get(dev, vq, head);
+
     return elem;
 }
 
@@ -2114,5 +2421,7 @@  vu_queue_push(VuDev *dev, VuVirtq *vq,
               const VuVirtqElement *elem, unsigned int len)
 {
     vu_queue_fill(dev, vq, elem, len, 0);
+    vu_queue_inflight_pre_put(dev, vq, elem->index);
     vu_queue_flush(dev, vq, 1);
+    vu_queue_inflight_post_put(dev, vq, elem->index);
 }
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 4aa55b4d2d..8a32456943 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -53,6 +53,7 @@  enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 
     VHOST_USER_PROTOCOL_F_MAX
 };
@@ -91,6 +92,8 @@  typedef enum VhostUserRequest {
     VHOST_USER_POSTCOPY_ADVISE  = 28,
     VHOST_USER_POSTCOPY_LISTEN  = 29,
     VHOST_USER_POSTCOPY_END     = 30,
+    VHOST_USER_GET_INFLIGHT_FD = 31,
+    VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -138,6 +141,13 @@  typedef struct VhostUserVringArea {
     uint64_t offset;
 } VhostUserVringArea;
 
+typedef struct VhostUserInflight {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+    uint16_t num_queues;
+    uint16_t queue_size;
+} VhostUserInflight;
+
 #if defined(_WIN32)
 # define VU_PACKED __attribute__((gcc_struct, packed))
 #else
@@ -163,6 +173,7 @@  typedef struct VhostUserMsg {
         VhostUserLog log;
         VhostUserConfig config;
         VhostUserVringArea area;
+        VhostUserInflight inflight;
     } payload;
 
     int fds[VHOST_MEMORY_MAX_NREGIONS];
@@ -234,9 +245,61 @@  typedef struct VuRing {
     uint32_t flags;
 } VuRing;
 
+typedef struct VuDescStateSplit {
+    /* Indicate whether this descriptor is inflight or not.
+     * Only available for head-descriptor. */
+    uint8_t inflight;
+
+    /* Padding */
+    uint8_t padding[5];
+
+    /* Maintain a list for the last batch of used descriptors.
+     * Only available when batching is used for submitting */
+    uint16_t next;
+
+    /* Used to preserve the order of fetching available descriptors.
+     * Only available for head-descriptor. */
+    uint64_t counter;
+} VuDescStateSplit;
+
+typedef struct VuVirtqInflight {
+    /* The feature flags of this region. Now it's initialized to 0. */
+    uint64_t features;
+
+    /* The version of this region. It's 1 currently.
+     * Zero value indicates a vm reset happened. */
+    uint16_t version;
+
+    /* The size of the VuDescStateSplit array. It's equal to the virtqueue
+     * size. Slave can get it from the queue_size field of VhostUserInflight. */
+    uint16_t desc_num;
+
+    /* The head of the list that tracks the last batch of used descriptors. */
+    uint16_t last_batch_head;
+
+    /* Stores the idx value of the used ring */
+    uint16_t used_idx;
+
+    /* Used to track the state of each descriptor in descriptor table */
+    VuDescStateSplit desc[0];
+} VuVirtqInflight;
+
+typedef struct VuVirtqInflightDesc {
+    uint16_t index;
+    uint64_t counter;
+} VuVirtqInflightDesc;
+
 typedef struct VuVirtq {
     VuRing vring;
 
+    VuVirtqInflight *inflight;
+
+    VuVirtqInflightDesc *resubmit_list;
+
+    uint16_t resubmit_num;
+
+    uint64_t counter;
+
     /* Next head to pop */
     uint16_t last_avail_idx;
 
@@ -279,11 +342,18 @@  typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
                                  vu_watch_cb cb, void *data);
 typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
 
+typedef struct VuDevInflightInfo {
+    int fd;
+    void *addr;
+    uint64_t size;
+} VuDevInflightInfo;
+
 struct VuDev {
     int sock;
     uint32_t nregions;
     VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
     VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
+    VuDevInflightInfo inflight_info;
     int log_call_fd;
     int slave_fd;
     uint64_t log_size;