Message ID: 20211208052010.1719-1-longpeng2@huawei.com (mailing list archive)
State: New, archived
Series: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support
On Wed, Dec 8, 2021 at 1:20 PM Longpeng(Mike) <longpeng2@huawei.com> wrote: > > From: Longpeng <longpeng2@huawei.com> > > Hi guys, > > This patch introduces vhost-vdpa-net device, which is inspired > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > I've tested this patch on Huawei's offload card: > ./x86_64-softmmu/qemu-system-x86_64 \ > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > For virtio hardware offloading, the most important requirement for us > is to support live migration between offloading cards from different > vendors, the combination of netdev and virtio-net seems too heavy, we > prefer a lightweight way. Could you elaborate more on this? It's mainly the control path when using with netdev, and it provides a lot of other benefits: - decouple the transport specific stuff out of the vhost abstraction, mmio device is supported with 0 line of code - migration compatibility, reuse the migration stream that is already supported by Qemu virtio-net, this will allow migration among different vhost backends. - software mediation facility, not all the virtqueues are assigned to guests directly. One example is the virtio-net cvq, qemu may want to intercept and record the device state for migration. Reusing the current virtio-net codes simplifies a lot of codes. - transparent failover (in the future), the nic model can choose to switch between vhost backends etc. > > Maybe we could support both in the future ? For the net, we need to figure out the advantages of this approach first. Note that we didn't have vhost-user-net-pci or vhost-pci in the past. For the block, I will leave Stefan and Stefano to comment. > Such as: > > * Lightweight > Net: vhost-vdpa-net > Storage: vhost-vdpa-blk > > * Heavy but more powerful > Net: netdev + virtio-net + vhost-vdpa > Storage: bdrv + virtio-blk + vhost-vdpa > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > --- > hw/net/meson.build | 1 + > hw/net/vhost-vdpa-net.c | 338 +++++++++++++++++++++++++++++++++++++ > hw/virtio/Kconfig | 5 + > hw/virtio/meson.build | 1 + > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ I'd expect there's no device type specific code in this approach and any kind of vDPA devices could be used with a general pci device. Any reason for having net specific types here? 
> include/hw/virtio/vhost-vdpa-net.h | 31 ++++ > include/net/vhost-vdpa.h | 2 + > net/vhost-vdpa.c | 2 +- > 8 files changed, 497 insertions(+), 1 deletion(-) > create mode 100644 hw/net/vhost-vdpa-net.c > create mode 100644 hw/virtio/vhost-vdpa-net-pci.c > create mode 100644 include/hw/virtio/vhost-vdpa-net.h > > diff --git a/hw/net/meson.build b/hw/net/meson.build > index bdf71f1..139ebc4 100644 > --- a/hw/net/meson.build > +++ b/hw/net/meson.build > @@ -44,6 +44,7 @@ specific_ss.add(when: 'CONFIG_XILINX_ETHLITE', if_true: files('xilinx_ethlite.c' > > softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('net_rx_pkt.c')) > specific_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net.c')) > +specific_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: files('vhost-vdpa-net.c')) > > softmmu_ss.add(when: ['CONFIG_VIRTIO_NET', 'CONFIG_VHOST_NET'], if_true: files('vhost_net.c'), if_false: files('vhost_net-stub.c')) > softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost_net-stub.c')) > diff --git a/hw/net/vhost-vdpa-net.c b/hw/net/vhost-vdpa-net.c > new file mode 100644 > index 0000000..48b99f9 > --- /dev/null > +++ b/hw/net/vhost-vdpa-net.c > @@ -0,0 +1,338 @@ > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > +#include "qemu/cutils.h" > +#include "hw/qdev-core.h" > +#include "hw/qdev-properties.h" > +#include "hw/qdev-properties-system.h" > +#include "hw/virtio/vhost.h" > +#include "hw/virtio/vhost-vdpa-net.h" > +#include "hw/virtio/virtio.h" > +#include "hw/virtio/virtio-bus.h" > +#include "hw/virtio/virtio-access.h" > +#include "sysemu/sysemu.h" > +#include "sysemu/runstate.h" > +#include "net/vhost-vdpa.h" > + > +static void vhost_vdpa_net_get_config(VirtIODevice *vdev, uint8_t *config) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + memcpy(config, &s->netcfg, sizeof(struct virtio_net_config)); > +} > + > +static void vhost_vdpa_net_set_config(VirtIODevice *vdev, const uint8_t *config) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + struct virtio_net_config *netcfg = (struct virtio_net_config *)config; > + int ret; > + > + ret = vhost_dev_set_config(&s->dev, (uint8_t *)netcfg, 0, sizeof(*netcfg), > + VHOST_SET_CONFIG_TYPE_MASTER); > + if (ret) { > + error_report("set device config space failed"); > + return; > + } > +} > + > +static uint64_t vhost_vdpa_net_get_features(VirtIODevice *vdev, > + uint64_t features, > + Error **errp) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + virtio_add_feature(&features, VIRTIO_NET_F_CSUM); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_CSUM); > + virtio_add_feature(&features, VIRTIO_NET_F_MAC); > + virtio_add_feature(&features, VIRTIO_NET_F_GSO); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO4); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ECN); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_UFO); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_UFO); > + virtio_add_feature(&features, VIRTIO_NET_F_MRG_RXBUF); > + virtio_add_feature(&features, VIRTIO_NET_F_STATUS); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VQ); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VLAN); > + 
virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX_EXTRA); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_MAC_ADDR); > + virtio_add_feature(&features, VIRTIO_NET_F_MQ); Any reason for those hand crafted features? > + > + return vhost_get_features(&s->dev, vdpa_feature_bits, features); > +} > + > +static int vhost_vdpa_net_start(VirtIODevice *vdev, Error **errp) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > + int i, ret; > + > + if (!k->set_guest_notifiers) { > + error_setg(errp, "binding does not support guest notifiers"); > + return -ENOSYS; > + } > + > + ret = vhost_dev_enable_notifiers(&s->dev, vdev); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error enabling host notifiers"); > + return ret; > + } > + > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error binding guest notifier"); > + goto err_host_notifiers; > + } > + > + s->dev.acked_features = vdev->guest_features; > + > + ret = vhost_dev_start(&s->dev, vdev); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error starting vhost"); > + goto err_guest_notifiers; > + } > + s->started = true; > + > + /* guest_notifier_mask/pending not used yet, so just unmask > + * everything here. virtio-pci will do the right thing by > + * enabling/disabling irqfd. > + */ > + for (i = 0; i < s->dev.nvqs; i++) { > + vhost_virtqueue_mask(&s->dev, vdev, i, false); > + } > + > + return ret; > + > +err_guest_notifiers: > + k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > +err_host_notifiers: > + vhost_dev_disable_notifiers(&s->dev, vdev); > + return ret; > +} > + > +static void vhost_vdpa_net_handle_output(VirtIODevice *vdev, VirtQueue *vq) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + Error *local_err = NULL; > + int i, ret; > + > + if (!vdev->start_on_kick) { > + return; > + } > + > + if (s->dev.started) { > + return; > + } > + > + /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start > + * vhost here instead of waiting for .set_status(). 
> + */ > + ret = vhost_vdpa_net_start(vdev, &local_err); > + if (ret < 0) { > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > + return; > + } > + > + /* Kick right away to begin processing requests already in vring */ > + for (i = 0; i < s->dev.nvqs; i++) { > + VirtQueue *kick_vq = virtio_get_queue(vdev, i); > + > + if (!virtio_queue_get_desc_addr(vdev, i)) { > + continue; > + } > + event_notifier_set(virtio_queue_get_host_notifier(kick_vq)); > + } > +} > + > +static void vhost_vdpa_net_stop(VirtIODevice *vdev) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > + int ret; > + > + if (!s->started) { > + return; > + } > + s->started = false; > + > + if (!k->set_guest_notifiers) { > + return; > + } > + > + vhost_dev_stop(&s->dev, vdev); > + > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > + if (ret < 0) { > + error_report("vhost guest notifier cleanup failed: %d", ret); > + return; > + } > + > + vhost_dev_disable_notifiers(&s->dev, vdev); > +} > + > +static void vhost_vdpa_net_set_status(VirtIODevice *vdev, uint8_t status) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + bool should_start = virtio_device_started(vdev, status); > + Error *local_err = NULL; > + int ret; > + > + if (!vdev->vm_running) { > + should_start = false; > + } > + > + if (s->started == should_start) { > + return; > + } > + > + if (should_start) { > + ret = vhost_vdpa_net_start(vdev, &local_err); > + if (ret < 0) { > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > + } > + } else { > + vhost_vdpa_net_stop(vdev); > + } > +} > + > +static void vhost_vdpa_net_unrealize(VHostVdpaNet *s) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + int i; > + > + for (i = 0; i < s->queue_pairs * 2; i++) { > + virtio_delete_queue(s->virtqs[i]); > + } > + /* ctrl vq */ > + virtio_delete_queue(s->virtqs[i]); > + > + g_free(s->virtqs); > + virtio_cleanup(vdev); > +} > + > +static void vhost_vdpa_net_device_realize(DeviceState *dev, Error **errp) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + int i, ret; > + > + s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR); > + if (s->vdpa.device_fd == -1) { > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > + s->vdpa_dev, strerror(errno)); > + return; > + } > + > + virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, > + sizeof(struct virtio_net_config)); > + > + s->dev.nvqs = s->queue_pairs * 2 + 1; > + s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs); > + s->dev.vq_index = 0; > + s->dev.vq_index_end = s->dev.nvqs; > + s->dev.backend_features = 0; > + s->started = false; > + > + s->virtqs = g_new0(VirtQueue *, s->dev.nvqs); > + for (i = 0; i < s->dev.nvqs; i++) { > + s->virtqs[i] = virtio_add_queue(vdev, s->queue_size, > + vhost_vdpa_net_handle_output); We should check whether MQ is negotiated since the index varies depending on that. 
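[A minimal sketch of the MQ check suggested above, assuming it runs on the start path after feature negotiation; the helper name is made up, and VHostVdpaNet/queue_pairs follow the naming in this patch. Per the virtio spec the control queue is index 2 without VIRTIO_NET_F_MQ and 2 * max_virtqueue_pairs with it, so the start path could size s->dev.nvqs from this instead of always assuming queue_pairs * 2 + 1.]

static int vhost_vdpa_net_cvq_index(VHostVdpaNet *s)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    /* Without VIRTIO_NET_F_MQ the guest drives a single queue pair and
     * the control queue sits right after it. */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
        return 2;
    }

    /* With MQ negotiated the control queue follows all data queue pairs. */
    return s->queue_pairs * 2;
}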
> + } > + > + ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL); > + if (ret < 0) { > + error_setg(errp, "vhost-vdpa-net: vhost initialization failed: %s", > + strerror(-ret)); > + goto init_err; > + } > + > + ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->netcfg, > + sizeof(struct virtio_net_config), NULL); > + if (ret < 0) { > + error_setg(errp, "vhost-vdpa-net: get network config failed"); > + goto config_err; > + } > + > + return; > +config_err: > + vhost_dev_cleanup(&s->dev); > +init_err: > + vhost_vdpa_net_unrealize(s); > + close(s->vdpa.device_fd); > +} > + > +static void vhost_vdpa_net_device_unrealize(DeviceState *dev) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + virtio_set_status(vdev, 0); > + vhost_dev_cleanup(&s->dev); > + vhost_vdpa_net_unrealize(s); > + close(s->vdpa.device_fd); > +} > + > +static const VMStateDescription vmstate_vhost_vdpa_net = { > + .name = "vhost-vdpa-net", > + .minimum_version_id = 1, > + .version_id = 1, > + .fields = (VMStateField[]) { > + VMSTATE_VIRTIO_DEVICE, > + VMSTATE_END_OF_LIST() > + }, > +}; > + > +static void vhost_vdpa_net_instance_init(Object *obj) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(obj); > + > + device_add_bootindex_property(obj, &s->bootindex, "bootindex", > + "/ethernet-phy@0,0", DEVICE(obj)); > +} > + > +static Property vhost_vdpa_net_properties[] = { > + DEFINE_PROP_STRING("vdpa-dev", VHostVdpaNet, vdpa_dev), > + DEFINE_PROP_UINT16("queue-pairs", VHostVdpaNet, queue_pairs, > + VHOST_VDPA_NET_AUTO_QUEUE_PAIRS), Any reason that we need the queue pairs parameter? Note that it is expected to be provisioned by the netlink for the management device. Thanks
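[On the question above about net-specific types, a minimal sketch of how a generic realize path could ask the backend what it is instead of hard-coding virtio-net. It assumes only the VHOST_VDPA_GET_DEVICE_ID ioctl from <linux/vhost.h>; the function name is made up and error handling is trimmed.]

#include <sys/ioctl.h>
#include <linux/vhost.h>
#include "standard-headers/linux/virtio_ids.h"
#include "qapi/error.h"

/* Ask an opened /dev/vhost-vdpa-N fd which virtio device class it exposes. */
static int vdpa_device_get_virtio_id(int device_fd, uint32_t *device_id,
                                     Error **errp)
{
    if (ioctl(device_fd, VHOST_VDPA_GET_DEVICE_ID, device_id) < 0) {
        int err = errno;

        error_setg_errno(errp, err, "VHOST_VDPA_GET_DEVICE_ID failed");
        return -err;
    }

    /* A net-only device would reject anything but VIRTIO_ID_NET here;
     * a generic vhost-vdpa device would instead use the returned id to
     * pick the virtio device type at realize time. */
    return 0;
}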
On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > From: Longpeng <longpeng2@huawei.com> > > Hi guys, > > This patch introduces vhost-vdpa-net device, which is inspired > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > I've tested this patch on Huawei's offload card: > ./x86_64-softmmu/qemu-system-x86_64 \ > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > For virtio hardware offloading, the most important requirement for us > is to support live migration between offloading cards from different > vendors, the combination of netdev and virtio-net seems too heavy, we > prefer a lightweight way. Did not look at the patch in depth yet. Is this already supported with this patch? Or is that just the plan? > Maybe we could support both in the future ? Such as: > > * Lightweight > Net: vhost-vdpa-net > Storage: vhost-vdpa-blk > > * Heavy but more powerful > Net: netdev + virtio-net + vhost-vdpa > Storage: bdrv + virtio-blk + vhost-vdpa I'd like to better understand what is in and out of scope for this device. Which features would be "more powerful" and belong in virtio-net, and which in vhost-vdpa-net? > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > --- > hw/net/meson.build | 1 + > hw/net/vhost-vdpa-net.c | 338 +++++++++++++++++++++++++++++++++++++ > hw/virtio/Kconfig | 5 + > hw/virtio/meson.build | 1 + > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ > include/hw/virtio/vhost-vdpa-net.h | 31 ++++ > include/net/vhost-vdpa.h | 2 + > net/vhost-vdpa.c | 2 +- > 8 files changed, 497 insertions(+), 1 deletion(-) > create mode 100644 hw/net/vhost-vdpa-net.c > create mode 100644 hw/virtio/vhost-vdpa-net-pci.c > create mode 100644 include/hw/virtio/vhost-vdpa-net.h > > diff --git a/hw/net/meson.build b/hw/net/meson.build > index bdf71f1..139ebc4 100644 > --- a/hw/net/meson.build > +++ b/hw/net/meson.build > @@ -44,6 +44,7 @@ specific_ss.add(when: 'CONFIG_XILINX_ETHLITE', if_true: files('xilinx_ethlite.c' > > softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('net_rx_pkt.c')) > specific_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net.c')) > +specific_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: files('vhost-vdpa-net.c')) > > softmmu_ss.add(when: ['CONFIG_VIRTIO_NET', 'CONFIG_VHOST_NET'], if_true: files('vhost_net.c'), if_false: files('vhost_net-stub.c')) > softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost_net-stub.c')) > diff --git a/hw/net/vhost-vdpa-net.c b/hw/net/vhost-vdpa-net.c > new file mode 100644 > index 0000000..48b99f9 > --- /dev/null > +++ b/hw/net/vhost-vdpa-net.c > @@ -0,0 +1,338 @@ > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > +#include "qemu/cutils.h" > +#include "hw/qdev-core.h" > +#include "hw/qdev-properties.h" > +#include "hw/qdev-properties-system.h" > +#include "hw/virtio/vhost.h" > +#include "hw/virtio/vhost-vdpa-net.h" > +#include "hw/virtio/virtio.h" > +#include "hw/virtio/virtio-bus.h" > +#include "hw/virtio/virtio-access.h" > +#include "sysemu/sysemu.h" > +#include "sysemu/runstate.h" > +#include "net/vhost-vdpa.h" > + > +static void vhost_vdpa_net_get_config(VirtIODevice *vdev, uint8_t *config) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + memcpy(config, &s->netcfg, sizeof(struct virtio_net_config)); > +} > + > +static void vhost_vdpa_net_set_config(VirtIODevice *vdev, const uint8_t *config) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + struct virtio_net_config 
*netcfg = (struct virtio_net_config *)config; > + int ret; > + > + ret = vhost_dev_set_config(&s->dev, (uint8_t *)netcfg, 0, sizeof(*netcfg), > + VHOST_SET_CONFIG_TYPE_MASTER); > + if (ret) { > + error_report("set device config space failed"); > + return; > + } > +} > + > +static uint64_t vhost_vdpa_net_get_features(VirtIODevice *vdev, > + uint64_t features, > + Error **errp) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + virtio_add_feature(&features, VIRTIO_NET_F_CSUM); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_CSUM); > + virtio_add_feature(&features, VIRTIO_NET_F_MAC); > + virtio_add_feature(&features, VIRTIO_NET_F_GSO); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO4); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ECN); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_UFO); > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN); > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_UFO); > + virtio_add_feature(&features, VIRTIO_NET_F_MRG_RXBUF); > + virtio_add_feature(&features, VIRTIO_NET_F_STATUS); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VQ); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VLAN); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX_EXTRA); > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_MAC_ADDR); > + virtio_add_feature(&features, VIRTIO_NET_F_MQ); > + > + return vhost_get_features(&s->dev, vdpa_feature_bits, features); > +} > + > +static int vhost_vdpa_net_start(VirtIODevice *vdev, Error **errp) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > + int i, ret; > + > + if (!k->set_guest_notifiers) { > + error_setg(errp, "binding does not support guest notifiers"); > + return -ENOSYS; > + } > + > + ret = vhost_dev_enable_notifiers(&s->dev, vdev); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error enabling host notifiers"); > + return ret; > + } > + > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error binding guest notifier"); > + goto err_host_notifiers; > + } > + > + s->dev.acked_features = vdev->guest_features; > + > + ret = vhost_dev_start(&s->dev, vdev); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Error starting vhost"); > + goto err_guest_notifiers; > + } > + s->started = true; > + > + /* guest_notifier_mask/pending not used yet, so just unmask > + * everything here. virtio-pci will do the right thing by > + * enabling/disabling irqfd. 
> + */ > + for (i = 0; i < s->dev.nvqs; i++) { > + vhost_virtqueue_mask(&s->dev, vdev, i, false); > + } > + > + return ret; > + > +err_guest_notifiers: > + k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > +err_host_notifiers: > + vhost_dev_disable_notifiers(&s->dev, vdev); > + return ret; > +} > + > +static void vhost_vdpa_net_handle_output(VirtIODevice *vdev, VirtQueue *vq) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + Error *local_err = NULL; > + int i, ret; > + > + if (!vdev->start_on_kick) { > + return; > + } > + > + if (s->dev.started) { > + return; > + } > + > + /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start > + * vhost here instead of waiting for .set_status(). > + */ > + ret = vhost_vdpa_net_start(vdev, &local_err); > + if (ret < 0) { > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > + return; > + } > + > + /* Kick right away to begin processing requests already in vring */ > + for (i = 0; i < s->dev.nvqs; i++) { > + VirtQueue *kick_vq = virtio_get_queue(vdev, i); > + > + if (!virtio_queue_get_desc_addr(vdev, i)) { > + continue; > + } > + event_notifier_set(virtio_queue_get_host_notifier(kick_vq)); > + } > +} > + > +static void vhost_vdpa_net_stop(VirtIODevice *vdev) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > + int ret; > + > + if (!s->started) { > + return; > + } > + s->started = false; > + > + if (!k->set_guest_notifiers) { > + return; > + } > + > + vhost_dev_stop(&s->dev, vdev); > + > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > + if (ret < 0) { > + error_report("vhost guest notifier cleanup failed: %d", ret); > + return; > + } > + > + vhost_dev_disable_notifiers(&s->dev, vdev); > +} > + > +static void vhost_vdpa_net_set_status(VirtIODevice *vdev, uint8_t status) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + bool should_start = virtio_device_started(vdev, status); > + Error *local_err = NULL; > + int ret; > + > + if (!vdev->vm_running) { > + should_start = false; > + } > + > + if (s->started == should_start) { > + return; > + } > + > + if (should_start) { > + ret = vhost_vdpa_net_start(vdev, &local_err); > + if (ret < 0) { > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > + } > + } else { > + vhost_vdpa_net_stop(vdev); > + } > +} > + > +static void vhost_vdpa_net_unrealize(VHostVdpaNet *s) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + int i; > + > + for (i = 0; i < s->queue_pairs * 2; i++) { > + virtio_delete_queue(s->virtqs[i]); > + } > + /* ctrl vq */ > + virtio_delete_queue(s->virtqs[i]); > + > + g_free(s->virtqs); > + virtio_cleanup(vdev); > +} > + > +static void vhost_vdpa_net_device_realize(DeviceState *dev, Error **errp) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + int i, ret; > + > + s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR); > + if (s->vdpa.device_fd == -1) { > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > + s->vdpa_dev, strerror(errno)); > + return; > + } > + > + virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, > + sizeof(struct virtio_net_config)); > + > + s->dev.nvqs = s->queue_pairs * 2 + 1; > + s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs); > + s->dev.vq_index = 0; > + s->dev.vq_index_end = s->dev.nvqs; > + s->dev.backend_features = 0; > + s->started = false; > + > + s->virtqs = g_new0(VirtQueue *, s->dev.nvqs); > + for (i = 0; i < s->dev.nvqs; 
i++) { > + s->virtqs[i] = virtio_add_queue(vdev, s->queue_size, > + vhost_vdpa_net_handle_output); > + } > + > + ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL); > + if (ret < 0) { > + error_setg(errp, "vhost-vdpa-net: vhost initialization failed: %s", > + strerror(-ret)); > + goto init_err; > + } > + > + ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->netcfg, > + sizeof(struct virtio_net_config), NULL); > + if (ret < 0) { > + error_setg(errp, "vhost-vdpa-net: get network config failed"); > + goto config_err; > + } > + > + return; > +config_err: > + vhost_dev_cleanup(&s->dev); > +init_err: > + vhost_vdpa_net_unrealize(s); > + close(s->vdpa.device_fd); > +} > + > +static void vhost_vdpa_net_device_unrealize(DeviceState *dev) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > + > + virtio_set_status(vdev, 0); > + vhost_dev_cleanup(&s->dev); > + vhost_vdpa_net_unrealize(s); > + close(s->vdpa.device_fd); > +} > + > +static const VMStateDescription vmstate_vhost_vdpa_net = { > + .name = "vhost-vdpa-net", > + .minimum_version_id = 1, > + .version_id = 1, > + .fields = (VMStateField[]) { > + VMSTATE_VIRTIO_DEVICE, > + VMSTATE_END_OF_LIST() > + }, > +}; > + Not a lot of state here compared to virtio net. I didn't look at the code deeply so I don't know - how is the rest of the state migrated? > +static void vhost_vdpa_net_instance_init(Object *obj) > +{ > + VHostVdpaNet *s = VHOST_VDPA_NET(obj); > + > + device_add_bootindex_property(obj, &s->bootindex, "bootindex", > + "/ethernet-phy@0,0", DEVICE(obj)); > +} > + > +static Property vhost_vdpa_net_properties[] = { > + DEFINE_PROP_STRING("vdpa-dev", VHostVdpaNet, vdpa_dev), > + DEFINE_PROP_UINT16("queue-pairs", VHostVdpaNet, queue_pairs, > + VHOST_VDPA_NET_AUTO_QUEUE_PAIRS), > + DEFINE_PROP_UINT32("queue-size", VHostVdpaNet, queue_size, > + VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE), > + DEFINE_PROP_END_OF_LIST(), > +}; > + > +static void vhost_vdpa_net_class_init(ObjectClass *klass, void *data) > +{ > + DeviceClass *dc = DEVICE_CLASS(klass); > + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); > + > + device_class_set_props(dc, vhost_vdpa_net_properties); > + dc->vmsd = &vmstate_vhost_vdpa_net; > + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); > + vdc->realize = vhost_vdpa_net_device_realize; > + vdc->unrealize = vhost_vdpa_net_device_unrealize; > + vdc->get_config = vhost_vdpa_net_get_config; > + vdc->set_config = vhost_vdpa_net_set_config; > + vdc->get_features = vhost_vdpa_net_get_features; > + vdc->set_status = vhost_vdpa_net_set_status; > +} > + > +static const TypeInfo vhost_vdpa_net_info = { > + .name = TYPE_VHOST_VDPA_NET, > + .parent = TYPE_VIRTIO_DEVICE, > + .instance_size = sizeof(VHostVdpaNet), > + .instance_init = vhost_vdpa_net_instance_init, > + .class_init = vhost_vdpa_net_class_init, > +}; > + > +static void virtio_register_types(void) > +{ > + type_register_static(&vhost_vdpa_net_info); > +} > + > +type_init(virtio_register_types) > diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig > index c144d42..50dba2e 100644 > --- a/hw/virtio/Kconfig > +++ b/hw/virtio/Kconfig > @@ -68,3 +68,8 @@ config VHOST_USER_RNG > bool > default y > depends on VIRTIO && VHOST_USER > + > +config VHOST_VDPA_NET > + bool > + default y if VIRTIO_PCI > + depends on VIRTIO && VHOST_VDPA && LINUX > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build > index 521f7d6..3089222 100644 > --- a/hw/virtio/meson.build > +++ b/hw/virtio/meson.build > @@ -34,6 +34,7 @@ virtio_pci_ss = 
ss.source_set() > virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c')) > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock-pci.c')) > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk-pci.c')) > +virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: files('vhost-vdpa-net-pci.c')) > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input-pci.c')) > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c')) > virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c')) > diff --git a/hw/virtio/vhost-vdpa-net-pci.c b/hw/virtio/vhost-vdpa-net-pci.c > new file mode 100644 > index 0000000..84199a8 > --- /dev/null > +++ b/hw/virtio/vhost-vdpa-net-pci.c > @@ -0,0 +1,118 @@ > +#include "qemu/osdep.h" > +#include "standard-headers/linux/virtio_pci.h" > +#include "hw/virtio/virtio.h" > +#include "hw/virtio/vhost-vdpa-net.h" > +#include "hw/pci/pci.h" > +#include "hw/qdev-properties.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > +#include "qemu/module.h" > +#include "virtio-pci.h" > +#include "qom/object.h" > +#include "net/vhost-vdpa.h" > + > +typedef struct VHostVdpaNetPCI VHostVdpaNetPCI; > + > +#define TYPE_VHOST_VDPA_NET_PCI "vhost-vdpa-net-pci-base" > +DECLARE_INSTANCE_CHECKER(VHostVdpaNetPCI, VHOST_VDPA_NET_PCI, > + TYPE_VHOST_VDPA_NET_PCI) > + > +struct VHostVdpaNetPCI { > + VirtIOPCIProxy parent_obj; > + VHostVdpaNet vdev; > +}; > + > +static Property vhost_vdpa_net_pci_properties[] = { > + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, > + DEV_NVECTORS_UNSPECIFIED), > + DEFINE_PROP_END_OF_LIST(), > +}; > + > +static int vhost_vdpa_net_get_queue_pairs(VHostVdpaNetPCI *dev, Error **errp) > +{ > + int device_fd, queue_pairs; > + int has_cvq; > + > + device_fd = qemu_open_old(dev->vdev.vdpa_dev, O_RDWR); > + if (device_fd == -1) { > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > + dev->vdev.vdpa_dev, strerror(errno)); > + return -1; > + } > + > + queue_pairs = vhost_vdpa_get_max_queue_pairs(device_fd, &has_cvq, errp); > + if (queue_pairs < 0) { > + error_setg(errp, "vhost-vdpa-net: get queue pairs failed: %s", > + strerror(errno)); > + goto out; > + } > + > + if (!has_cvq) { > + error_setg(errp, "vhost-vdpa-net: not support ctrl vq"); > + } > + > +out: > + close(device_fd); > + return queue_pairs; > +} > + > +static void vhost_vdpa_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) > +{ > + VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(vpci_dev); > + DeviceState *vdev = DEVICE(&dev->vdev); > + > + if (dev->vdev.queue_pairs == VHOST_VDPA_NET_AUTO_QUEUE_PAIRS) { > + dev->vdev.queue_pairs = vhost_vdpa_net_get_queue_pairs(dev, errp); > + if (*errp) { > + return; > + } > + } > + > + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { > + vpci_dev->nvectors = dev->vdev.queue_pairs * 2 + 1; > + } > + > + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); > +} > + > +static void vhost_vdpa_net_pci_class_init(ObjectClass *klass, void *data) > +{ > + DeviceClass *dc = DEVICE_CLASS(klass); > + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); > + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); > + > + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); > + device_class_set_props(dc, vhost_vdpa_net_pci_properties); > + k->realize = vhost_vdpa_net_pci_realize; > + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; > + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_NET; > + pcidev_k->revision = 
VIRTIO_PCI_ABI_VERSION; > + pcidev_k->class_id = PCI_CLASS_NETWORK_ETHERNET; > +} > + > +static void vhost_vdpa_net_pci_instance_init(Object *obj) > +{ > + VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(obj); > + > + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), > + TYPE_VHOST_VDPA_NET); > + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), > + "bootindex"); > +} > + > +static const VirtioPCIDeviceTypeInfo vhost_vdpa_net_pci_info = { > + .base_name = TYPE_VHOST_VDPA_NET_PCI, > + .generic_name = "vhost-vdpa-net-pci", > + .transitional_name = "vhost-vdpa-net-pci-transitional", > + .non_transitional_name = "vhost-vdpa-net-pci-non-transitional", > + .instance_size = sizeof(VHostVdpaNetPCI), > + .instance_init = vhost_vdpa_net_pci_instance_init, > + .class_init = vhost_vdpa_net_pci_class_init, > +}; > + > +static void vhost_vdpa_net_pci_register(void) > +{ > + virtio_pci_types_register(&vhost_vdpa_net_pci_info); > +} > + > +type_init(vhost_vdpa_net_pci_register) > diff --git a/include/hw/virtio/vhost-vdpa-net.h b/include/hw/virtio/vhost-vdpa-net.h > new file mode 100644 > index 0000000..63bf3a6 > --- /dev/null > +++ b/include/hw/virtio/vhost-vdpa-net.h > @@ -0,0 +1,31 @@ > +#ifndef VHOST_VDPA_NET_H > +#define VHOST_VDPA_NET_H > + > +#include "standard-headers/linux/virtio_blk.h" > +#include "hw/block/block.h" > +#include "chardev/char-fe.h" > +#include "hw/virtio/vhost.h" > +#include "hw/virtio/vhost-vdpa.h" > +#include "hw/virtio/virtio-net.h" > +#include "qom/object.h" > + > +#define TYPE_VHOST_VDPA_NET "vhost-vdpa-net" > +OBJECT_DECLARE_SIMPLE_TYPE(VHostVdpaNet, VHOST_VDPA_NET) > + > +struct VHostVdpaNet { > + VirtIODevice parent_obj; > + int32_t bootindex; > + struct virtio_net_config netcfg; > + uint16_t queue_pairs; > + uint32_t queue_size; > + struct vhost_dev dev; > + VirtQueue **virtqs; > + struct vhost_vdpa vdpa; > + char *vdpa_dev; > + bool started; > +}; > + > +#define VHOST_VDPA_NET_AUTO_QUEUE_PAIRS UINT16_MAX > +#define VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE 256 > + > +#endif > diff --git a/include/net/vhost-vdpa.h b/include/net/vhost-vdpa.h > index b81f9a6..f029972 100644 > --- a/include/net/vhost-vdpa.h > +++ b/include/net/vhost-vdpa.h > @@ -18,4 +18,6 @@ struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState *nc); > > extern const int vdpa_feature_bits[]; > > +int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp); > + > #endif /* VHOST_VDPA_H */ > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c > index 25dd6dd..8ee6ba5 100644 > --- a/net/vhost-vdpa.c > +++ b/net/vhost-vdpa.c > @@ -219,7 +219,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer, > return nc; > } > > -static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) > +int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) > { > unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); > g_autofree struct vhost_vdpa_config *config = NULL; > -- > 1.8.3.1
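[Regarding the question above about how the rest of the state is migrated: in this version only the generic virtio state travels. As a rough sketch of one possible direction, an assumption rather than anything in the patch, device-specific fields could be attached through the VirtioDeviceClass vmsd hook, the same mechanism the stock virtio-net device uses for its extra state; serializing the raw config blob like this ignores endianness and is only meant to show the shape of it.]

static const VMStateDescription vmstate_vhost_vdpa_net_device = {
    .name = "vhost-vdpa-net-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        /* Cached config space and queue topology; whether this is enough
         * for migration between offloading cards is the open question. */
        VMSTATE_UINT16(queue_pairs, VHostVdpaNet),
        VMSTATE_BUFFER_UNSAFE(netcfg, VHostVdpaNet, 0,
                              sizeof(struct virtio_net_config)),
        VMSTATE_END_OF_LIST()
    },
};

/* hooked up in class_init next to dc->vmsd:
 *     vdc->vmsd = &vmstate_vhost_vdpa_net_device;
 */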
On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > From: Longpeng <longpeng2@huawei.com> > > Hi guys, > > This patch introduces vhost-vdpa-net device, which is inspired > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > I've tested this patch on Huawei's offload card: > ./x86_64-softmmu/qemu-system-x86_64 \ > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > For virtio hardware offloading, the most important requirement for us > is to support live migration between offloading cards from different > vendors, the combination of netdev and virtio-net seems too heavy, we > prefer a lightweight way. > > Maybe we could support both in the future ? Such as: > > * Lightweight > Net: vhost-vdpa-net > Storage: vhost-vdpa-blk > > * Heavy but more powerful > Net: netdev + virtio-net + vhost-vdpa > Storage: bdrv + virtio-blk + vhost-vdpa > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html Stefano presented a plan for vdpa-blk at KVM Forum 2021: https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat It's closer to today's virtio-net + vhost-net approach than the vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as an offload feature rather than a completely separate code path that needs to be maintained and tested. That way QEMU's block layer features and live migration work with vDPA devices and re-use the virtio-blk code. The key functionality that has not been implemented yet is a "fast path" mechanism that allows the QEMU virtio-blk device's virtqueue to be offloaded to vDPA. The unified vdpa-blk architecture should deliver the same performance as the vhost-vdpa-blk device you mentioned but with more features, so I wonder what aspects of the vhost-vdpa-blk idea are important to you? QEMU already has vhost-user-blk, which takes a similar approach as the vhost-vdpa-blk device you are proposing. I'm not against the vhost-vdpa-blk approach in priciple, but would like to understand your requirements and see if there is a way to collaborate on one vdpa-blk implementation instead of dividing our efforts between two. Stefan
On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: >> From: Longpeng <longpeng2@huawei.com> >> >> Hi guys, >> >> This patch introduces vhost-vdpa-net device, which is inspired >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. >> >> I've tested this patch on Huawei's offload card: >> ./x86_64-softmmu/qemu-system-x86_64 \ >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 >> >> For virtio hardware offloading, the most important requirement for us >> is to support live migration between offloading cards from different >> vendors, the combination of netdev and virtio-net seems too heavy, we >> prefer a lightweight way. >> >> Maybe we could support both in the future ? Such as: >> >> * Lightweight >> Net: vhost-vdpa-net >> Storage: vhost-vdpa-blk >> >> * Heavy but more powerful >> Net: netdev + virtio-net + vhost-vdpa >> Storage: bdrv + virtio-blk + vhost-vdpa >> >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > >Stefano presented a plan for vdpa-blk at KVM Forum 2021: >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat > >It's closer to today's virtio-net + vhost-net approach than the >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as >an offload feature rather than a completely separate code path that >needs to be maintained and tested. That way QEMU's block layer features >and live migration work with vDPA devices and re-use the virtio-blk >code. The key functionality that has not been implemented yet is a "fast >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be >offloaded to vDPA. > >The unified vdpa-blk architecture should deliver the same performance >as the vhost-vdpa-blk device you mentioned but with more features, so I >wonder what aspects of the vhost-vdpa-blk idea are important to you? > >QEMU already has vhost-user-blk, which takes a similar approach as the >vhost-vdpa-blk device you are proposing. I'm not against the >vhost-vdpa-blk approach in priciple, but would like to understand your >requirements and see if there is a way to collaborate on one vdpa-blk >implementation instead of dividing our efforts between two. Waiting for the aspects that Stefan asked, I add some details about the plan for vdpa-blk. Currently I'm working on the in-kernel software device. In the next months I hope to start working on the QEMU part. Anyway that part could go in parallel with the in-kernel device, so if you are interested we can collaborate. Having only the unified vdpa-blk architecture would allow us to simplify the management layers and avoid duplicate code, but it takes more time to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is urgent, I could understand the need to add vhost-vdpa-blk now. Let me know if you want more details about the unified vdpa-blk architecture. Thanks, Stefano
> -----Original Message----- > From: Michael S. Tsirkin [mailto:mst@redhat.com] > Sent: Thursday, December 9, 2021 3:05 AM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: jasowang@redhat.com; parav@nvidia.com; xieyongji@bytedance.com; > stefanha@redhat.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > From: Longpeng <longpeng2@huawei.com> > > > > Hi guys, > > > > This patch introduces vhost-vdpa-net device, which is inspired > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > I've tested this patch on Huawei's offload card: > > ./x86_64-softmmu/qemu-system-x86_64 \ > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > For virtio hardware offloading, the most important requirement for us > > is to support live migration between offloading cards from different > > vendors, the combination of netdev and virtio-net seems too heavy, we > > prefer a lightweight way. > > Did not look at the patch in depth yet. > Is this already supported with this patch? Or is that just the plan? > With this patch, the data plane can work, I've done the test based on the Huawei's offloading card. But the live migration is not support yet. > > Maybe we could support both in the future ? Such as: > > > > * Lightweight > > Net: vhost-vdpa-net > > Storage: vhost-vdpa-blk > > > > * Heavy but more powerful > > Net: netdev + virtio-net + vhost-vdpa > > Storage: bdrv + virtio-blk + vhost-vdpa > > I'd like to better understand what is in and out of scope for > this device. Which features would be "more powerful" and belong > in virtio-net, and which in vhost-vdpa-net? > It's no doubt that the combination of netdev + vrtio-net + vhost-vdpa could provides lots of benefits (such as Jason listed in his comments) , it's more generic. However, vhost-vdpa-net is only aiming at the virtio hardware offloading case, besides the data plane passthrough, migrate between offloading cards from different vendors is our goal. Some features (e.g. transparent failover, migrate between different types of vhost backends) maybe won't be used in such specific case. 
> > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > > --- > > hw/net/meson.build | 1 + > > hw/net/vhost-vdpa-net.c | 338 > +++++++++++++++++++++++++++++++++++++ > > hw/virtio/Kconfig | 5 + > > hw/virtio/meson.build | 1 + > > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ > > include/hw/virtio/vhost-vdpa-net.h | 31 ++++ > > include/net/vhost-vdpa.h | 2 + > > net/vhost-vdpa.c | 2 +- > > 8 files changed, 497 insertions(+), 1 deletion(-) > > create mode 100644 hw/net/vhost-vdpa-net.c > > create mode 100644 hw/virtio/vhost-vdpa-net-pci.c > > create mode 100644 include/hw/virtio/vhost-vdpa-net.h > > > > diff --git a/hw/net/meson.build b/hw/net/meson.build > > index bdf71f1..139ebc4 100644 > > --- a/hw/net/meson.build > > +++ b/hw/net/meson.build > > @@ -44,6 +44,7 @@ specific_ss.add(when: 'CONFIG_XILINX_ETHLITE', if_true: > files('xilinx_ethlite.c' > > > > softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('net_rx_pkt.c')) > > specific_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net.c')) > > +specific_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: > files('vhost-vdpa-net.c')) > > > > softmmu_ss.add(when: ['CONFIG_VIRTIO_NET', 'CONFIG_VHOST_NET'], if_true: > files('vhost_net.c'), if_false: files('vhost_net-stub.c')) > > softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost_net-stub.c')) > > diff --git a/hw/net/vhost-vdpa-net.c b/hw/net/vhost-vdpa-net.c > > new file mode 100644 > > index 0000000..48b99f9 > > --- /dev/null > > +++ b/hw/net/vhost-vdpa-net.c > > @@ -0,0 +1,338 @@ > > +#include "qemu/osdep.h" > > +#include "qapi/error.h" > > +#include "qemu/error-report.h" > > +#include "qemu/cutils.h" > > +#include "hw/qdev-core.h" > > +#include "hw/qdev-properties.h" > > +#include "hw/qdev-properties-system.h" > > +#include "hw/virtio/vhost.h" > > +#include "hw/virtio/vhost-vdpa-net.h" > > +#include "hw/virtio/virtio.h" > > +#include "hw/virtio/virtio-bus.h" > > +#include "hw/virtio/virtio-access.h" > > +#include "sysemu/sysemu.h" > > +#include "sysemu/runstate.h" > > +#include "net/vhost-vdpa.h" > > + > > +static void vhost_vdpa_net_get_config(VirtIODevice *vdev, uint8_t *config) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + memcpy(config, &s->netcfg, sizeof(struct virtio_net_config)); > > +} > > + > > +static void vhost_vdpa_net_set_config(VirtIODevice *vdev, const uint8_t > *config) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + struct virtio_net_config *netcfg = (struct virtio_net_config *)config; > > + int ret; > > + > > + ret = vhost_dev_set_config(&s->dev, (uint8_t *)netcfg, 0, > sizeof(*netcfg), > > + VHOST_SET_CONFIG_TYPE_MASTER); > > + if (ret) { > > + error_report("set device config space failed"); > > + return; > > + } > > +} > > + > > +static uint64_t vhost_vdpa_net_get_features(VirtIODevice *vdev, > > + uint64_t features, > > + Error **errp) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + virtio_add_feature(&features, VIRTIO_NET_F_CSUM); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_CSUM); > > + virtio_add_feature(&features, VIRTIO_NET_F_MAC); > > + virtio_add_feature(&features, VIRTIO_NET_F_GSO); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO4); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ECN); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_UFO); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); > > + 
virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_UFO); > > + virtio_add_feature(&features, VIRTIO_NET_F_MRG_RXBUF); > > + virtio_add_feature(&features, VIRTIO_NET_F_STATUS); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VQ); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VLAN); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX_EXTRA); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_MAC_ADDR); > > + virtio_add_feature(&features, VIRTIO_NET_F_MQ); > > + > > + return vhost_get_features(&s->dev, vdpa_feature_bits, features); > > +} > > + > > +static int vhost_vdpa_net_start(VirtIODevice *vdev, Error **errp) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > > + int i, ret; > > + > > + if (!k->set_guest_notifiers) { > > + error_setg(errp, "binding does not support guest notifiers"); > > + return -ENOSYS; > > + } > > + > > + ret = vhost_dev_enable_notifiers(&s->dev, vdev); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error enabling host notifiers"); > > + return ret; > > + } > > + > > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error binding guest notifier"); > > + goto err_host_notifiers; > > + } > > + > > + s->dev.acked_features = vdev->guest_features; > > + > > + ret = vhost_dev_start(&s->dev, vdev); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error starting vhost"); > > + goto err_guest_notifiers; > > + } > > + s->started = true; > > + > > + /* guest_notifier_mask/pending not used yet, so just unmask > > + * everything here. virtio-pci will do the right thing by > > + * enabling/disabling irqfd. > > + */ > > + for (i = 0; i < s->dev.nvqs; i++) { > > + vhost_virtqueue_mask(&s->dev, vdev, i, false); > > + } > > + > > + return ret; > > + > > +err_guest_notifiers: > > + k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > > +err_host_notifiers: > > + vhost_dev_disable_notifiers(&s->dev, vdev); > > + return ret; > > +} > > + > > +static void vhost_vdpa_net_handle_output(VirtIODevice *vdev, VirtQueue *vq) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + Error *local_err = NULL; > > + int i, ret; > > + > > + if (!vdev->start_on_kick) { > > + return; > > + } > > + > > + if (s->dev.started) { > > + return; > > + } > > + > > + /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start > > + * vhost here instead of waiting for .set_status(). 
> > + */ > > + ret = vhost_vdpa_net_start(vdev, &local_err); > > + if (ret < 0) { > > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > > + return; > > + } > > + > > + /* Kick right away to begin processing requests already in vring */ > > + for (i = 0; i < s->dev.nvqs; i++) { > > + VirtQueue *kick_vq = virtio_get_queue(vdev, i); > > + > > + if (!virtio_queue_get_desc_addr(vdev, i)) { > > + continue; > > + } > > + event_notifier_set(virtio_queue_get_host_notifier(kick_vq)); > > + } > > +} > > + > > +static void vhost_vdpa_net_stop(VirtIODevice *vdev) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > > + int ret; > > + > > + if (!s->started) { > > + return; > > + } > > + s->started = false; > > + > > + if (!k->set_guest_notifiers) { > > + return; > > + } > > + > > + vhost_dev_stop(&s->dev, vdev); > > + > > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > > + if (ret < 0) { > > + error_report("vhost guest notifier cleanup failed: %d", ret); > > + return; > > + } > > + > > + vhost_dev_disable_notifiers(&s->dev, vdev); > > +} > > + > > +static void vhost_vdpa_net_set_status(VirtIODevice *vdev, uint8_t status) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + bool should_start = virtio_device_started(vdev, status); > > + Error *local_err = NULL; > > + int ret; > > + > > + if (!vdev->vm_running) { > > + should_start = false; > > + } > > + > > + if (s->started == should_start) { > > + return; > > + } > > + > > + if (should_start) { > > + ret = vhost_vdpa_net_start(vdev, &local_err); > > + if (ret < 0) { > > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > > + } > > + } else { > > + vhost_vdpa_net_stop(vdev); > > + } > > +} > > + > > +static void vhost_vdpa_net_unrealize(VHostVdpaNet *s) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + int i; > > + > > + for (i = 0; i < s->queue_pairs * 2; i++) { > > + virtio_delete_queue(s->virtqs[i]); > > + } > > + /* ctrl vq */ > > + virtio_delete_queue(s->virtqs[i]); > > + > > + g_free(s->virtqs); > > + virtio_cleanup(vdev); > > +} > > + > > +static void vhost_vdpa_net_device_realize(DeviceState *dev, Error **errp) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + int i, ret; > > + > > + s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR); > > + if (s->vdpa.device_fd == -1) { > > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > > + s->vdpa_dev, strerror(errno)); > > + return; > > + } > > + > > + virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, > > + sizeof(struct virtio_net_config)); > > + > > + s->dev.nvqs = s->queue_pairs * 2 + 1; > > + s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs); > > + s->dev.vq_index = 0; > > + s->dev.vq_index_end = s->dev.nvqs; > > + s->dev.backend_features = 0; > > + s->started = false; > > + > > + s->virtqs = g_new0(VirtQueue *, s->dev.nvqs); > > + for (i = 0; i < s->dev.nvqs; i++) { > > + s->virtqs[i] = virtio_add_queue(vdev, s->queue_size, > > + vhost_vdpa_net_handle_output); > > + } > > + > > + ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, > NULL); > > + if (ret < 0) { > > + error_setg(errp, "vhost-vdpa-net: vhost initialization failed: %s", > > + strerror(-ret)); > > + goto init_err; > > + } > > + > > + ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->netcfg, > > + sizeof(struct virtio_net_config), NULL); > > + if (ret < 0) { > > + 
error_setg(errp, "vhost-vdpa-net: get network config failed"); > > + goto config_err; > > + } > > + > > + return; > > +config_err: > > + vhost_dev_cleanup(&s->dev); > > +init_err: > > + vhost_vdpa_net_unrealize(s); > > + close(s->vdpa.device_fd); > > +} > > + > > +static void vhost_vdpa_net_device_unrealize(DeviceState *dev) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + virtio_set_status(vdev, 0); > > + vhost_dev_cleanup(&s->dev); > > + vhost_vdpa_net_unrealize(s); > > + close(s->vdpa.device_fd); > > +} > > + > > +static const VMStateDescription vmstate_vhost_vdpa_net = { > > + .name = "vhost-vdpa-net", > > + .minimum_version_id = 1, > > + .version_id = 1, > > + .fields = (VMStateField[]) { > > + VMSTATE_VIRTIO_DEVICE, > > + VMSTATE_END_OF_LIST() > > + }, > > +}; > > + > > Not a lot of state here compared to virtio net. > I didn't look at the code deeply so I don't know - > how is the rest of the state migrated? > > > > +static void vhost_vdpa_net_instance_init(Object *obj) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(obj); > > + > > + device_add_bootindex_property(obj, &s->bootindex, "bootindex", > > + "/ethernet-phy@0,0", DEVICE(obj)); > > +} > > + > > +static Property vhost_vdpa_net_properties[] = { > > + DEFINE_PROP_STRING("vdpa-dev", VHostVdpaNet, vdpa_dev), > > + DEFINE_PROP_UINT16("queue-pairs", VHostVdpaNet, queue_pairs, > > + VHOST_VDPA_NET_AUTO_QUEUE_PAIRS), > > + DEFINE_PROP_UINT32("queue-size", VHostVdpaNet, queue_size, > > + VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE), > > + DEFINE_PROP_END_OF_LIST(), > > +}; > > + > > +static void vhost_vdpa_net_class_init(ObjectClass *klass, void *data) > > +{ > > + DeviceClass *dc = DEVICE_CLASS(klass); > > + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); > > + > > + device_class_set_props(dc, vhost_vdpa_net_properties); > > + dc->vmsd = &vmstate_vhost_vdpa_net; > > + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); > > + vdc->realize = vhost_vdpa_net_device_realize; > > + vdc->unrealize = vhost_vdpa_net_device_unrealize; > > + vdc->get_config = vhost_vdpa_net_get_config; > > + vdc->set_config = vhost_vdpa_net_set_config; > > + vdc->get_features = vhost_vdpa_net_get_features; > > + vdc->set_status = vhost_vdpa_net_set_status; > > +} > > + > > +static const TypeInfo vhost_vdpa_net_info = { > > + .name = TYPE_VHOST_VDPA_NET, > > + .parent = TYPE_VIRTIO_DEVICE, > > + .instance_size = sizeof(VHostVdpaNet), > > + .instance_init = vhost_vdpa_net_instance_init, > > + .class_init = vhost_vdpa_net_class_init, > > +}; > > + > > +static void virtio_register_types(void) > > +{ > > + type_register_static(&vhost_vdpa_net_info); > > +} > > + > > +type_init(virtio_register_types) > > diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig > > index c144d42..50dba2e 100644 > > --- a/hw/virtio/Kconfig > > +++ b/hw/virtio/Kconfig > > @@ -68,3 +68,8 @@ config VHOST_USER_RNG > > bool > > default y > > depends on VIRTIO && VHOST_USER > > + > > +config VHOST_VDPA_NET > > + bool > > + default y if VIRTIO_PCI > > + depends on VIRTIO && VHOST_VDPA && LINUX > > diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build > > index 521f7d6..3089222 100644 > > --- a/hw/virtio/meson.build > > +++ b/hw/virtio/meson.build > > @@ -34,6 +34,7 @@ virtio_pci_ss = ss.source_set() > > virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: > files('vhost-vsock-pci.c')) > > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: > files('vhost-user-vsock-pci.c')) > > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', 
if_true: > files('vhost-user-blk-pci.c')) > > +virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: > files('vhost-vdpa-net-pci.c')) > > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: > files('vhost-user-input-pci.c')) > > virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: > files('vhost-user-scsi-pci.c')) > > virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: > files('vhost-scsi-pci.c')) > > diff --git a/hw/virtio/vhost-vdpa-net-pci.c > b/hw/virtio/vhost-vdpa-net-pci.c > > new file mode 100644 > > index 0000000..84199a8 > > --- /dev/null > > +++ b/hw/virtio/vhost-vdpa-net-pci.c > > @@ -0,0 +1,118 @@ > > +#include "qemu/osdep.h" > > +#include "standard-headers/linux/virtio_pci.h" > > +#include "hw/virtio/virtio.h" > > +#include "hw/virtio/vhost-vdpa-net.h" > > +#include "hw/pci/pci.h" > > +#include "hw/qdev-properties.h" > > +#include "qapi/error.h" > > +#include "qemu/error-report.h" > > +#include "qemu/module.h" > > +#include "virtio-pci.h" > > +#include "qom/object.h" > > +#include "net/vhost-vdpa.h" > > + > > +typedef struct VHostVdpaNetPCI VHostVdpaNetPCI; > > + > > +#define TYPE_VHOST_VDPA_NET_PCI "vhost-vdpa-net-pci-base" > > +DECLARE_INSTANCE_CHECKER(VHostVdpaNetPCI, VHOST_VDPA_NET_PCI, > > + TYPE_VHOST_VDPA_NET_PCI) > > + > > +struct VHostVdpaNetPCI { > > + VirtIOPCIProxy parent_obj; > > + VHostVdpaNet vdev; > > +}; > > + > > +static Property vhost_vdpa_net_pci_properties[] = { > > + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, > > + DEV_NVECTORS_UNSPECIFIED), > > + DEFINE_PROP_END_OF_LIST(), > > +}; > > + > > +static int vhost_vdpa_net_get_queue_pairs(VHostVdpaNetPCI *dev, Error > **errp) > > +{ > > + int device_fd, queue_pairs; > > + int has_cvq; > > + > > + device_fd = qemu_open_old(dev->vdev.vdpa_dev, O_RDWR); > > + if (device_fd == -1) { > > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > > + dev->vdev.vdpa_dev, strerror(errno)); > > + return -1; > > + } > > + > > + queue_pairs = vhost_vdpa_get_max_queue_pairs(device_fd, &has_cvq, > errp); > > + if (queue_pairs < 0) { > > + error_setg(errp, "vhost-vdpa-net: get queue pairs failed: %s", > > + strerror(errno)); > > + goto out; > > + } > > + > > + if (!has_cvq) { > > + error_setg(errp, "vhost-vdpa-net: not support ctrl vq"); > > + } > > + > > +out: > > + close(device_fd); > > + return queue_pairs; > > +} > > + > > +static void vhost_vdpa_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error > **errp) > > +{ > > + VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(vpci_dev); > > + DeviceState *vdev = DEVICE(&dev->vdev); > > + > > + if (dev->vdev.queue_pairs == VHOST_VDPA_NET_AUTO_QUEUE_PAIRS) { > > + dev->vdev.queue_pairs = vhost_vdpa_net_get_queue_pairs(dev, errp); > > + if (*errp) { > > + return; > > + } > > + } > > + > > + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { > > + vpci_dev->nvectors = dev->vdev.queue_pairs * 2 + 1; > > + } > > + > > + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); > > +} > > + > > +static void vhost_vdpa_net_pci_class_init(ObjectClass *klass, void *data) > > +{ > > + DeviceClass *dc = DEVICE_CLASS(klass); > > + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); > > + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); > > + > > + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); > > + device_class_set_props(dc, vhost_vdpa_net_pci_properties); > > + k->realize = vhost_vdpa_net_pci_realize; > > + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; > > + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_NET; > > + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; > > + 
pcidev_k->class_id = PCI_CLASS_NETWORK_ETHERNET; > > +} > > + > > +static void vhost_vdpa_net_pci_instance_init(Object *obj) > > +{ > > + VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(obj); > > + > > + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), > > + TYPE_VHOST_VDPA_NET); > > + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), > > + "bootindex"); > > +} > > + > > +static const VirtioPCIDeviceTypeInfo vhost_vdpa_net_pci_info = { > > + .base_name = TYPE_VHOST_VDPA_NET_PCI, > > + .generic_name = "vhost-vdpa-net-pci", > > + .transitional_name = "vhost-vdpa-net-pci-transitional", > > + .non_transitional_name = "vhost-vdpa-net-pci-non-transitional", > > + .instance_size = sizeof(VHostVdpaNetPCI), > > + .instance_init = vhost_vdpa_net_pci_instance_init, > > + .class_init = vhost_vdpa_net_pci_class_init, > > +}; > > + > > +static void vhost_vdpa_net_pci_register(void) > > +{ > > + virtio_pci_types_register(&vhost_vdpa_net_pci_info); > > +} > > + > > +type_init(vhost_vdpa_net_pci_register) > > diff --git a/include/hw/virtio/vhost-vdpa-net.h > b/include/hw/virtio/vhost-vdpa-net.h > > new file mode 100644 > > index 0000000..63bf3a6 > > --- /dev/null > > +++ b/include/hw/virtio/vhost-vdpa-net.h > > @@ -0,0 +1,31 @@ > > +#ifndef VHOST_VDPA_NET_H > > +#define VHOST_VDPA_NET_H > > + > > +#include "standard-headers/linux/virtio_blk.h" > > +#include "hw/block/block.h" > > +#include "chardev/char-fe.h" > > +#include "hw/virtio/vhost.h" > > +#include "hw/virtio/vhost-vdpa.h" > > +#include "hw/virtio/virtio-net.h" > > +#include "qom/object.h" > > + > > +#define TYPE_VHOST_VDPA_NET "vhost-vdpa-net" > > +OBJECT_DECLARE_SIMPLE_TYPE(VHostVdpaNet, VHOST_VDPA_NET) > > + > > +struct VHostVdpaNet { > > + VirtIODevice parent_obj; > > + int32_t bootindex; > > + struct virtio_net_config netcfg; > > + uint16_t queue_pairs; > > + uint32_t queue_size; > > + struct vhost_dev dev; > > + VirtQueue **virtqs; > > + struct vhost_vdpa vdpa; > > + char *vdpa_dev; > > + bool started; > > +}; > > + > > +#define VHOST_VDPA_NET_AUTO_QUEUE_PAIRS UINT16_MAX > > +#define VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE 256 > > + > > +#endif > > diff --git a/include/net/vhost-vdpa.h b/include/net/vhost-vdpa.h > > index b81f9a6..f029972 100644 > > --- a/include/net/vhost-vdpa.h > > +++ b/include/net/vhost-vdpa.h > > @@ -18,4 +18,6 @@ struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState > *nc); > > > > extern const int vdpa_feature_bits[]; > > > > +int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp); > > + > > #endif /* VHOST_VDPA_H */ > > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c > > index 25dd6dd..8ee6ba5 100644 > > --- a/net/vhost-vdpa.c > > +++ b/net/vhost-vdpa.c > > @@ -219,7 +219,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState > *peer, > > return nc; > > } > > > > -static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) > > +int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp) > > { > > unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); > > g_autofree struct vhost_vdpa_config *config = NULL; > > -- > > 1.8.3.1
> -----Original Message----- > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > Sent: Thursday, December 9, 2021 5:17 PM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > From: Longpeng <longpeng2@huawei.com> > > > > Hi guys, > > > > This patch introduces vhost-vdpa-net device, which is inspired > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > I've tested this patch on Huawei's offload card: > > ./x86_64-softmmu/qemu-system-x86_64 \ > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > For virtio hardware offloading, the most important requirement for us > > is to support live migration between offloading cards from different > > vendors, the combination of netdev and virtio-net seems too heavy, we > > prefer a lightweight way. > > > > Maybe we could support both in the future ? Such as: > > > > * Lightweight > > Net: vhost-vdpa-net > > Storage: vhost-vdpa-blk > > > > * Heavy but more powerful > > Net: netdev + virtio-net + vhost-vdpa > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > It's closer to today's virtio-net + vhost-net approach than the > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > an offload feature rather than a completely separate code path that > needs to be maintained and tested. That way QEMU's block layer features > and live migration work with vDPA devices and re-use the virtio-blk > code. The key functionality that has not been implemented yet is a "fast > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > offloaded to vDPA. > > The unified vdpa-blk architecture should deliver the same performance > as the vhost-vdpa-blk device you mentioned but with more features, so I > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > QEMU already has vhost-user-blk, which takes a similar approach as the > vhost-vdpa-blk device you are proposing. I'm not against the > vhost-vdpa-blk approach in priciple, but would like to understand your > requirements and see if there is a way to collaborate on one vdpa-blk > implementation instead of dividing our efforts between two. > We prefer a simple way in the virtio hardware offloading case, it could reduce our maintenance workload, we no need to maintain the virtio-net, netdev, virtio-blk, bdrv and ... any more. If we need to support other vdpa devices (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain the corresponding device emulation code? For the virtio hardware offloading case, we usually use the vfio-pci framework, it saves a lot of our maintenance work in QEMU, we don't need to touch the device types. 
Inspired by Jason, what we really prefer is a generic "vhost-vdpa-pci/mmio" device used instead of vfio-pci: it could provide the same performance as vfio-pci, but it makes it *possible* to support live migration between offloading cards from different vendors. > Stefan
> -----Original Message----- > From: Stefano Garzarella [mailto:sgarzare@redhat.com] > Sent: Thursday, December 9, 2021 11:55 PM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: Stefan Hajnoczi <stefanha@redhat.com>; jasowang@redhat.com; mst@redhat.com; > parav@nvidia.com; xieyongji@bytedance.com; Yechuan <yechuan@huawei.com>; > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: > >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > >> From: Longpeng <longpeng2@huawei.com> > >> > >> Hi guys, > >> > >> This patch introduces vhost-vdpa-net device, which is inspired > >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > >> > >> I've tested this patch on Huawei's offload card: > >> ./x86_64-softmmu/qemu-system-x86_64 \ > >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > >> > >> For virtio hardware offloading, the most important requirement for us > >> is to support live migration between offloading cards from different > >> vendors, the combination of netdev and virtio-net seems too heavy, we > >> prefer a lightweight way. > >> > >> Maybe we could support both in the future ? Such as: > >> > >> * Lightweight > >> Net: vhost-vdpa-net > >> Storage: vhost-vdpa-blk > >> > >> * Heavy but more powerful > >> Net: netdev + virtio-net + vhost-vdpa > >> Storage: bdrv + virtio-blk + vhost-vdpa > >> > >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > >Stefano presented a plan for vdpa-blk at KVM Forum 2021: > >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-so > ftware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > >It's closer to today's virtio-net + vhost-net approach than the > >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > >an offload feature rather than a completely separate code path that > >needs to be maintained and tested. That way QEMU's block layer features > >and live migration work with vDPA devices and re-use the virtio-blk > >code. The key functionality that has not been implemented yet is a "fast > >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > >offloaded to vDPA. > > > >The unified vdpa-blk architecture should deliver the same performance > >as the vhost-vdpa-blk device you mentioned but with more features, so I > >wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > >QEMU already has vhost-user-blk, which takes a similar approach as the > >vhost-vdpa-blk device you are proposing. I'm not against the > >vhost-vdpa-blk approach in priciple, but would like to understand your > >requirements and see if there is a way to collaborate on one vdpa-blk > >implementation instead of dividing our efforts between two. > > Waiting for the aspects that Stefan asked, I add some details about the > plan for vdpa-blk. > > Currently I'm working on the in-kernel software device. In the next > months I hope to start working on the QEMU part. Anyway that part could > go in parallel with the in-kernel device, so if you are interested we > can collaborate. > The work on QEMU part means supporting the vdpa in BlockDriver and virtio-blk? 
In fact, I wanted to support vdpa in the QEMU block layer before I sent this RFC, because having the net part use netdev + virtio-net while the storage part uses vhost-vdpa-blk (from Yongji) looks like a strange combination. But I found that enabling vdpa in the QEMU block layer would take more time, and some features (e.g. snapshot, IO throttling) of the QEMU block layer are not needed in our hardware offloading case, so I turned to developing "vhost-vdpa-net"; at least the combination of vhost-vdpa-net and vhost-vdpa-blk would be congruous. > Having only the unified vdpa-blk architecture would allow us to simplify > the management layers and avoid duplicate code, but it takes more time > to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is > urgent, I could understand the need to add vhost-vdpa-blk now. > I prefer a way that can quickly support vdpa devices (not only net and storage, but also other device types) in the hardware offloading case; it may be less universal, but it could be an alternative for some users. > Let me know if you want more details about the unified vdpa-blk > architecture. > > Thanks, > Stefano
> -----Original Message----- > From: Jason Wang [mailto:jasowang@redhat.com] > Sent: Wednesday, December 8, 2021 2:27 PM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: mst <mst@redhat.com>; Parav Pandit <parav@nvidia.com>; Yongji Xie > <xieyongji@bytedance.com>; Stefan Hajnoczi <stefanha@redhat.com>; Stefano > Garzarella <sgarzare@redhat.com>; Yechuan <yechuan@huawei.com>; Gonglei (Arei) > <arei.gonglei@huawei.com>; qemu-devel <qemu-devel@nongnu.org> > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Wed, Dec 8, 2021 at 1:20 PM Longpeng(Mike) <longpeng2@huawei.com> wrote: > > > > From: Longpeng <longpeng2@huawei.com> > > > > Hi guys, > > > > This patch introduces vhost-vdpa-net device, which is inspired > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > I've tested this patch on Huawei's offload card: > > ./x86_64-softmmu/qemu-system-x86_64 \ > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > For virtio hardware offloading, the most important requirement for us > > is to support live migration between offloading cards from different > > vendors, the combination of netdev and virtio-net seems too heavy, we > > prefer a lightweight way. > > Could you elaborate more on this? It's mainly the control path when > using with netdev, and it provides a lot of other benefits: > > - decouple the transport specific stuff out of the vhost abstraction, > mmio device is supported with 0 line of code > - migration compatibility, reuse the migration stream that is already > supported by Qemu virtio-net, this will allow migration among > different vhost backends. > - software mediation facility, not all the virtqueues are assigned to > guests directly. One example is the virtio-net cvq, qemu may want to > intercept and record the device state for migration. Reusing the > current virtio-net codes simplifies a lot of codes. > - transparent failover (in the future), the nic model can choose to > switch between vhost backends etc. > We want to use the vdpa framework instead of the vfio-pci framework in the virtio hardware offloading case, so maybe some of the benefits above are not needed in our case. But we need to migrate between different hardware, so I am not sure whether this approach would be harmful to the requirement. > > > > Maybe we could support both in the future ? > > For the net, we need to figure out the advantages of this approach > first. Note that we didn't have vhost-user-net-pci or vhost-pci in the > past. > Why didn't support vhost-user-net-pci in history ? Because its control path is much more complex than the block ? > For the block, I will leave Stefan and Stefano to comment. > > > Such as: > > > > * Lightweight > > Net: vhost-vdpa-net > > Storage: vhost-vdpa-blk > > > > * Heavy but more powerful > > Net: netdev + virtio-net + vhost-vdpa > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > > --- > > hw/net/meson.build | 1 + > > hw/net/vhost-vdpa-net.c | 338 > +++++++++++++++++++++++++++++++++++++ > > hw/virtio/Kconfig | 5 + > > hw/virtio/meson.build | 1 + > > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ > > I'd expect there's no device type specific code in this approach and > any kind of vDPA devices could be used with a general pci device. > > Any reason for having net specific types here? 
> No, just because there already has the proposal of vhost-vdpa-blk, so I developed the vhost-vdpa-net correspondingly. I pretty agree with your suggestion. If feasible, likes vfio-pci, we don't need to maintain the device type specific code in QEMU, what's more, it's possible to support the live migration of different virtio hardware. > > include/hw/virtio/vhost-vdpa-net.h | 31 ++++ > > include/net/vhost-vdpa.h | 2 + > > net/vhost-vdpa.c | 2 +- > > 8 files changed, 497 insertions(+), 1 deletion(-) > > create mode 100644 hw/net/vhost-vdpa-net.c > > create mode 100644 hw/virtio/vhost-vdpa-net-pci.c > > create mode 100644 include/hw/virtio/vhost-vdpa-net.h > > > > diff --git a/hw/net/meson.build b/hw/net/meson.build > > index bdf71f1..139ebc4 100644 > > --- a/hw/net/meson.build > > +++ b/hw/net/meson.build > > @@ -44,6 +44,7 @@ specific_ss.add(when: 'CONFIG_XILINX_ETHLITE', if_true: > files('xilinx_ethlite.c' > > > > softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('net_rx_pkt.c')) > > specific_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net.c')) > > +specific_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: > files('vhost-vdpa-net.c')) > > > > softmmu_ss.add(when: ['CONFIG_VIRTIO_NET', 'CONFIG_VHOST_NET'], if_true: > files('vhost_net.c'), if_false: files('vhost_net-stub.c')) > > softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost_net-stub.c')) > > diff --git a/hw/net/vhost-vdpa-net.c b/hw/net/vhost-vdpa-net.c > > new file mode 100644 > > index 0000000..48b99f9 > > --- /dev/null > > +++ b/hw/net/vhost-vdpa-net.c > > @@ -0,0 +1,338 @@ > > +#include "qemu/osdep.h" > > +#include "qapi/error.h" > > +#include "qemu/error-report.h" > > +#include "qemu/cutils.h" > > +#include "hw/qdev-core.h" > > +#include "hw/qdev-properties.h" > > +#include "hw/qdev-properties-system.h" > > +#include "hw/virtio/vhost.h" > > +#include "hw/virtio/vhost-vdpa-net.h" > > +#include "hw/virtio/virtio.h" > > +#include "hw/virtio/virtio-bus.h" > > +#include "hw/virtio/virtio-access.h" > > +#include "sysemu/sysemu.h" > > +#include "sysemu/runstate.h" > > +#include "net/vhost-vdpa.h" > > + > > +static void vhost_vdpa_net_get_config(VirtIODevice *vdev, uint8_t *config) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + memcpy(config, &s->netcfg, sizeof(struct virtio_net_config)); > > +} > > + > > +static void vhost_vdpa_net_set_config(VirtIODevice *vdev, const uint8_t > *config) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + struct virtio_net_config *netcfg = (struct virtio_net_config *)config; > > + int ret; > > + > > + ret = vhost_dev_set_config(&s->dev, (uint8_t *)netcfg, 0, > sizeof(*netcfg), > > + VHOST_SET_CONFIG_TYPE_MASTER); > > + if (ret) { > > + error_report("set device config space failed"); > > + return; > > + } > > +} > > + > > +static uint64_t vhost_vdpa_net_get_features(VirtIODevice *vdev, > > + uint64_t features, > > + Error **errp) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + virtio_add_feature(&features, VIRTIO_NET_F_CSUM); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_CSUM); > > + virtio_add_feature(&features, VIRTIO_NET_F_MAC); > > + virtio_add_feature(&features, VIRTIO_NET_F_GSO); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO4); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO6); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ECN); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_UFO); > > + virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE); > > + virtio_add_feature(&features, 
VIRTIO_NET_F_HOST_TSO4); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN); > > + virtio_add_feature(&features, VIRTIO_NET_F_HOST_UFO); > > + virtio_add_feature(&features, VIRTIO_NET_F_MRG_RXBUF); > > + virtio_add_feature(&features, VIRTIO_NET_F_STATUS); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VQ); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VLAN); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX_EXTRA); > > + virtio_add_feature(&features, VIRTIO_NET_F_CTRL_MAC_ADDR); > > + virtio_add_feature(&features, VIRTIO_NET_F_MQ); > > Any reason for those hand crafted features? > > > + > > + return vhost_get_features(&s->dev, vdpa_feature_bits, features); > > +} > > + > > +static int vhost_vdpa_net_start(VirtIODevice *vdev, Error **errp) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > > + int i, ret; > > + > > + if (!k->set_guest_notifiers) { > > + error_setg(errp, "binding does not support guest notifiers"); > > + return -ENOSYS; > > + } > > + > > + ret = vhost_dev_enable_notifiers(&s->dev, vdev); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error enabling host notifiers"); > > + return ret; > > + } > > + > > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error binding guest notifier"); > > + goto err_host_notifiers; > > + } > > + > > + s->dev.acked_features = vdev->guest_features; > > + > > + ret = vhost_dev_start(&s->dev, vdev); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Error starting vhost"); > > + goto err_guest_notifiers; > > + } > > + s->started = true; > > + > > + /* guest_notifier_mask/pending not used yet, so just unmask > > + * everything here. virtio-pci will do the right thing by > > + * enabling/disabling irqfd. > > + */ > > + for (i = 0; i < s->dev.nvqs; i++) { > > + vhost_virtqueue_mask(&s->dev, vdev, i, false); > > + } > > + > > + return ret; > > + > > +err_guest_notifiers: > > + k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > > +err_host_notifiers: > > + vhost_dev_disable_notifiers(&s->dev, vdev); > > + return ret; > > +} > > + > > +static void vhost_vdpa_net_handle_output(VirtIODevice *vdev, VirtQueue *vq) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + Error *local_err = NULL; > > + int i, ret; > > + > > + if (!vdev->start_on_kick) { > > + return; > > + } > > + > > + if (s->dev.started) { > > + return; > > + } > > + > > + /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start > > + * vhost here instead of waiting for .set_status(). 
> > + */ > > + ret = vhost_vdpa_net_start(vdev, &local_err); > > + if (ret < 0) { > > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > > + return; > > + } > > + > > + /* Kick right away to begin processing requests already in vring */ > > + for (i = 0; i < s->dev.nvqs; i++) { > > + VirtQueue *kick_vq = virtio_get_queue(vdev, i); > > + > > + if (!virtio_queue_get_desc_addr(vdev, i)) { > > + continue; > > + } > > + event_notifier_set(virtio_queue_get_host_notifier(kick_vq)); > > + } > > +} > > + > > +static void vhost_vdpa_net_stop(VirtIODevice *vdev) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > > + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > > + int ret; > > + > > + if (!s->started) { > > + return; > > + } > > + s->started = false; > > + > > + if (!k->set_guest_notifiers) { > > + return; > > + } > > + > > + vhost_dev_stop(&s->dev, vdev); > > + > > + ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > > + if (ret < 0) { > > + error_report("vhost guest notifier cleanup failed: %d", ret); > > + return; > > + } > > + > > + vhost_dev_disable_notifiers(&s->dev, vdev); > > +} > > + > > +static void vhost_vdpa_net_set_status(VirtIODevice *vdev, uint8_t status) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + bool should_start = virtio_device_started(vdev, status); > > + Error *local_err = NULL; > > + int ret; > > + > > + if (!vdev->vm_running) { > > + should_start = false; > > + } > > + > > + if (s->started == should_start) { > > + return; > > + } > > + > > + if (should_start) { > > + ret = vhost_vdpa_net_start(vdev, &local_err); > > + if (ret < 0) { > > + error_reportf_err(local_err, "vhost-vdpa-net: start failed: "); > > + } > > + } else { > > + vhost_vdpa_net_stop(vdev); > > + } > > +} > > + > > +static void vhost_vdpa_net_unrealize(VHostVdpaNet *s) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + int i; > > + > > + for (i = 0; i < s->queue_pairs * 2; i++) { > > + virtio_delete_queue(s->virtqs[i]); > > + } > > + /* ctrl vq */ > > + virtio_delete_queue(s->virtqs[i]); > > + > > + g_free(s->virtqs); > > + virtio_cleanup(vdev); > > +} > > + > > +static void vhost_vdpa_net_device_realize(DeviceState *dev, Error **errp) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + int i, ret; > > + > > + s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR); > > + if (s->vdpa.device_fd == -1) { > > + error_setg(errp, "vhost-vdpa-net: open %s failed: %s", > > + s->vdpa_dev, strerror(errno)); > > + return; > > + } > > + > > + virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, > > + sizeof(struct virtio_net_config)); > > + > > + s->dev.nvqs = s->queue_pairs * 2 + 1; > > + s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs); > > + s->dev.vq_index = 0; > > + s->dev.vq_index_end = s->dev.nvqs; > > + s->dev.backend_features = 0; > > + s->started = false; > > + > > + s->virtqs = g_new0(VirtQueue *, s->dev.nvqs); > > + for (i = 0; i < s->dev.nvqs; i++) { > > + s->virtqs[i] = virtio_add_queue(vdev, s->queue_size, > > + vhost_vdpa_net_handle_output); > > We should check whether MQ is negotiated since the index varies > depending on that. 
> > > + } > > + > > + ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, > NULL); > > + if (ret < 0) { > > + error_setg(errp, "vhost-vdpa-net: vhost initialization failed: %s", > > + strerror(-ret)); > > + goto init_err; > > + } > > + > > + ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->netcfg, > > + sizeof(struct virtio_net_config), NULL); > > + if (ret < 0) { > > + error_setg(errp, "vhost-vdpa-net: get network config failed"); > > + goto config_err; > > + } > > + > > + return; > > +config_err: > > + vhost_dev_cleanup(&s->dev); > > +init_err: > > + vhost_vdpa_net_unrealize(s); > > + close(s->vdpa.device_fd); > > +} > > + > > +static void vhost_vdpa_net_device_unrealize(DeviceState *dev) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(dev); > > + VHostVdpaNet *s = VHOST_VDPA_NET(vdev); > > + > > + virtio_set_status(vdev, 0); > > + vhost_dev_cleanup(&s->dev); > > + vhost_vdpa_net_unrealize(s); > > + close(s->vdpa.device_fd); > > +} > > + > > +static const VMStateDescription vmstate_vhost_vdpa_net = { > > + .name = "vhost-vdpa-net", > > + .minimum_version_id = 1, > > + .version_id = 1, > > + .fields = (VMStateField[]) { > > + VMSTATE_VIRTIO_DEVICE, > > + VMSTATE_END_OF_LIST() > > + }, > > +}; > > + > > +static void vhost_vdpa_net_instance_init(Object *obj) > > +{ > > + VHostVdpaNet *s = VHOST_VDPA_NET(obj); > > + > > + device_add_bootindex_property(obj, &s->bootindex, "bootindex", > > + "/ethernet-phy@0,0", DEVICE(obj)); > > +} > > + > > +static Property vhost_vdpa_net_properties[] = { > > + DEFINE_PROP_STRING("vdpa-dev", VHostVdpaNet, vdpa_dev), > > + DEFINE_PROP_UINT16("queue-pairs", VHostVdpaNet, queue_pairs, > > + VHOST_VDPA_NET_AUTO_QUEUE_PAIRS), > > Any reason that we need the queue pairs parameter? Note that it is > expected to be provisioned by the netlink for the management device. > > Thanks
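As a side note on the VIRTIO_NET_F_MQ comment above: per the virtio spec the control virtqueue follows the 2*N receive/transmit queues, so its index depends on whether MQ was negotiated. A minimal illustrative sketch (not part of the patch; the helper name is made up):

#include <stdbool.h>

/* Illustrative only: receiveq/transmitq pairs occupy indexes 0..2*N-1 and
 * the control virtqueue follows them, so the cvq sits at index 2 when
 * VIRTIO_NET_F_MQ is not negotiated and at 2*N when it is.
 */
static unsigned int net_ctrl_vq_index(bool mq_negotiated,
                                      unsigned int max_queue_pairs)
{
    unsigned int pairs = mq_negotiated ? max_queue_pairs : 1;

    return 2 * pairs;
}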
On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > -----Original Message----- > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > Sent: Thursday, December 9, 2021 5:17 PM > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > <longpeng2@huawei.com> > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > From: Longpeng <longpeng2@huawei.com> > > > > > > Hi guys, > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > I've tested this patch on Huawei's offload card: > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > For virtio hardware offloading, the most important requirement for us > > > is to support live migration between offloading cards from different > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > prefer a lightweight way. > > > > > > Maybe we could support both in the future ? Such as: > > > > > > * Lightweight > > > Net: vhost-vdpa-net > > > Storage: vhost-vdpa-blk > > > > > > * Heavy but more powerful > > > Net: netdev + virtio-net + vhost-vdpa > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > It's closer to today's virtio-net + vhost-net approach than the > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > an offload feature rather than a completely separate code path that > > needs to be maintained and tested. That way QEMU's block layer features > > and live migration work with vDPA devices and re-use the virtio-blk > > code. The key functionality that has not been implemented yet is a "fast > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > offloaded to vDPA. > > > > The unified vdpa-blk architecture should deliver the same performance > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > vhost-vdpa-blk device you are proposing. I'm not against the > > vhost-vdpa-blk approach in priciple, but would like to understand your > > requirements and see if there is a way to collaborate on one vdpa-blk > > implementation instead of dividing our efforts between two. > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > our maintenance workload, we no need to maintain the virtio-net, netdev, > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > the corresponding device emulation code? 
> > For the virtio hardware offloading case, we usually use the vfio-pci framework, > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > *possible* to support live migrate between offloading cards from different vendors. OK, so the features you are dropping would be migration between a vdpa, vhost and virtio backends. I think, given vhost-vdpa-blk, this seems fair enough... What do others think? > > Stefan
On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > -----Original Message----- > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > Sent: Thursday, December 9, 2021 5:17 PM > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > <longpeng2@huawei.com> > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > Hi guys, > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > I've tested this patch on Huawei's offload card: > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > is to support live migration between offloading cards from different > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > prefer a lightweight way. > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > * Lightweight > > > > Net: vhost-vdpa-net > > > > Storage: vhost-vdpa-blk > > > > > > > > * Heavy but more powerful > > > > Net: netdev + virtio-net + vhost-vdpa > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > an offload feature rather than a completely separate code path that > > > needs to be maintained and tested. That way QEMU's block layer features > > > and live migration work with vDPA devices and re-use the virtio-blk > > > code. The key functionality that has not been implemented yet is a "fast > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > offloaded to vDPA. > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > implementation instead of dividing our efforts between two. > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > virtio-blk, bdrv and ... any more. 
If we need to support other vdpa devices > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > the corresponding device emulation code? > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > *possible* to support live migrate between offloading cards from different vendors. > > OK, so the features you are dropping would be migration between > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > fair enough... What do others think? I think it should be fine, and it would be even better to make it not specific to device type. Thanks > > > > Stefan >
On Sat, Dec 11, 2021 at 1:23 PM Longpeng (Mike, Cloud Infrastructure Service Product Dept.) <longpeng2@huawei.com> wrote: > > > > > -----Original Message----- > > From: Jason Wang [mailto:jasowang@redhat.com] > > Sent: Wednesday, December 8, 2021 2:27 PM > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > <longpeng2@huawei.com> > > Cc: mst <mst@redhat.com>; Parav Pandit <parav@nvidia.com>; Yongji Xie > > <xieyongji@bytedance.com>; Stefan Hajnoczi <stefanha@redhat.com>; Stefano > > Garzarella <sgarzare@redhat.com>; Yechuan <yechuan@huawei.com>; Gonglei (Arei) > > <arei.gonglei@huawei.com>; qemu-devel <qemu-devel@nongnu.org> > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > On Wed, Dec 8, 2021 at 1:20 PM Longpeng(Mike) <longpeng2@huawei.com> wrote: > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > Hi guys, > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > I've tested this patch on Huawei's offload card: > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > For virtio hardware offloading, the most important requirement for us > > > is to support live migration between offloading cards from different > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > prefer a lightweight way. > > > > Could you elaborate more on this? It's mainly the control path when > > using with netdev, and it provides a lot of other benefits: > > > > - decouple the transport specific stuff out of the vhost abstraction, > > mmio device is supported with 0 line of code > > - migration compatibility, reuse the migration stream that is already > > supported by Qemu virtio-net, this will allow migration among > > different vhost backends. > > - software mediation facility, not all the virtqueues are assigned to > > guests directly. One example is the virtio-net cvq, qemu may want to > > intercept and record the device state for migration. Reusing the > > current virtio-net codes simplifies a lot of codes. > > - transparent failover (in the future), the nic model can choose to > > switch between vhost backends etc. > > > > We want to use the vdpa framework instead of the vfio-pci framework in > the virtio hardware offloading case, so maybe some of the benefits above > are not needed in our case. But we need to migrate between different > hardware, so I am not sure whether this approach would be harmful to the > requirement. It should not, but it needs to build the migration facility for the net from the ground. And if we want to have a general migration solution instead of a vendor specific one, it may duplicate some logic of existing virtio-net implementation. The CVQ migration is an example, we don't provide a dedicated migration facility in the spec. So a more general way for live migration currently is using the shadow virtqueue which is what Eugenio is doing. So thanks to the design where we tried to do all the work in the vhost layer, this might not be a problem for this approach. But talking about the CVQ migration, things will be interesting. Qemu needs to decode the cvq commands in the middle thus it can record the device state. For having a general migration solution, vhost-vdpa-pci needs to do this as well. Virtio-net has the full CVQ logic so it's much easier, for vhost-vdpa-pci, it needs to duplicate them all in its own logic. 
> > > > > > > Maybe we could support both in the future ? > > > > For the net, we need to figure out the advantages of this approach > > first. Note that we didn't have vhost-user-net-pci or vhost-pci in the > > past. > > > > Why didn't support vhost-user-net-pci in history ? Because its control > path is much more complex than the block ? I don't know, it may be simply because no one tries to do that. > > > For the block, I will leave Stefan and Stefano to comment. > > > > > Such as: > > > > > > * Lightweight > > > Net: vhost-vdpa-net > > > Storage: vhost-vdpa-blk > > > > > > * Heavy but more powerful > > > Net: netdev + virtio-net + vhost-vdpa > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > > > --- > > > hw/net/meson.build | 1 + > > > hw/net/vhost-vdpa-net.c | 338 > > +++++++++++++++++++++++++++++++++++++ > > > hw/virtio/Kconfig | 5 + > > > hw/virtio/meson.build | 1 + > > > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ > > > > I'd expect there's no device type specific code in this approach and > > any kind of vDPA devices could be used with a general pci device. > > > > Any reason for having net specific types here? > > > > No, just because there already has the proposal of vhost-vdpa-blk, so I > developed the vhost-vdpa-net correspondingly. > > I pretty agree with your suggestion. If feasible, likes vfio-pci, we don't > need to maintain the device type specific code in QEMU, what's more, it's > possible to support the live migration of different virtio hardware. > See above, we probably need type specific migration code. [...] Thanks
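To make the CVQ point above concrete: a device model that wants a general migration story has to parse control-virtqueue requests so it can mirror the resulting state into the migration stream, which is the logic virtio-net already contains. A rough sketch using only the standard virtio-net control header from the Linux uapi (the helper and the shadow state are hypothetical, not from the patch):

#include <linux/virtio_net.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: inspect one control-vq request and mirror a MAC
 * change into shadow state that could later be migrated.
 */
static int shadow_record_cvq(const void *req, size_t len,
                             uint8_t shadow_mac[ETH_ALEN])
{
    const struct virtio_net_ctrl_hdr *hdr = req;

    if (len < sizeof(*hdr)) {
        return -1;
    }

    if (hdr->class == VIRTIO_NET_CTRL_MAC &&
        hdr->cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET &&
        len >= sizeof(*hdr) + ETH_ALEN) {
        /* Remember the MAC the guest programmed through the cvq. */
        memcpy(shadow_mac, (const uint8_t *)req + sizeof(*hdr), ETH_ALEN);
    }

    return 0;
}

A vhost-vdpa-pci device that bypasses virtio-net would have to duplicate this kind of decoding for every control command it wants to migrate.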
On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: >On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: >> >> On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: >> > >> > >> > > -----Original Message----- >> > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] >> > > Sent: Thursday, December 9, 2021 5:17 PM >> > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) >> > > <longpeng2@huawei.com> >> > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; >> > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; >> > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org >> > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support >> > > >> > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: >> > > > From: Longpeng <longpeng2@huawei.com> >> > > > >> > > > Hi guys, >> > > > >> > > > This patch introduces vhost-vdpa-net device, which is inspired >> > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. >> > > > >> > > > I've tested this patch on Huawei's offload card: >> > > > ./x86_64-softmmu/qemu-system-x86_64 \ >> > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 >> > > > >> > > > For virtio hardware offloading, the most important requirement for us >> > > > is to support live migration between offloading cards from different >> > > > vendors, the combination of netdev and virtio-net seems too heavy, we >> > > > prefer a lightweight way. >> > > > >> > > > Maybe we could support both in the future ? Such as: >> > > > >> > > > * Lightweight >> > > > Net: vhost-vdpa-net >> > > > Storage: vhost-vdpa-blk >> > > > >> > > > * Heavy but more powerful >> > > > Net: netdev + virtio-net + vhost-vdpa >> > > > Storage: bdrv + virtio-blk + vhost-vdpa >> > > > >> > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html >> > > >> > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: >> > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof >> > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat >> > > >> > > It's closer to today's virtio-net + vhost-net approach than the >> > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as >> > > an offload feature rather than a completely separate code path that >> > > needs to be maintained and tested. That way QEMU's block layer features >> > > and live migration work with vDPA devices and re-use the virtio-blk >> > > code. The key functionality that has not been implemented yet is a "fast >> > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be >> > > offloaded to vDPA. >> > > >> > > The unified vdpa-blk architecture should deliver the same performance >> > > as the vhost-vdpa-blk device you mentioned but with more features, so I >> > > wonder what aspects of the vhost-vdpa-blk idea are important to you? >> > > >> > > QEMU already has vhost-user-blk, which takes a similar approach as the >> > > vhost-vdpa-blk device you are proposing. I'm not against the >> > > vhost-vdpa-blk approach in priciple, but would like to understand your >> > > requirements and see if there is a way to collaborate on one vdpa-blk >> > > implementation instead of dividing our efforts between two. 
>> > > >> > >> > We prefer a simple way in the virtio hardware offloading case, it could reduce >> > our maintenance workload, we no need to maintain the virtio-net, netdev, >> > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices >> > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain >> > the corresponding device emulation code? >> > >> > For the virtio hardware offloading case, we usually use the >> > vfio-pci framework, >> > it saves a lot of our maintenance work in QEMU, we don't need to touch the device >> > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to >> > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's >> > *possible* to support live migrate between offloading cards from different vendors. >> >> OK, so the features you are dropping would be migration between >> a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems >> fair enough... What do others think? > >I think it should be fine, and it would be even better to make it not >specific to device type. Yep, I agree with Jason. A generic vhost-vdpa device would be the best if the features are not needed. In this way we would have this generic device and then the specialized devices that will offer more features. Stefano
On Sat, Dec 11, 2021 at 04:11:04AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > >> -----Original Message----- >> From: Stefano Garzarella [mailto:sgarzare@redhat.com] >> Sent: Thursday, December 9, 2021 11:55 PM >> To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) >> <longpeng2@huawei.com> >> Cc: Stefan Hajnoczi <stefanha@redhat.com>; jasowang@redhat.com; mst@redhat.com; >> parav@nvidia.com; xieyongji@bytedance.com; Yechuan <yechuan@huawei.com>; >> Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org >> Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support >> >> On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: >> >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: >> >> From: Longpeng <longpeng2@huawei.com> >> >> >> >> Hi guys, >> >> >> >> This patch introduces vhost-vdpa-net device, which is inspired >> >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. >> >> >> >> I've tested this patch on Huawei's offload card: >> >> ./x86_64-softmmu/qemu-system-x86_64 \ >> >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 >> >> >> >> For virtio hardware offloading, the most important requirement for us >> >> is to support live migration between offloading cards from different >> >> vendors, the combination of netdev and virtio-net seems too heavy, we >> >> prefer a lightweight way. >> >> >> >> Maybe we could support both in the future ? Such as: >> >> >> >> * Lightweight >> >> Net: vhost-vdpa-net >> >> Storage: vhost-vdpa-blk >> >> >> >> * Heavy but more powerful >> >> Net: netdev + virtio-net + vhost-vdpa >> >> Storage: bdrv + virtio-blk + vhost-vdpa >> >> >> >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html >> > >> >Stefano presented a plan for vdpa-blk at KVM Forum 2021: >> >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-so >> ftware-offload-for-virtio-blk-stefano-garzarella-red-hat >> > >> >It's closer to today's virtio-net + vhost-net approach than the >> >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as >> >an offload feature rather than a completely separate code path that >> >needs to be maintained and tested. That way QEMU's block layer features >> >and live migration work with vDPA devices and re-use the virtio-blk >> >code. The key functionality that has not been implemented yet is a "fast >> >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be >> >offloaded to vDPA. >> > >> >The unified vdpa-blk architecture should deliver the same performance >> >as the vhost-vdpa-blk device you mentioned but with more features, so I >> >wonder what aspects of the vhost-vdpa-blk idea are important to you? >> > >> >QEMU already has vhost-user-blk, which takes a similar approach as the >> >vhost-vdpa-blk device you are proposing. I'm not against the >> >vhost-vdpa-blk approach in priciple, but would like to understand your >> >requirements and see if there is a way to collaborate on one vdpa-blk >> >implementation instead of dividing our efforts between two. >> >> Waiting for the aspects that Stefan asked, I add some details about the >> plan for vdpa-blk. >> >> Currently I'm working on the in-kernel software device. In the next >> months I hope to start working on the QEMU part. Anyway that part could >> go in parallel with the in-kernel device, so if you are interested we >> can collaborate. >> > >The work on QEMU part means supporting the vdpa in BlockDriver and virtio-blk? Yep. 
> >In fact, I wanted to support the vdpa in QEMU block layer before I sent this >RFC, because the net part uses netdev + virtio-net while the storage part uses >vhost-vdpa-blk (from Yongji) looks like a strange combination. > >But I found enable vdpa in QEMU block layer would take more time and some >features (e.g. snapshot, IO throttling) from the QEMU block layer are not needed >in our hardware offloading case, so I turned to develop the "vhost-vdpa-net", >maybe the combination of vhost-vdpa-net and vhost-vdpa-blk is congruous. Yes, I agree it takes more time, but it would be very flexible. Like Jason said, maybe for this use case it would be better to have a generic device, not type dependent, so we don't need to add vhost-vdpa-blk, vhost-vdpa-net, etc. > >> Having only the unified vdpa-blk architecture would allow us to simplify >> the management layers and avoid duplicate code, but it takes more time >> to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is >> urgent, I could understand the need to add vhost-vdpa-blk now. >> > >I prefer a way that can support vdpa devices (not only net and storage, but also >other device types) quickly in hardware offloading case, maybe it would decreases >the universalism, but it could be an alternative to some users. Yep, make sense. Stefano
On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > -----Original Message----- > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > <longpeng2@huawei.com> > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > Hi guys, > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > is to support live migration between offloading cards from different > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > prefer a lightweight way. > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > * Lightweight > > > > > Net: vhost-vdpa-net > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > * Heavy but more powerful > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > an offload feature rather than a completely separate code path that > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > code. The key functionality that has not been implemented yet is a "fast > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > offloaded to vDPA. > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > implementation instead of dividing our efforts between two. 
> > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > the corresponding device emulation code? > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > *possible* to support live migrate between offloading cards from different vendors. > > > > OK, so the features you are dropping would be migration between > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > fair enough... What do others think? > > I think it should be fine, and it would be even better to make it not > specific to device type. That's an interesting idea. A generic vDPA VirtIODevice could be exposed as --device vhost-vdpa-pci, [vhostfd=FD,| vhostpath=/dev/vhost-vdpa-N] (and for virtio-mmio and virtio-ccw too). I don't think this is possible yet because the vhost_vdpa ioctls are missing some introspection functionality. Here is what I found:

- Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID
- Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES
- Configuration space size: missing, need ioctl for ops->get_config_size()
- Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM
- Number of virtqueues: probe using VHOST_GET_VRING_BASE?

I think it's worth adding the missing introspection so that VMMs like QEMU can implement a generic vDPA device. Stefan
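As a concrete reference for the list above, a minimal userspace sketch of what a generic backend could already probe from a vhost-vdpa character device (the /dev/vhost-vdpa-0 path is assumed and error handling is trimmed); the config-space size and the number of virtqueues are exactly the pieces that cannot be queried yet:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
    uint32_t dev_id = 0;
    uint64_t features = 0, backend_features = 0;
    uint16_t max_vring_size = 0;
    int fd = open("/dev/vhost-vdpa-0", O_RDWR);   /* assumed device node */

    if (fd < 0) {
        return 1;
    }

    ioctl(fd, VHOST_VDPA_GET_DEVICE_ID, &dev_id);          /* virtio device type */
    ioctl(fd, VHOST_GET_FEATURES, &features);              /* virtio feature bits */
    ioctl(fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    ioctl(fd, VHOST_VDPA_GET_VRING_NUM, &max_vring_size);  /* max virtqueue size */

    printf("device id %u, max vring size %u\n",
           (unsigned)dev_id, (unsigned)max_vring_size);

    /* Still missing: an ioctl for the config space size and one for the
     * number of virtqueues, which is what blocks a fully generic device. */
    close(fd);
    return 0;
}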
On Sat, Dec 11, 2021 at 04:11:04AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > -----Original Message----- > > From: Stefano Garzarella [mailto:sgarzare@redhat.com] > > Sent: Thursday, December 9, 2021 11:55 PM > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > <longpeng2@huawei.com> > > Cc: Stefan Hajnoczi <stefanha@redhat.com>; jasowang@redhat.com; mst@redhat.com; > > parav@nvidia.com; xieyongji@bytedance.com; Yechuan <yechuan@huawei.com>; > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: > > >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > >> From: Longpeng <longpeng2@huawei.com> > > >> > > >> Hi guys, > > >> > > >> This patch introduces vhost-vdpa-net device, which is inspired > > >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > >> > > >> I've tested this patch on Huawei's offload card: > > >> ./x86_64-softmmu/qemu-system-x86_64 \ > > >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > >> > > >> For virtio hardware offloading, the most important requirement for us > > >> is to support live migration between offloading cards from different > > >> vendors, the combination of netdev and virtio-net seems too heavy, we > > >> prefer a lightweight way. > > >> > > >> Maybe we could support both in the future ? Such as: > > >> > > >> * Lightweight > > >> Net: vhost-vdpa-net > > >> Storage: vhost-vdpa-blk > > >> > > >> * Heavy but more powerful > > >> Net: netdev + virtio-net + vhost-vdpa > > >> Storage: bdrv + virtio-blk + vhost-vdpa > > >> > > >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > >Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-so > > ftware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > >It's closer to today's virtio-net + vhost-net approach than the > > >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > >an offload feature rather than a completely separate code path that > > >needs to be maintained and tested. That way QEMU's block layer features > > >and live migration work with vDPA devices and re-use the virtio-blk > > >code. The key functionality that has not been implemented yet is a "fast > > >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > >offloaded to vDPA. > > > > > >The unified vdpa-blk architecture should deliver the same performance > > >as the vhost-vdpa-blk device you mentioned but with more features, so I > > >wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > >QEMU already has vhost-user-blk, which takes a similar approach as the > > >vhost-vdpa-blk device you are proposing. I'm not against the > > >vhost-vdpa-blk approach in priciple, but would like to understand your > > >requirements and see if there is a way to collaborate on one vdpa-blk > > >implementation instead of dividing our efforts between two. > > > > Waiting for the aspects that Stefan asked, I add some details about the > > plan for vdpa-blk. > > > > Currently I'm working on the in-kernel software device. In the next > > months I hope to start working on the QEMU part. Anyway that part could > > go in parallel with the in-kernel device, so if you are interested we > > can collaborate. 
> The work on the QEMU part means supporting vDPA in BlockDriver and virtio-blk?
>
> In fact, I wanted to support vDPA in the QEMU block layer before I sent this
> RFC, because the net part using netdev + virtio-net while the storage part uses
> vhost-vdpa-blk (from Yongji) looks like a strange combination.
>
> But I found that enabling vDPA in the QEMU block layer would take more time, and
> some features (e.g. snapshot, I/O throttling) from the QEMU block layer are not
> needed in our hardware offloading case, so I turned to developing "vhost-vdpa-net";
> maybe the combination of vhost-vdpa-net and vhost-vdpa-blk would be more consistent.
>
> > Having only the unified vdpa-blk architecture would allow us to simplify
> > the management layers and avoid duplicate code, but it takes more time
> > to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is
> > urgent, I could understand the need to add vhost-vdpa-blk now.
>
> I prefer a way that can support vDPA devices (not only net and storage, but also
> other device types) quickly in the hardware offloading case; maybe it decreases
> the universality, but it could be an alternative for some users.

If QEMU already had --blockdev vdpa-blk, would you use that with
--device virtio-blk-pci or still want to implement a separate --device
vhost-vdpa-blk-pci device?

Stefan
> -----Original Message----- > From: Jason Wang [mailto:jasowang@redhat.com] > Sent: Monday, December 13, 2021 11:23 AM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: mst <mst@redhat.com>; Parav Pandit <parav@nvidia.com>; Yongji Xie > <xieyongji@bytedance.com>; Stefan Hajnoczi <stefanha@redhat.com>; Stefano > Garzarella <sgarzare@redhat.com>; Yechuan <yechuan@huawei.com>; Gonglei (Arei) > <arei.gonglei@huawei.com>; qemu-devel <qemu-devel@nongnu.org> > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Sat, Dec 11, 2021 at 1:23 PM Longpeng (Mike, Cloud Infrastructure > Service Product Dept.) <longpeng2@huawei.com> wrote: > > > > > > > > > -----Original Message----- > > > From: Jason Wang [mailto:jasowang@redhat.com] > > > Sent: Wednesday, December 8, 2021 2:27 PM > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > <longpeng2@huawei.com> > > > Cc: mst <mst@redhat.com>; Parav Pandit <parav@nvidia.com>; Yongji Xie > > > <xieyongji@bytedance.com>; Stefan Hajnoczi <stefanha@redhat.com>; Stefano > > > Garzarella <sgarzare@redhat.com>; Yechuan <yechuan@huawei.com>; Gonglei > (Arei) > > > <arei.gonglei@huawei.com>; qemu-devel <qemu-devel@nongnu.org> > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > On Wed, Dec 8, 2021 at 1:20 PM Longpeng(Mike) <longpeng2@huawei.com> wrote: > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > Hi guys, > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > I've tested this patch on Huawei's offload card: > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > is to support live migration between offloading cards from different > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > prefer a lightweight way. > > > > > > Could you elaborate more on this? It's mainly the control path when > > > using with netdev, and it provides a lot of other benefits: > > > > > > - decouple the transport specific stuff out of the vhost abstraction, > > > mmio device is supported with 0 line of code > > > - migration compatibility, reuse the migration stream that is already > > > supported by Qemu virtio-net, this will allow migration among > > > different vhost backends. > > > - software mediation facility, not all the virtqueues are assigned to > > > guests directly. One example is the virtio-net cvq, qemu may want to > > > intercept and record the device state for migration. Reusing the > > > current virtio-net codes simplifies a lot of codes. > > > - transparent failover (in the future), the nic model can choose to > > > switch between vhost backends etc. > > > > > > > We want to use the vdpa framework instead of the vfio-pci framework in > > the virtio hardware offloading case, so maybe some of the benefits above > > are not needed in our case. But we need to migrate between different > > hardware, so I am not sure whether this approach would be harmful to the > > requirement. > > It should not, but it needs to build the migration facility for the > net from the ground. And if we want to have a general migration > solution instead of a vendor specific one, it may duplicate some logic > of existing virtio-net implementation. 
The CVQ migration is an > example, we don't provide a dedicated migration facility in the spec. > So a more general way for live migration currently is using the shadow > virtqueue which is what Eugenio is doing. So thanks to the design > where we tried to do all the work in the vhost layer, this might not > be a problem for this approach. But talking about the CVQ migration, > things will be interesting. Qemu needs to decode the cvq commands in > the middle thus it can record the device state. For having a general > migration solution, vhost-vdpa-pci needs to do this as well. > Virtio-net has the full CVQ logic so it's much easier, for > vhost-vdpa-pci, it needs to duplicate them all in its own logic. > OK, thanks for your patient explanation. We will follow up the progress of live migration. > > > > > > > > > > Maybe we could support both in the future ? > > > > > > For the net, we need to figure out the advantages of this approach > > > first. Note that we didn't have vhost-user-net-pci or vhost-pci in the > > > past. > > > > > > > Why didn't support vhost-user-net-pci in history ? Because its control > > path is much more complex than the block ? > > I don't know, it may be simply because no one tries to do that. > > > > > > For the block, I will leave Stefan and Stefano to comment. > > > > > > > Such as: > > > > > > > > * Lightweight > > > > Net: vhost-vdpa-net > > > > Storage: vhost-vdpa-blk > > > > > > > > * Heavy but more powerful > > > > Net: netdev + virtio-net + vhost-vdpa > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> > > > > --- > > > > hw/net/meson.build | 1 + > > > > hw/net/vhost-vdpa-net.c | 338 > > > +++++++++++++++++++++++++++++++++++++ > > > > hw/virtio/Kconfig | 5 + > > > > hw/virtio/meson.build | 1 + > > > > hw/virtio/vhost-vdpa-net-pci.c | 118 +++++++++++++ > > > > > > I'd expect there's no device type specific code in this approach and > > > any kind of vDPA devices could be used with a general pci device. > > > > > > Any reason for having net specific types here? > > > > > > > No, just because there already has the proposal of vhost-vdpa-blk, so I > > developed the vhost-vdpa-net correspondingly. > > > > I pretty agree with your suggestion. If feasible, likes vfio-pci, we don't > > need to maintain the device type specific code in QEMU, what's more, it's > > possible to support the live migration of different virtio hardware. > > > > See above, we probably need type specific migration code. > > [...] > > Thanks
> -----Original Message----- > From: Qemu-devel [mailto:qemu-devel-bounces+longpeng2=huawei.com@nongnu.org] > On Behalf Of Stefan Hajnoczi > Sent: Monday, December 13, 2021 11:16 PM > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > <longpeng2@huawei.com> > Cc: mst@redhat.com; jasowang@redhat.com; qemu-devel@nongnu.org; Yechuan > <yechuan@huawei.com>; xieyongji@bytedance.com; Gonglei (Arei) > <arei.gonglei@huawei.com>; parav@nvidia.com; Stefano Garzarella > <sgarzare@redhat.com> > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Sat, Dec 11, 2021 at 04:11:04AM +0000, Longpeng (Mike, Cloud Infrastructure > Service Product Dept.) wrote: > > > > > > > -----Original Message----- > > > From: Stefano Garzarella [mailto:sgarzare@redhat.com] > > > Sent: Thursday, December 9, 2021 11:55 PM > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > <longpeng2@huawei.com> > > > Cc: Stefan Hajnoczi <stefanha@redhat.com>; jasowang@redhat.com; > mst@redhat.com; > > > parav@nvidia.com; xieyongji@bytedance.com; Yechuan <yechuan@huawei.com>; > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: > > > >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > >> From: Longpeng <longpeng2@huawei.com> > > > >> > > > >> Hi guys, > > > >> > > > >> This patch introduces vhost-vdpa-net device, which is inspired > > > >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > >> > > > >> I've tested this patch on Huawei's offload card: > > > >> ./x86_64-softmmu/qemu-system-x86_64 \ > > > >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > >> > > > >> For virtio hardware offloading, the most important requirement for us > > > >> is to support live migration between offloading cards from different > > > >> vendors, the combination of netdev and virtio-net seems too heavy, we > > > >> prefer a lightweight way. > > > >> > > > >> Maybe we could support both in the future ? Such as: > > > >> > > > >> * Lightweight > > > >> Net: vhost-vdpa-net > > > >> Storage: vhost-vdpa-blk > > > >> > > > >> * Heavy but more powerful > > > >> Net: netdev + virtio-net + vhost-vdpa > > > >> Storage: bdrv + virtio-blk + vhost-vdpa > > > >> > > > >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > >Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-an > d-so > > > ftware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > >It's closer to today's virtio-net + vhost-net approach than the > > > >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > >an offload feature rather than a completely separate code path that > > > >needs to be maintained and tested. That way QEMU's block layer features > > > >and live migration work with vDPA devices and re-use the virtio-blk > > > >code. The key functionality that has not been implemented yet is a "fast > > > >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > >offloaded to vDPA. > > > > > > > >The unified vdpa-blk architecture should deliver the same performance > > > >as the vhost-vdpa-blk device you mentioned but with more features, so I > > > >wonder what aspects of the vhost-vdpa-blk idea are important to you? 
> > > > > > > >QEMU already has vhost-user-blk, which takes a similar approach as the > > > >vhost-vdpa-blk device you are proposing. I'm not against the > > > >vhost-vdpa-blk approach in priciple, but would like to understand your > > > >requirements and see if there is a way to collaborate on one vdpa-blk > > > >implementation instead of dividing our efforts between two. > > > > > > Waiting for the aspects that Stefan asked, I add some details about the > > > plan for vdpa-blk. > > > > > > Currently I'm working on the in-kernel software device. In the next > > > months I hope to start working on the QEMU part. Anyway that part could > > > go in parallel with the in-kernel device, so if you are interested we > > > can collaborate. > > > > > > > The work on QEMU part means supporting the vdpa in BlockDriver and virtio-blk? > > > > In fact, I wanted to support the vdpa in QEMU block layer before I sent this > > RFC, because the net part uses netdev + virtio-net while the storage part uses > > vhost-vdpa-blk (from Yongji) looks like a strange combination. > > > > But I found enable vdpa in QEMU block layer would take more time and some > > features (e.g. snapshot, IO throttling) from the QEMU block layer are not needed > > in our hardware offloading case, so I turned to develop the "vhost-vdpa-net", > > maybe the combination of vhost-vdpa-net and vhost-vdpa-blk is congruous. > > > > > Having only the unified vdpa-blk architecture would allow us to simplify > > > the management layers and avoid duplicate code, but it takes more time > > > to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is > > > urgent, I could understand the need to add vhost-vdpa-blk now. > > > > > > > I prefer a way that can support vdpa devices (not only net and storage, but > also > > other device types) quickly in hardware offloading case, maybe it would > decreases > > the universalism, but it could be an alternative to some users. > > If QEMU already had --blockdev vpda-blk, would you use that with > --device virtio-blk-pci or still want to implement a separate --device > vhost-vdpa-blk-pci device? > vhost-vdpa-blk/net seems no need now, but a generic vdpa device may be still needed. We are still in the research stage, so I cannot decide to use vdpa-blk or the generic device for the storage devices now. If we need to migrate the legacy non-offloading instances to the offloading instances, then we have no choice but to use vdpa-blk. However, migrating from non-offloading to offloading is a complex project, not only the virtualization layer needs to support but also other layers, so it's hard to say whether this is possible in practical reality. So maybe a good choice for us is : Net: -netdev type=vhost-vdpa Storage: -blockdev vpda-blk Others (e.g. fs, crypto): generic vdpa device > Stefan
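As a rough sketch of the split described above (only the vhost-vdpa netdev backend exists in QEMU at this point; the vdpa-blk blockdev driver and the generic vhost-vdpa-pci device, including their option names, are hypothetical and shown only for illustration):

    Net (exists today):
      -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0
      -device virtio-net-pci,netdev=vdpa0

    Storage (hypothetical vdpa-blk blockdev driver):
      -blockdev driver=vdpa-blk,node-name=disk0,path=/dev/vhost-vdpa-1
      -device virtio-blk-pci,drive=disk0

    Others, e.g. fs or crypto (hypothetical generic device, as sketched earlier in the thread):
      -device vhost-vdpa-pci,vhostpath=/dev/vhost-vdpa-2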
On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > -----Original Message----- > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > <longpeng2@huawei.com> > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > is to support live migration between offloading cards from different > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > prefer a lightweight way. > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > * Lightweight > > > > > > Net: vhost-vdpa-net > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > * Heavy but more powerful > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > an offload feature rather than a completely separate code path that > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > offloaded to vDPA. > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > vhost-vdpa-blk device you are proposing. 
I'm not against the > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > the corresponding device emulation code? > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > OK, so the features you are dropping would be migration between > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > fair enough... What do others think? > > > > I think it should be fine, and it would be even better to make it not > > specific to device type. > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > --device vhost-vdpa-pci, > [vhostfd=FD,| > vhostpath=/dev/vhost-vdpa-N] > > (and for virtio-mmio and virtio-ccw too). > > I don't think this is possible yet because the vhost_vdpa ioctls are > missing some introspection functionality. Here is what I found: > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > - Configuration space size: missing, need ioctl for ops->get_config_size() Any specific reason that we need this considering we've already had VHOST_VDPA_GET_CONFIG and we do the size validation there? > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? I'm not sure whether or not we need this and it seems not necessary since it can be deduced from the config space and features. Thanks > > I think it's worth adding the missing introspection so that VMMs like > QEMU can implement a generic vDPA device. > > Stefan
On Tue, Dec 14, 2021 at 01:44:46AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > -----Original Message----- > > From: Qemu-devel [mailto:qemu-devel-bounces+longpeng2=huawei.com@nongnu.org] > > On Behalf Of Stefan Hajnoczi > > Sent: Monday, December 13, 2021 11:16 PM > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > <longpeng2@huawei.com> > > Cc: mst@redhat.com; jasowang@redhat.com; qemu-devel@nongnu.org; Yechuan > > <yechuan@huawei.com>; xieyongji@bytedance.com; Gonglei (Arei) > > <arei.gonglei@huawei.com>; parav@nvidia.com; Stefano Garzarella > > <sgarzare@redhat.com> > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > On Sat, Dec 11, 2021 at 04:11:04AM +0000, Longpeng (Mike, Cloud Infrastructure > > Service Product Dept.) wrote: > > > > > > > > > > -----Original Message----- > > > > From: Stefano Garzarella [mailto:sgarzare@redhat.com] > > > > Sent: Thursday, December 9, 2021 11:55 PM > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > <longpeng2@huawei.com> > > > > Cc: Stefan Hajnoczi <stefanha@redhat.com>; jasowang@redhat.com; > > mst@redhat.com; > > > > parav@nvidia.com; xieyongji@bytedance.com; Yechuan <yechuan@huawei.com>; > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote: > > > > >On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > >> From: Longpeng <longpeng2@huawei.com> > > > > >> > > > > >> Hi guys, > > > > >> > > > > >> This patch introduces vhost-vdpa-net device, which is inspired > > > > >> by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > >> > > > > >> I've tested this patch on Huawei's offload card: > > > > >> ./x86_64-softmmu/qemu-system-x86_64 \ > > > > >> -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > >> > > > > >> For virtio hardware offloading, the most important requirement for us > > > > >> is to support live migration between offloading cards from different > > > > >> vendors, the combination of netdev and virtio-net seems too heavy, we > > > > >> prefer a lightweight way. > > > > >> > > > > >> Maybe we could support both in the future ? Such as: > > > > >> > > > > >> * Lightweight > > > > >> Net: vhost-vdpa-net > > > > >> Storage: vhost-vdpa-blk > > > > >> > > > > >> * Heavy but more powerful > > > > >> Net: netdev + virtio-net + vhost-vdpa > > > > >> Storage: bdrv + virtio-blk + vhost-vdpa > > > > >> > > > > >> [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > >Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > >https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-an > > d-so > > > > ftware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > >It's closer to today's virtio-net + vhost-net approach than the > > > > >vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > >an offload feature rather than a completely separate code path that > > > > >needs to be maintained and tested. That way QEMU's block layer features > > > > >and live migration work with vDPA devices and re-use the virtio-blk > > > > >code. The key functionality that has not been implemented yet is a "fast > > > > >path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > >offloaded to vDPA. 
> > > > > > > > > >The unified vdpa-blk architecture should deliver the same performance > > > > >as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > >wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > >QEMU already has vhost-user-blk, which takes a similar approach as the > > > > >vhost-vdpa-blk device you are proposing. I'm not against the > > > > >vhost-vdpa-blk approach in priciple, but would like to understand your > > > > >requirements and see if there is a way to collaborate on one vdpa-blk > > > > >implementation instead of dividing our efforts between two. > > > > > > > > Waiting for the aspects that Stefan asked, I add some details about the > > > > plan for vdpa-blk. > > > > > > > > Currently I'm working on the in-kernel software device. In the next > > > > months I hope to start working on the QEMU part. Anyway that part could > > > > go in parallel with the in-kernel device, so if you are interested we > > > > can collaborate. > > > > > > > > > > The work on QEMU part means supporting the vdpa in BlockDriver and virtio-blk? > > > > > > In fact, I wanted to support the vdpa in QEMU block layer before I sent this > > > RFC, because the net part uses netdev + virtio-net while the storage part uses > > > vhost-vdpa-blk (from Yongji) looks like a strange combination. > > > > > > But I found enable vdpa in QEMU block layer would take more time and some > > > features (e.g. snapshot, IO throttling) from the QEMU block layer are not needed > > > in our hardware offloading case, so I turned to develop the "vhost-vdpa-net", > > > maybe the combination of vhost-vdpa-net and vhost-vdpa-blk is congruous. > > > > > > > Having only the unified vdpa-blk architecture would allow us to simplify > > > > the management layers and avoid duplicate code, but it takes more time > > > > to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is > > > > urgent, I could understand the need to add vhost-vdpa-blk now. > > > > > > > > > > I prefer a way that can support vdpa devices (not only net and storage, but > > also > > > other device types) quickly in hardware offloading case, maybe it would > > decreases > > > the universalism, but it could be an alternative to some users. > > > > If QEMU already had --blockdev vpda-blk, would you use that with > > --device virtio-blk-pci or still want to implement a separate --device > > vhost-vdpa-blk-pci device? > > > > vhost-vdpa-blk/net seems no need now, but a generic vdpa device may be still > needed. > > We are still in the research stage, so I cannot decide to use vdpa-blk or the > generic device for the storage devices now. > > If we need to migrate the legacy non-offloading instances to the offloading > instances, then we have no choice but to use vdpa-blk. However, migrating from > non-offloading to offloading is a complex project, not only the virtualization > layer needs to support but also other layers, so it's hard to say whether this > is possible in practical reality. > > So maybe a good choice for us is : > Net: -netdev type=vhost-vdpa > Storage: -blockdev vpda-blk > Others (e.g. fs, crypto): generic vdpa device I see, thanks! Stefan
On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > <longpeng2@huawei.com> > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > is to support live migration between offloading cards from different > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > > > * Lightweight > > > > > > > Net: vhost-vdpa-net > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > an offload feature rather than a completely separate code path that > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > offloaded to vDPA. > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? 
> > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > the corresponding device emulation code? > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > OK, so the features you are dropping would be migration between > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > fair enough... What do others think? > > > > > > I think it should be fine, and it would be even better to make it not > > > specific to device type. > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > --device vhost-vdpa-pci, > > [vhostfd=FD,| > > vhostpath=/dev/vhost-vdpa-N] > > > > (and for virtio-mmio and virtio-ccw too). > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > missing some introspection functionality. Here is what I found: > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > Any specific reason that we need this considering we've already had > VHOST_VDPA_GET_CONFIG and we do the size validation there? QEMU's virtio_init() takes a size_t config_size argument. We need to determine the size of the vhost_vdpa's configuration space in order to create the VirtIODevice in QEMU. Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG return value? It's hacky but I guess it's possible to do a binary search that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if -E2BIG is returned or increases the size otherwise. Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the VirtIODevice to override the size and we pass accesses through to vhost_vdpa. That way it might be possible to avoid fetching the configuration space size at startup, but I'm not sure this will work because QEMU might depend on knowing the exact size (e.g. live migration). > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > I'm not sure whether or not we need this and it seems not necessary > since it can be deduced from the config space and features. It can only be deduced in a device-specific way (net, blk, etc). I can't think of a way to detect the number of virtqueues for an arbitrary VIRTIO device from the features bits and configuration space contents. Stefan
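The "hacky" probe mentioned above can be spelled out as a short sketch: binary-search the largest length that VHOST_VDPA_GET_CONFIG accepts, treating a rejection (e.g. -E2BIG) as "too big". This leans on the kernel's length validation exactly as described, which is why a dedicated get_config_size ioctl would be cleaner; the upper bound and the device path are arbitrary assumptions.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    /* Largest config space we are willing to consider (arbitrary upper bound). */
    #define MAX_CONFIG_LEN 4096

    /* Returns 0 if reading `len` bytes at offset 0 succeeds, nonzero otherwise. */
    static int try_config_len(int fd, uint32_t len)
    {
        struct vhost_vdpa_config *cfg = calloc(1, sizeof(*cfg) + len);
        int ret;

        cfg->off = 0;
        cfg->len = len;
        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, cfg);
        free(cfg);
        return ret;
    }

    int main(void)
    {
        int fd = open("/dev/vhost-vdpa-0", O_RDWR);  /* assumption */
        uint32_t lo = 1, hi = MAX_CONFIG_LEN, size = 0;

        if (fd < 0) {
            perror("open");
            return 1;
        }

        /* Binary search for the largest length the kernel accepts. */
        while (lo <= hi) {
            uint32_t mid = lo + (hi - lo) / 2;

            if (try_config_len(fd, mid) == 0) {
                size = mid;          /* mid bytes fit, try larger */
                lo = mid + 1;
            } else {
                hi = mid - 1;        /* rejected (e.g. E2BIG), try smaller */
            }
        }

        printf("probed config space size: %u bytes\n", size);
        close(fd);
        return 0;
    }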
On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > <longpeng2@huawei.com> > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > Net: vhost-vdpa-net > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > offloaded to vDPA. 
> > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > the corresponding device emulation code? > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > fair enough... What do others think? > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > specific to device type. > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > --device vhost-vdpa-pci, > > > [vhostfd=FD,| > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > missing some introspection functionality. Here is what I found: > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > Any specific reason that we need this considering we've already had > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > QEMU's virtio_init() takes a size_t config_size argument. We need to > determine the size of the vhost_vdpa's configuration space in order to > create the VirtIODevice in QEMU. > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > return value? It's hacky but I guess it's possible to do a binary search > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > -E2BIG is returned or increases the size otherwise. > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > VirtIODevice to override the size and we pass accesses through to > vhost_vdpa. That way it might be possible to avoid fetching the > configuration space size at startup, but I'm not sure this will work > because QEMU might depend on knowing the exact size (e.g. live > migration). 
Good point, so looking at virtio-blk it has:

    virtio_blk_set_config_size(s, s->host_features);
    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size);

I think virtio-blk/net should check the vhost-vdpa features here and
fail if they are not the same? This looks better than overriding the
config_size with what vhost-vdpa provides, since that can override the
features that the CLI tries to enable.

> > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM
> > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE?
> >
> > I'm not sure whether or not we need this and it seems not necessary
> > since it can be deduced from the config space and features.
>
> It can only be deduced in a device-specific way (net, blk, etc). I can't
> think of a way to detect the number of virtqueues for an arbitrary
> VIRTIO device from the features bits and configuration space contents.

Yes, I'm not against this idea, but it looks to me like it works even without this.

Modern PCI has num_queues, but we don't have things like this in MMIO
and legacy PCI.

Thanks

> Stefan
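A minimal sketch of the feature check Jason suggests above, kept outside QEMU's internal vhost API (the helper name and the way the two masks are obtained are assumptions): device realize would fail when the backend does not offer every feature requested on the command line, instead of letting the backend silently change what config_size was computed from.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* VIRTIO feature bit numbers from the spec, used only as an example. */
    #define VIRTIO_BLK_F_DISCARD       13
    #define VIRTIO_BLK_F_WRITE_ZEROES  14

    /*
     * cli_features:     features the user asked the device model to expose
     *                   (what virtio_blk_set_config_size() is sized from)
     * backend_features: device features reported by the vhost-vdpa backend
     */
    static bool vdpa_features_compatible(uint64_t cli_features,
                                         uint64_t backend_features)
    {
        uint64_t missing = cli_features & ~backend_features;

        if (missing) {
            fprintf(stderr,
                    "vhost-vdpa backend lacks requested features: 0x%llx\n",
                    (unsigned long long)missing);
            return false;
        }
        return true;
    }

    int main(void)
    {
        /* Example: CLI asks for DISCARD + WRITE_ZEROES, backend only offers DISCARD. */
        uint64_t cli = (1ULL << VIRTIO_BLK_F_DISCARD) |
                       (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
        uint64_t backend = (1ULL << VIRTIO_BLK_F_DISCARD);

        if (!vdpa_features_compatible(cli, backend)) {
            /* realize would fail here instead of adjusting config_size */
            return 1;
        }
        return 0;
    }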
On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > <longpeng2@huawei.com> > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > code. 
The key functionality that has not been implemented yet is a "fast > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > fair enough... What do others think? > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > specific to device type. > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > --device vhost-vdpa-pci, > > > > [vhostfd=FD,| > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > missing some introspection functionality. Here is what I found: > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > Any specific reason that we need this considering we've already had > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > determine the size of the vhost_vdpa's configuration space in order to > > create the VirtIODevice in QEMU. > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > return value? It's hacky but I guess it's possible to do a binary search > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > -E2BIG is returned or increases the size otherwise. 
> > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > VirtIODevice to override the size and we pass accesses through to > > vhost_vdpa. That way it might be possible to avoid fetching the > > configuration space size at startup, but I'm not sure this will work > > because QEMU might depend on knowing the exact size (e.g. live > > migration). > > Good point, so looking at virtio-blk it has: > > virtio_blk_set_config_size(s, s->host_features); > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > I think here virtio-blk/net should check the vhost-vdpa features here > and fail if they are not the same? The vhost feature bit code in QEMU is complicated and I can't respond without investing too much time studying it :). > This looks better than overriding the config_size with what vhost-vdpa > provides since it can override the features that the cli tries to > enable. I'm thinking about the generic --device vhost-vdpa idea. QEMU should not require knowledge of the device feature bits in that case, so it cannot calculate the configuration space size. > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > since it can be deduced from the config space and features. > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > think of a way to detect the number of virtqueues for an arbitrary > > VIRTIO device from the features bits and configuration space contents. > > Yes, I'm not against this idea but it looks to me it works even without this. > > Modern PCI has num_queues but we don't have things like this in MMIO > and legacy PCI. Even if the VIRTIO hardware interface doesn't expose this information to the guest, QEMU's VirtIODevice API needs it. Device emulation code must call virtio_add_queue() to expose virtqueues to the guest. I suppose --device vhost-vdpa could probe the number of virtqueues using VHOST_GET_VRING_BASE and then call virtio_add_queue(), but it's a little hacky and involves unnecessary ioctl calls. Instead I would add ioctls to fetch the configuration space size and number of virtqueues from the vhost_vdpa device. With these two ioctls added --device vhost-vdpa could create a VirtIODevice that works with any QEMU VIRTIO transport (pci, mmio, ccw). It would work with any vDPA device without device-specific knowledge. Live migration might be possible with some additional work. Stefan
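To make the proposal concrete, here is a sketch of how the two missing ioctls might look from user space; the names and ioctl numbers below are purely illustrative placeholders, not existing kernel uapi at the time of this thread:

    #include <stdint.h>
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    /* Hypothetical additions; numbers chosen only for illustration. */
    #ifndef VHOST_VDPA_GET_CONFIG_SIZE
    #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32)
    #endif
    #ifndef VHOST_VDPA_GET_VQS_COUNT
    #define VHOST_VDPA_GET_VQS_COUNT   _IOR(VHOST_VIRTIO, 0x80, __u32)
    #endif

    int main(void)
    {
        int fd = open("/dev/vhost-vdpa-0", O_RDWR);   /* assumption */
        uint32_t config_size = 0, nvqs = 0;

        if (fd < 0) {
            perror("open");
            return 1;
        }

        /*
         * With these two ioctls a generic --device vhost-vdpa could size the
         * VirtIODevice and call virtio_add_queue() the right number of times
         * without any device-specific knowledge.
         */
        if (ioctl(fd, VHOST_VDPA_GET_CONFIG_SIZE, &config_size) < 0)
            perror("VHOST_VDPA_GET_CONFIG_SIZE (not implemented?)");
        if (ioctl(fd, VHOST_VDPA_GET_VQS_COUNT, &nvqs) < 0)
            perror("VHOST_VDPA_GET_VQS_COUNT (not implemented?)");

        printf("config size: %u bytes, virtqueues: %u\n", config_size, nvqs);
        close(fd);
        return 0;
    }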
On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > needs to be maintained and tested. 
That way QEMU's block layer features > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > > fair enough... What do others think? > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > specific to device type. > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > --device vhost-vdpa-pci, > > > > > [vhostfd=FD,| > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > missing some introspection functionality. Here is what I found: > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > Any specific reason that we need this considering we've already had > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > > determine the size of the vhost_vdpa's configuration space in order to > > > create the VirtIODevice in QEMU. > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > return value? 
It's hacky but I guess it's possible to do a binary search > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > -E2BIG is returned or increases the size otherwise. > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > VirtIODevice to override the size and we pass accesses through to > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > configuration space size at startup, but I'm not sure this will work > > > because QEMU might depend on knowing the exact size (e.g. live > > > migration). > > > > Good point, so looking at virtio-blk it has: > > > > virtio_blk_set_config_size(s, s->host_features); > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > and fail if they are not the same? > > The vhost feature bit code in QEMU is complicated and I can't respond > without investing too much time studying it :). > > > This looks better than overriding the config_size with what vhost-vdpa > > provides since it can override the features that the cli tries to > > enable. > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > require knowledge of the device feature bits in that case, so it cannot > calculate the configuration space size. In this case, it looks to me the config size could be deduced from VHOST_VDPA_GET_FEATURES? > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > since it can be deduced from the config space and features. > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > think of a way to detect the number of virtqueues for an arbitrary > > > VIRTIO device from the features bits and configuration space contents. > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > and legacy PCI. > > Even if the VIRTIO hardware interface doesn't expose this information to > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > call virtio_add_queue() to expose virtqueues to the guest. We don't need this for current multiqueue virtio-net with vhost-vdpa since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during the initialization of vhost-vdpa backend. If we are talking about generic vhost-vdpa-pci, we don't need virtio_add_queue() in this case. Thanks > > I suppose --device vhost-vdpa could probe the number of virtqueues using > VHOST_GET_VRING_BASE and then call virtio_add_queue(), but it's a little > hacky and involves unnecessary ioctl calls. > > Instead I would add ioctls to fetch the configuration space size and > number of virtqueues from the vhost_vdpa device. > > With these two ioctls added --device vhost-vdpa could create a > VirtIODevice that works with any QEMU VIRTIO transport (pci, mmio, ccw). > It would work with any vDPA device without device-specific knowledge. > Live migration might be possible with some additional work. > > Stefan
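(To make the device-specific deduction above concrete, the following is a minimal user-space sketch, not part of the patch: it reads the virtio-net config space through VHOST_VDPA_GET_CONFIG and picks out max_virtqueue_pairs, which is how the queue count can be derived for virtio-net specifically. The /dev/vhost-vdpa-0 path is an assumption and error handling is trimmed.)

/* Minimal sketch: fetch the virtio-net config space from a vhost-vdpa
 * device and read the multiqueue pair count from it. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>

int main(void)
{
    unsigned char buf[sizeof(struct vhost_vdpa_config) +
                      sizeof(struct virtio_net_config)];
    struct vhost_vdpa_config *cfg = (struct vhost_vdpa_config *)buf;
    struct virtio_net_config *net = (struct virtio_net_config *)cfg->buf;
    int fd;

    memset(buf, 0, sizeof(buf));
    cfg->off = 0;
    cfg->len = sizeof(*net);

    fd = open("/dev/vhost-vdpa-0", O_RDWR);     /* assumed device node */
    if (fd < 0 || ioctl(fd, VHOST_VDPA_GET_CONFIG, cfg) < 0) {
        perror("vhost-vdpa");
        return 1;
    }

    /* Device-specific knowledge: only virtio-net has this field, and it
     * is only meaningful when VIRTIO_NET_F_MQ has been negotiated. */
    printf("max_virtqueue_pairs: %u\n", net->max_virtqueue_pairs);

    close(fd);
    return 0;
}

The same trick does not generalize: other device types keep their queue counts in different config fields, or nowhere at all, which is exactly why the generic device discussed below needs another source for this information.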
On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? Such as: > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > > vhost-vdpa-blk device you have mentioned. 
The idea is to treat vDPA as > > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > > > fair enough... What do others think? > > > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > > specific to device type. > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > [vhostfd=FD,| > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > > missing some introspection functionality. Here is what I found: > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > > > Any specific reason that we need this considering we've already had > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. 
We need to > > > > determine the size of the vhost_vdpa's configuration space in order to > > > > create the VirtIODevice in QEMU. > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > > return value? It's hacky but I guess it's possible to do a binary search > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > > VirtIODevice to override the size and we pass accesses through to > > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > > configuration space size at startup, but I'm not sure this will work > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > migration). > > > > > > Good point, so looking at virtio-blk it has: > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > > and fail if they are not the same? > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > without investing too much time studying it :). > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > provides since it can override the features that the cli tries to > > > enable. > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > > require knowledge of the device feature bits in that case, so it cannot > > calculate the configuration space size. > > In this case, it looks to me the config size could be deduced from > VHOST_VDPA_GET_FEATURES? I think we're talking about different things, see below... > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > since it can be deduced from the config space and features. > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > VIRTIO device from the features bits and configuration space contents. > > > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > > and legacy PCI. > > > > Even if the VIRTIO hardware interface doesn't expose this information to > > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > > call virtio_add_queue() to expose virtqueues to the guest. > > We don't need this for current multiqueue virtio-net with vhost-vdpa > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > the initialization of vhost-vdpa backend. > > If we are talking about generic vhost-vdpa-pci, we don't need > virtio_add_queue() in this case. When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes any /dev/vhost-vdpa-N and exposes the device to the guest (over virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has no knowledge of specific device types. This means new device types can be added without modifying QEMU. I think the model you are describing is not generic because it relies on knowledge of specific device types (net, blk, scsi, etc) so it can interpret feature bits and configuration space fields. 
When you originally said "it would be even better to make it not specific to device type" I thought you meant a generic --device vhost-vdpa and that's what I've been describing, but in your recent replies I guess you have a different model in mind. Are there reasons why the generic model won't work? Stefan
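(For reference, here is a rough sketch of the "hacky" probing approach mentioned above for discovering the config space size without a dedicated ioctl. It assumes, as the thread does, that an over-long VHOST_VDPA_GET_CONFIG read fails, for example with -E2BIG, while an in-range read succeeds; the helper name and the exact errno handling are illustrative only.)

/* Rough sketch: binary-search the config space size by issuing
 * VHOST_VDPA_GET_CONFIG reads of varying length and treating a failed
 * read as "requested length too big". */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static size_t probe_config_size(int vdpa_fd, size_t max_size)
{
    size_t lo = 1, hi = max_size, size = 0;

    while (lo <= hi) {
        size_t mid = lo + (hi - lo) / 2;
        struct vhost_vdpa_config *cfg = calloc(1, sizeof(*cfg) + mid);

        if (!cfg) {
            break;
        }
        cfg->off = 0;
        cfg->len = mid;
        if (ioctl(vdpa_fd, VHOST_VDPA_GET_CONFIG, cfg) == 0) {
            size = mid;        /* mid bytes were readable, try larger */
            lo = mid + 1;
        } else {
            hi = mid - 1;      /* assume the requested length was too big */
        }
        free(cfg);
    }
    return size;               /* 0 if nothing could be read at all */
}

A dedicated ioctl exposing the backend's get_config_size(), as proposed above, would replace all of this with a single call.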
On Thu, Dec 16, 2021 at 5:10 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? 
Such as: > > > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > > > > fair enough... What do others think? 
> > > > > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > > > specific to device type. > > > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > > [vhostfd=FD,| > > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > > > missing some introspection functionality. Here is what I found: > > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > > > > > Any specific reason that we need this considering we've already had > > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > > > > determine the size of the vhost_vdpa's configuration space in order to > > > > > create the VirtIODevice in QEMU. > > > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > > > return value? It's hacky but I guess it's possible to do a binary search > > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > > > VirtIODevice to override the size and we pass accesses through to > > > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > > > configuration space size at startup, but I'm not sure this will work > > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > > migration). > > > > > > > > Good point, so looking at virtio-blk it has: > > > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > > > and fail if they are not the same? > > > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > > without investing too much time studying it :). > > > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > > provides since it can override the features that the cli tries to > > > > enable. > > > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > > > require knowledge of the device feature bits in that case, so it cannot > > > calculate the configuration space size. > > > > In this case, it looks to me the config size could be deduced from > > VHOST_VDPA_GET_FEATURES? > > I think we're talking about different things, see below... > > > > > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > > since it can be deduced from the config space and features. > > > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > > VIRTIO device from the features bits and configuration space contents. 
> > > > > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > > > and legacy PCI. > > > > > > Even if the VIRTIO hardware interface doesn't expose this information to > > > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > > > call virtio_add_queue() to expose virtqueues to the guest. > > > > We don't need this for current multiqueue virtio-net with vhost-vdpa > > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > > the initialization of vhost-vdpa backend. > > > > If we are talking about generic vhost-vdpa-pci, we don't need > > virtio_add_queue() in this case. > > When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes > any /dev/vhost-vdpa-N and exposes the device to the guest (over > virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has no > knowledge of specific device types. This means new device types can be > added without modifying QEMU. > > I think the model you are describing is not generic because it relies on > knowledge of specific device types (net, blk, scsi, etc) so it can > interpret feature bits and configuration space fields. Yes, but what I meant is that in this case qemu can simply relay the set/get config to vhost-vdpa. And the guest driver can enumerate the number of queues correctly depending on his own knowledge. > > When you originally said "it would be even better to make it not > specific to device type" I thought you meant a generic --device > vhost-vdpa and that's what I've been describing, but in your recent > replies I guess you have a different model in mind. > > Are there reasons why the generic model won't work? I think not. One thing comes to my mind is that since we provide num_queues via modern virtio-pci, this is probably another call for having the API you described. For the general vhost-vdpa backend, the only thing that may block us is the migration. If we want to make vhost-vdpa type independent, we need first investigate the independent migration facility in virtio spec which is still suspicious. Thanks > > Stefan
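(As background for the num_queues remark above, the modern virtio-pci common configuration starts roughly as shown below; this is abridged from the layout declared in linux/virtio_pci.h, with the per-virtqueue fields omitted. Legacy PCI and MMIO transports have no equivalent field, which is why a transport-independent frontend cannot count on it.)

/* Abridged sketch of struct virtio_pci_common_cfg (see linux/virtio_pci.h
 * and the modern PCI transport section of the VIRTIO spec). */
#include <linux/types.h>

struct virtio_pci_common_cfg_head {
    __le32 device_feature_select;   /* read-write */
    __le32 device_feature;          /* read-only */
    __le32 guest_feature_select;    /* read-write */
    __le32 guest_feature;           /* read-write */
    __le16 msix_config;             /* read-write */
    __le16 num_queues;              /* read-only: virtqueue count */
    __u8   device_status;           /* read-write */
    __u8   config_generation;       /* read-only */
    /* ... per-virtqueue fields (queue_select, queue_size, ...) follow ... */
};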
On Fri, Dec 17, 2021 at 12:26:53PM +0800, Jason Wang wrote: Dave: You created the VIRTIO vmstate infrastructure in QEMU. Please see the bottom of this email about moving to a standard VIRTIO device save/load format defined by the VIRTIO spec in the future. > On Thu, Dec 16, 2021 at 5:10 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > > > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? 
Such as: > > > > > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. > > > > > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > > > > a vdpa, vhost and virtio backends. 
I think given vhost-vdpa-blk is seems > > > > > > > > > > fair enough... What do others think? > > > > > > > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > > > > specific to device type. > > > > > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > > > [vhostfd=FD,| > > > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > > > > missing some introspection functionality. Here is what I found: > > > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > > > > > > > Any specific reason that we need this considering we've already had > > > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > > > > > determine the size of the vhost_vdpa's configuration space in order to > > > > > > create the VirtIODevice in QEMU. > > > > > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > > > > return value? It's hacky but I guess it's possible to do a binary search > > > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > > > > VirtIODevice to override the size and we pass accesses through to > > > > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > > > > configuration space size at startup, but I'm not sure this will work > > > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > > > migration). > > > > > > > > > > Good point, so looking at virtio-blk it has: > > > > > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > > > > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > > > > and fail if they are not the same? > > > > > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > > > without investing too much time studying it :). > > > > > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > > > provides since it can override the features that the cli tries to > > > > > enable. > > > > > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > > > > require knowledge of the device feature bits in that case, so it cannot > > > > calculate the configuration space size. > > > > > > In this case, it looks to me the config size could be deduced from > > > VHOST_VDPA_GET_FEATURES? > > > > I think we're talking about different things, see below... > > > > > > > > > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > > > since it can be deduced from the config space and features. 
> > > > > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > > > VIRTIO device from the features bits and configuration space contents. > > > > > > > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > > > > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > > > > and legacy PCI. > > > > > > > > Even if the VIRTIO hardware interface doesn't expose this information to > > > > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > > > > call virtio_add_queue() to expose virtqueues to the guest. > > > > > > We don't need this for current multiqueue virtio-net with vhost-vdpa > > > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > > > the initialization of vhost-vdpa backend. > > > > > > If we are talking about generic vhost-vdpa-pci, we don't need > > > virtio_add_queue() in this case. > > > > When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes > > any /dev/vhost-vdpa-N and exposes the device to the guest (over > > virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has no > > knowledge of specific device types. This means new device types can be > > added without modifying QEMU. > > > > I think the model you are describing is not generic because it relies on > > knowledge of specific device types (net, blk, scsi, etc) so it can > > interpret feature bits and configuration space fields. > > Yes, but what I meant is that in this case qemu can simply relay the > set/get config to vhost-vdpa. And the guest driver can enumerate the > number of queues correctly depending on his own knowledge. That requires changes to how virtqueues are managed by hw/virtio/virtio.c because today the code assumes QEMU knows the number of virtqueues. virtio_add_queue() must be called by device emulation before the guest driver can configure a virtqueue. > > > > When you originally said "it would be even better to make it not > > specific to device type" I thought you meant a generic --device > > vhost-vdpa and that's what I've been describing, but in your recent > > replies I guess you have a different model in mind. > > > > Are there reasons why the generic model won't work? > > I think not. > > One thing comes to my mind is that since we provide num_queues via > modern virtio-pci, this is probably another call for having the API > you described. > > For the general vhost-vdpa backend, the only thing that may block us > is the migration. If we want to make vhost-vdpa type independent, we > need first investigate the independent migration facility in virtio > spec which is still suspicious. Yes, definitely. Another challenge with migration is that the generic vhost-vdpa vmstate probably won't be compatible with QEMU's virtio-net/blk/scsi/etc vmstates. It would be nice if it was possible to migrate between QEMU and vDPA device models since they both implement the same device types. Maybe the solution is for QEMU's virtio device models to switch to the new VIRTIO save/load data format once that has been defined in the spec. Then the QEMU VirtIODevice vmstate would be: 1. QEMU-specific VirtIODevice state (virtqueue state, etc) 2. VIRTIO standard device save/load data (virtio-net mac table, etc) It's still not clear to me how much of the VIRTIO device save/load data is implementation-specific. 
I think the next step forward is to review the QEMU vmstates for virtio-net, virtio-gpu, etc. to figure out whether we can really standardize the save/load data. Stefan
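(Purely as a hypothetical sketch of the two-part vmstate split proposed above: the QEMU-specific VirtIODevice state would stay as it is today, and the spec-defined portion could initially be carried as an opaque, length-prefixed blob in a vmstate subsection. The type name, the field names and the section name below are invented for illustration; nothing like this exists in QEMU at the time of this thread.)

#include "qemu/osdep.h"
#include "migration/vmstate.h"

/* Hypothetical container for the spec-defined save/load data. */
typedef struct VirtIODeviceStdState {
    uint32_t std_state_len;   /* length of the spec-defined blob */
    uint8_t *std_state;       /* opaque, spec-defined device state */
} VirtIODeviceStdState;

/* Hypothetical subsection carrying the standard VIRTIO save/load data. */
static const VMStateDescription vmstate_virtio_standard_blob = {
    .name = "virtio/standard-device-state",   /* invented name */
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(std_state_len, VirtIODeviceStdState),
        VMSTATE_VBUFFER_ALLOC_UINT32(std_state, VirtIODeviceStdState, 1,
                                     NULL, std_state_len),
        VMSTATE_END_OF_LIST()
    }
};

Such a subsection could then be attached to the existing VirtIODevice vmstate once the spec defines what goes into the blob.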
On Fri, Dec 17, 2021 at 4:35 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Fri, Dec 17, 2021 at 12:26:53PM +0800, Jason Wang wrote: > > Dave: You created the VIRTIO vmstate infrastructure in QEMU. Please see > the bottom of this email about moving to a standard VIRTIO device > save/load format defined by the VIRTIO spec in the future. > > > On Thu, Dec 16, 2021 at 5:10 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > > > > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? 
Such as: > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. 
> > > > > > > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > > > > > > fair enough... What do others think? > > > > > > > > > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > > > > > specific to device type. > > > > > > > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > > > > [vhostfd=FD,| > > > > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > > > > > missing some introspection functionality. Here is what I found: > > > > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > > > > > > > > > Any specific reason that we need this considering we've already had > > > > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > > > > > > determine the size of the vhost_vdpa's configuration space in order to > > > > > > > create the VirtIODevice in QEMU. > > > > > > > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > > > > > return value? It's hacky but I guess it's possible to do a binary search > > > > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > > > > > VirtIODevice to override the size and we pass accesses through to > > > > > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > > > > > configuration space size at startup, but I'm not sure this will work > > > > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > > > > migration). > > > > > > > > > > > > Good point, so looking at virtio-blk it has: > > > > > > > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > > > > > > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > > > > > and fail if they are not the same? > > > > > > > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > > > > without investing too much time studying it :). > > > > > > > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > > > > provides since it can override the features that the cli tries to > > > > > > enable. > > > > > > > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > > > > > require knowledge of the device feature bits in that case, so it cannot > > > > > calculate the configuration space size. > > > > > > > > In this case, it looks to me the config size could be deduced from > > > > VHOST_VDPA_GET_FEATURES? > > > > > > I think we're talking about different things, see below... 
> > > > > > > > > > > > > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > > > > since it can be deduced from the config space and features. > > > > > > > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > > > > VIRTIO device from the features bits and configuration space contents. > > > > > > > > > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > > > > > > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > > > > > and legacy PCI. > > > > > > > > > > Even if the VIRTIO hardware interface doesn't expose this information to > > > > > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > > > > > call virtio_add_queue() to expose virtqueues to the guest. > > > > > > > > We don't need this for current multiqueue virtio-net with vhost-vdpa > > > > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > > > > the initialization of vhost-vdpa backend. > > > > > > > > If we are talking about generic vhost-vdpa-pci, we don't need > > > > virtio_add_queue() in this case. > > > > > > When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes > > > any /dev/vhost-vdpa-N and exposes the device to the guest (over > > > virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has no > > > knowledge of specific device types. This means new device types can be > > > added without modifying QEMU. > > > > > > I think the model you are describing is not generic because it relies on > > > knowledge of specific device types (net, blk, scsi, etc) so it can > > > interpret feature bits and configuration space fields. > > > > Yes, but what I meant is that in this case qemu can simply relay the > > set/get config to vhost-vdpa. And the guest driver can enumerate the > > number of queues correctly depending on his own knowledge. > > That requires changes to how virtqueues are managed by > hw/virtio/virtio.c because today the code assumes QEMU knows the number > of virtqueues. virtio_add_queue() must be called by device emulation > before the guest driver can configure a virtqueue. Right. > > > > > > > When you originally said "it would be even better to make it not > > > specific to device type" I thought you meant a generic --device > > > vhost-vdpa and that's what I've been describing, but in your recent > > > replies I guess you have a different model in mind. > > > > > > Are there reasons why the generic model won't work? > > > > I think not. > > > > One thing comes to my mind is that since we provide num_queues via > > modern virtio-pci, this is probably another call for having the API > > you described. > > > > For the general vhost-vdpa backend, the only thing that may block us > > is the migration. If we want to make vhost-vdpa type independent, we > > need first investigate the independent migration facility in virtio > > spec which is still suspicious. > > Yes, definitely. > > Another challenge with migration is that the generic vhost-vdpa vmstate > probably won't be compatible with QEMU's virtio-net/blk/scsi/etc > vmstates. 
It would be nice if it was possible to migrate between QEMU > and vDPA device models since they both implement the same device types. > > Maybe the solution is for QEMU's virtio device models to switch to the > new VIRTIO save/load data format once that has been defined in the spec. > Then the QEMU VirtIODevice vmstate would be: > 1. QEMU-specific VirtIODevice state (virtqueue state, etc) > 2. VIRTIO standard device save/load data (virtio-net mac table, etc) Right. The question is: do we expect the exact byte stream format to be defined in the spec? It looks to me it's sufficient to define each piece of state that is required for live migration and leave the byte stream format implementation-specific. If we manage to do this, there's still a chance that we can live migrate between the two. > > It's still not clear to me how much of the VIRTIO device save/load data > is implementation-specific. I think the next step forward is to review > the QEMU vmstates for virtio-net, virtio-gpu, etc to figure out whether > we can really standardize the save/load data. Yes, and it should not be hard to have a general load and save based on key/value pairs which could be defined in the spec. Ideally, it should be more than enough to enumerate the keys based on the negotiated features. (But as discussed, virtio-fs and other stateful devices seem more complicated, and a lot of spec work seems to be required before supporting them.) Thanks > > Stefan
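(The key/value idea above could look roughly like the following sketch, which is purely illustrative; none of these keys, flags or structures are defined in the spec or in QEMU, and the record layout is an assumption. Each state element is saved as a (key, length, value) record, and the set of keys a device must emit would be derived from the negotiated feature bits.)

#include <stdint.h>
#include <stddef.h>
#include <string.h>

enum {
    /* Hypothetical key IDs; the spec would have to assign these. */
    VIRTIO_STATE_KEY_NET_MAC_TABLE   = 0x0001,
    VIRTIO_STATE_KEY_NET_VLAN_FILTER = 0x0002,
};

struct virtio_state_record {
    uint16_t key;     /* which piece of device state */
    uint16_t flags;
    uint32_t len;     /* length of value[] in bytes */
    uint8_t  value[];
};

/* Append one record to a save buffer; returns bytes written, or 0 if
 * there is not enough room. */
static size_t virtio_state_put(uint8_t *buf, size_t avail,
                               uint16_t key, const void *val, uint32_t len)
{
    struct virtio_state_record rec = { .key = key, .len = len };
    size_t need = sizeof(rec) + len;

    if (need > avail) {
        return 0;
    }
    memcpy(buf, &rec, sizeof(rec));
    memcpy(buf + sizeof(rec), val, len);
    return need;
}

Whether the spec would fix the byte stream itself or only enumerate the keys and their semantics is exactly the open question in the exchange above.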
On Mon, Dec 20, 2021 at 10:48:09AM +0800, Jason Wang wrote: > On Fri, Dec 17, 2021 at 4:35 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Fri, Dec 17, 2021 at 12:26:53PM +0800, Jason Wang wrote: > > > > Dave: You created the VIRTIO vmstate infrastructure in QEMU. Please see > > the bottom of this email about moving to a standard VIRTIO device > > save/load format defined by the VIRTIO spec in the future. > > > > > On Thu, Dec 16, 2021 at 5:10 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > > > > > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > > > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) > > > > > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; parav@nvidia.com; > > > > > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > > > > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org > > > > > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote: > > > > > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which is inspired > > > > > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk device [1]. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > > > > > -device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important requirement for us > > > > > > > > > > > > > > > is to support live migration between offloading cards from different > > > > > > > > > > > > > > > vendors, the combination of netdev and virtio-net seems too heavy, we > > > > > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? 
Such as: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > [1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum 2021: > > > > > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach than the > > > > > > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as > > > > > > > > > > > > > > an offload feature rather than a completely separate code path that > > > > > > > > > > > > > > needs to be maintained and tested. That way QEMU's block layer features > > > > > > > > > > > > > > and live migration work with vDPA devices and re-use the virtio-blk > > > > > > > > > > > > > > code. The key functionality that has not been implemented yet is a "fast > > > > > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's virtqueue to be > > > > > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver the same performance > > > > > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with more features, so I > > > > > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are important to you? > > > > > > > > > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar approach as the > > > > > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not against the > > > > > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like to understand your > > > > > > > > > > > > > > requirements and see if there is a way to collaborate on one vdpa-blk > > > > > > > > > > > > > > implementation instead of dividing our efforts between two. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading case, it could reduce > > > > > > > > > > > > > our maintenance workload, we no need to maintain the virtio-net, netdev, > > > > > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support other vdpa devices > > > > > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then we also need to maintain > > > > > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually use the vfio-pci framework, > > > > > > > > > > > > > it saves a lot of our maintenance work in QEMU, we don't need to touch the device > > > > > > > > > > > > > types. Inspired by Jason, what we really prefer is "vhost-vdpa-pci/mmio", use it to > > > > > > > > > > > > > instead of the vfio-pci, it could provide the same performance as vfio-pci, but it's > > > > > > > > > > > > > *possible* to support live migrate between offloading cards from different vendors. 
> > > > > > > > > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration between > > > > > > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk is seems > > > > > > > > > > > > fair enough... What do others think? > > > > > > > > > > > > > > > > > > > > > > I think it should be fine, and it would be even better to make it not > > > > > > > > > > > specific to device type. > > > > > > > > > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could exposed as > > > > > > > > > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > > > > > [vhostfd=FD,| > > > > > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa ioctls are > > > > > > > > > > missing some introspection functionality. Here is what I found: > > > > > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > > > > > - Configuration space size: missing, need ioctl for ops->get_config_size() > > > > > > > > > > > > > > > > > > Any specific reason that we need this considering we've already had > > > > > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We need to > > > > > > > > determine the size of the vhost_vdpa's configuration space in order to > > > > > > > > create the VirtIODevice in QEMU. > > > > > > > > > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG -E2BIG > > > > > > > > return value? It's hacky but I guess it's possible to do a binary search > > > > > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the size if > > > > > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow the > > > > > > > > VirtIODevice to override the size and we pass accesses through to > > > > > > > > vhost_vdpa. That way it might be possible to avoid fetching the > > > > > > > > configuration space size at startup, but I'm not sure this will work > > > > > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > > > > > migration). > > > > > > > > > > > > > > Good point, so looking at virtio-blk it has: > > > > > > > > > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > > > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); > > > > > > > > > > > > > > I think here virtio-blk/net should check the vhost-vdpa features here > > > > > > > and fail if they are not the same? > > > > > > > > > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > > > > > without investing too much time studying it :). > > > > > > > > > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > > > > > provides since it can override the features that the cli tries to > > > > > > > enable. > > > > > > > > > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should not > > > > > > require knowledge of the device feature bits in that case, so it cannot > > > > > > calculate the configuration space size. > > > > > > > > > > In this case, it looks to me the config size could be deduced from > > > > > VHOST_VDPA_GET_FEATURES? 
> > > > > > > > I think we're talking about different things, see below... > > > > > > > > > > > > > > > > > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > > > > > since it can be deduced from the config space and features. > > > > > > > > > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). I can't > > > > > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > > > > > VIRTIO device from the features bits and configuration space contents. > > > > > > > > > > > > > > Yes, I'm not against this idea but it looks to me it works even without this. > > > > > > > > > > > > > > Modern PCI has num_queues but we don't have things like this in MMIO > > > > > > > and legacy PCI. > > > > > > > > > > > > Even if the VIRTIO hardware interface doesn't expose this information to > > > > > > the guest, QEMU's VirtIODevice API needs it. Device emulation code must > > > > > > call virtio_add_queue() to expose virtqueues to the guest. > > > > > > > > > > We don't need this for current multiqueue virtio-net with vhost-vdpa > > > > > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > > > > > the initialization of vhost-vdpa backend. > > > > > > > > > > If we are talking about generic vhost-vdpa-pci, we don't need > > > > > virtio_add_queue() in this case. > > > > > > > > When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes > > > > any /dev/vhost-vdpa-N and exposes the device to the guest (over > > > > virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has no > > > > knowledge of specific device types. This means new device types can be > > > > added without modifying QEMU. > > > > > > > > I think the model you are describing is not generic because it relies on > > > > knowledge of specific device types (net, blk, scsi, etc) so it can > > > > interpret feature bits and configuration space fields. > > > > > > Yes, but what I meant is that in this case qemu can simply relay the > > > set/get config to vhost-vdpa. And the guest driver can enumerate the > > > number of queues correctly depending on his own knowledge. > > > > That requires changes to how virtqueues are managed by > > hw/virtio/virtio.c because today the code assumes QEMU knows the number > > of virtqueues. virtio_add_queue() must be called by device emulation > > before the guest driver can configure a virtqueue. > > Right. > > > > > > > > > > > When you originally said "it would be even better to make it not > > > > specific to device type" I thought you meant a generic --device > > > > vhost-vdpa and that's what I've been describing, but in your recent > > > > replies I guess you have a different model in mind. > > > > > > > > Are there reasons why the generic model won't work? > > > > > > I think not. > > > > > > One thing comes to my mind is that since we provide num_queues via > > > modern virtio-pci, this is probably another call for having the API > > > you described. > > > > > > For the general vhost-vdpa backend, the only thing that may block us > > > is the migration. If we want to make vhost-vdpa type independent, we > > > need first investigate the independent migration facility in virtio > > > spec which is still suspicious. > > > > Yes, definitely. 
> > > > Another challenge with migration is that the generic vhost-vdpa vmstate > > probably won't be compatible with QEMU's virtio-net/blk/scsi/etc > > vmstates. It would be nice if it was possible to migrate between QEMU > > and vDPA device models since they both implement the same device types. > > > > Maybe the solution is for QEMU's virtio device models to switch to the > > new VIRTIO save/load data format once that has been defined in the spec. > > Then the QEMU VirtIODevice vmstate would be: > > 1. QEMU-specific VirtIODevice state (virtqueue state, etc) > > 2. VIRTIO standard device save/load data (virtio-net mac table, etc) > > Right. The question is that do we expect the exact byte stream format > defined in the spec? It looks to me it's sufficient to define each > state that is required for the live migration and leave the byte > stream format to be implementation specific. If we manage to do this, > there's still a chance that we can live migration between those two. Yes. I think the pros/cons of translation are better compatibility but more complex code. Not sure if maintaining a QEMU-specific save/load format in addition to the standard VIRTIO format is desirable in the long term. > > > > It's still not clear to me how much of the VIRTIO device save/load data > > is implementation-specific. I think the next step forward is to review > > the QEMU vmstates for virtio-net, virtio-gpu, etc to figure out whether > > we can really standardize the save/load data. > > Yes, and it should not be hard to have a general load and save based > on key/value pairs which could be defined in the spec. Ideally, it > should be more than enough to enumerate the keys based on the > negotiated features. (But as discussed, virtio-fs and other stateful > devices seem more complicated and a lot of spec work seems like a > requirement before support this). Great, I'm glad we had a chance to discuss this. It has helped me understand the direction things are heading in. Migration isn't a dependency for what Longpeng is doing in this patch series. The generic --device vdpa-vhost can already be implemented today without live migration support. Adding the vhost_vdpa ioctls we discussed would be nice although it seems possible (but hacky) for QEMU to probe using existing ioctls too. Longpeng: Do you want to generalize this patch into a --device vdpa-host that supports all device types? Stefan
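The introspection gap and the "hacky but possible" probing described above can be illustrated with a small standalone program. It only uses vhost UAPI ioctls that already exist (VHOST_VDPA_GET_DEVICE_ID, VHOST_GET_FEATURES, VHOST_VDPA_GET_VRING_NUM, VHOST_VDPA_GET_CONFIG). The binary search assumes the kernel keeps rejecting over-sized config reads (with -E2BIG, as mentioned in the thread), and the 4096-byte upper bound and helper names are arbitrary choices for the demo, so treat this as a sketch rather than a reference implementation.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Return 1 if a config read of 'len' bytes at offset 0 succeeds. */
static int config_read_ok(int fd, uint32_t len)
{
    struct vhost_vdpa_config *cfg = calloc(1, sizeof(*cfg) + len);
    int ret;

    if (!cfg) {
        return 0;
    }
    cfg->off = 0;
    cfg->len = len;
    ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, cfg);
    free(cfg);
    return ret == 0;
}

/* Binary-search the config space size, relying on oversized reads failing. */
static uint32_t probe_config_size(int fd)
{
    uint32_t lo = 0, hi = 4096;      /* arbitrary upper bound for the demo */

    while (lo < hi) {
        uint32_t mid = (lo + hi + 1) / 2;
        if (config_read_ok(fd, mid)) {
            lo = mid;                /* mid bytes are readable */
        } else {
            hi = mid - 1;            /* too big, shrink the range */
        }
    }
    return lo;
}

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/dev/vhost-vdpa-0";
    uint32_t dev_id = 0;
    uint16_t vring_num = 0;
    uint64_t features = 0;
    int fd = open(path, O_RDWR);

    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (ioctl(fd, VHOST_VDPA_GET_DEVICE_ID, &dev_id) ||
        ioctl(fd, VHOST_GET_FEATURES, &features) ||
        ioctl(fd, VHOST_VDPA_GET_VRING_NUM, &vring_num)) {
        perror("ioctl");
        close(fd);
        return 1;
    }
    printf("device id:   %u\n", dev_id);
    printf("features:    0x%" PRIx64 "\n", features);
    printf("max vq size: %u\n", vring_num);
    printf("config size: %u (probed)\n", probe_config_size(fd));

    close(fd);
    return 0;
}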
> -----Original Message----- > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > Sent: Monday, December 20, 2021 4:11 PM > To: Jason Wang <jasowang@redhat.com> > Cc: Michael S. Tsirkin <mst@redhat.com>; Longpeng (Mike, Cloud Infrastructure > Service Product Dept.) <longpeng2@huawei.com>; parav@nvidia.com; > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org; Dr. David > Alan Gilbert <dgilbert@redhat.com> > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > On Mon, Dec 20, 2021 at 10:48:09AM +0800, Jason Wang wrote: > > On Fri, Dec 17, 2021 at 4:35 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > On Fri, Dec 17, 2021 at 12:26:53PM +0800, Jason Wang wrote: > > > > > > Dave: You created the VIRTIO vmstate infrastructure in QEMU. Please see > > > the bottom of this email about moving to a standard VIRTIO device > > > save/load format defined by the VIRTIO spec in the future. > > > > > > > On Thu, Dec 16, 2021 at 5:10 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > > > > > On Thu, Dec 16, 2021 at 11:01:40AM +0800, Jason Wang wrote: > > > > > > On Wed, Dec 15, 2021 at 6:07 PM Stefan Hajnoczi <stefanha@redhat.com> > wrote: > > > > > > > > > > > > > > On Wed, Dec 15, 2021 at 11:18:05AM +0800, Jason Wang wrote: > > > > > > > > On Tue, Dec 14, 2021 at 9:11 PM Stefan Hajnoczi <stefanha@redhat.com> > wrote: > > > > > > > > > > > > > > > > > > On Tue, Dec 14, 2021 at 10:22:53AM +0800, Jason Wang wrote: > > > > > > > > > > On Mon, Dec 13, 2021 at 11:14 PM Stefan Hajnoczi > <stefanha@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > On Mon, Dec 13, 2021 at 10:47:00AM +0800, Jason Wang wrote: > > > > > > > > > > > > On Sun, Dec 12, 2021 at 5:30 PM Michael S. Tsirkin <mst@redhat.com> > wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > On Sat, Dec 11, 2021 at 03:00:27AM +0000, Longpeng (Mike, > Cloud Infrastructure Service Product Dept.) wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -----Original Message----- > > > > > > > > > > > > > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > > > > > > > > > > > > > > Sent: Thursday, December 9, 2021 5:17 PM > > > > > > > > > > > > > > > To: Longpeng (Mike, Cloud Infrastructure Service > Product Dept.) > > > > > > > > > > > > > > > <longpeng2@huawei.com> > > > > > > > > > > > > > > > Cc: jasowang@redhat.com; mst@redhat.com; > parav@nvidia.com; > > > > > > > > > > > > > > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan > <yechuan@huawei.com>; > > > > > > > > > > > > > > > Gonglei (Arei) <arei.gonglei@huawei.com>; > qemu-devel@nongnu.org > > > > > > > > > > > > > > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net > host device support > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) > wrote: > > > > > > > > > > > > > > > > From: Longpeng <longpeng2@huawei.com> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Hi guys, > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > This patch introduces vhost-vdpa-net device, which > is inspired > > > > > > > > > > > > > > > > by vhost-user-blk and the proposal of vhost-vdpa-blk > device [1]. 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I've tested this patch on Huawei's offload card: > > > > > > > > > > > > > > > > ./x86_64-softmmu/qemu-system-x86_64 \ > > > > > > > > > > > > > > > > -device > vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0 > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > For virtio hardware offloading, the most important > requirement for us > > > > > > > > > > > > > > > > is to support live migration between offloading > cards from different > > > > > > > > > > > > > > > > vendors, the combination of netdev and virtio-net > seems too heavy, we > > > > > > > > > > > > > > > > prefer a lightweight way. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Maybe we could support both in the future ? Such > as: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Lightweight > > > > > > > > > > > > > > > > Net: vhost-vdpa-net > > > > > > > > > > > > > > > > Storage: vhost-vdpa-blk > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > * Heavy but more powerful > > > > > > > > > > > > > > > > Net: netdev + virtio-net + vhost-vdpa > > > > > > > > > > > > > > > > Storage: bdrv + virtio-blk + vhost-vdpa > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > [1] > https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Stefano presented a plan for vdpa-blk at KVM Forum > 2021: > > > > > > > > > > > > > > > > https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-sof > > > > > > > > > > > > > > > tware-offload-for-virtio-blk-stefano-garzarella-red-hat > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > It's closer to today's virtio-net + vhost-net approach > than the > > > > > > > > > > > > > > > vhost-vdpa-blk device you have mentioned. The idea > is to treat vDPA as > > > > > > > > > > > > > > > an offload feature rather than a completely separate > code path that > > > > > > > > > > > > > > > needs to be maintained and tested. That way QEMU's > block layer features > > > > > > > > > > > > > > > and live migration work with vDPA devices and re-use > the virtio-blk > > > > > > > > > > > > > > > code. The key functionality that has not been implemented > yet is a "fast > > > > > > > > > > > > > > > path" mechanism that allows the QEMU virtio-blk device's > virtqueue to be > > > > > > > > > > > > > > > offloaded to vDPA. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > The unified vdpa-blk architecture should deliver > the same performance > > > > > > > > > > > > > > > as the vhost-vdpa-blk device you mentioned but with > more features, so I > > > > > > > > > > > > > > > wonder what aspects of the vhost-vdpa-blk idea are > important to you? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > QEMU already has vhost-user-blk, which takes a similar > approach as the > > > > > > > > > > > > > > > vhost-vdpa-blk device you are proposing. I'm not > against the > > > > > > > > > > > > > > > vhost-vdpa-blk approach in priciple, but would like > to understand your > > > > > > > > > > > > > > > requirements and see if there is a way to collaborate > on one vdpa-blk > > > > > > > > > > > > > > > implementation instead of dividing our efforts between > two. 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > We prefer a simple way in the virtio hardware offloading > case, it could reduce > > > > > > > > > > > > > > our maintenance workload, we no need to maintain the > virtio-net, netdev, > > > > > > > > > > > > > > virtio-blk, bdrv and ... any more. If we need to support > other vdpa devices > > > > > > > > > > > > > > (such as virtio-crypto, virtio-fs) in the future, then > we also need to maintain > > > > > > > > > > > > > > the corresponding device emulation code? > > > > > > > > > > > > > > > > > > > > > > > > > > > > For the virtio hardware offloading case, we usually > use the vfio-pci framework, > > > > > > > > > > > > > > it saves a lot of our maintenance work in QEMU, we > don't need to touch the device > > > > > > > > > > > > > > types. Inspired by Jason, what we really prefer is > "vhost-vdpa-pci/mmio", use it to > > > > > > > > > > > > > > instead of the vfio-pci, it could provide the same > performance as vfio-pci, but it's > > > > > > > > > > > > > > *possible* to support live migrate between offloading > cards from different vendors. > > > > > > > > > > > > > > > > > > > > > > > > > > OK, so the features you are dropping would be migration > between > > > > > > > > > > > > > a vdpa, vhost and virtio backends. I think given vhost-vdpa-blk > is seems > > > > > > > > > > > > > fair enough... What do others think? > > > > > > > > > > > > > > > > > > > > > > > > I think it should be fine, and it would be even better > to make it not > > > > > > > > > > > > specific to device type. > > > > > > > > > > > > > > > > > > > > > > That's an interesting idea. A generic vDPA VirtIODevice could > exposed as > > > > > > > > > > > > > > > > > > > > > > --device vhost-vdpa-pci, > > > > > > > > > > > [vhostfd=FD,| > > > > > > > > > > > vhostpath=/dev/vhost-vdpa-N] > > > > > > > > > > > > > > > > > > > > > > (and for virtio-mmio and virtio-ccw too). > > > > > > > > > > > > > > > > > > > > > > I don't think this is possible yet because the vhost_vdpa > ioctls are > > > > > > > > > > > missing some introspection functionality. Here is what I > found: > > > > > > > > > > > - Device ID: ok, use VHOST_VDPA_GET_DEVICE_ID > > > > > > > > > > > - Device feature bits: ok, use VHOST_GET_BACKEND_FEATURES > > > > > > > > > > > - Configuration space size: missing, need ioctl for > ops->get_config_size() > > > > > > > > > > > > > > > > > > > > Any specific reason that we need this considering we've already > had > > > > > > > > > > VHOST_VDPA_GET_CONFIG and we do the size validation there? > > > > > > > > > > > > > > > > > > QEMU's virtio_init() takes a size_t config_size argument. We > need to > > > > > > > > > determine the size of the vhost_vdpa's configuration space in > order to > > > > > > > > > create the VirtIODevice in QEMU. > > > > > > > > > > > > > > > > > > Do you mean probing by checking for the VHOST_VDPA_GET_CONFIG > -E2BIG > > > > > > > > > return value? It's hacky but I guess it's possible to do a binary > search > > > > > > > > > that calls VHOST_VDPA_GET_CONFIG each iteration and reduces the > size if > > > > > > > > > -E2BIG is returned or increases the size otherwise. > > > > > > > > > > > > > > > > > > Or do you mean re-writing QEMU's hw/virtio/virtio.c to allow > the > > > > > > > > > VirtIODevice to override the size and we pass accesses through > to > > > > > > > > > vhost_vdpa. 
That way it might be possible to avoid fetching the > > > > > > > > > configuration space size at startup, but I'm not sure this will > work > > > > > > > > > because QEMU might depend on knowing the exact size (e.g. live > > > > > > > > > migration). > > > > > > > > > > > > > > > > Good point, so looking at virtio-blk it has: > > > > > > > > > > > > > > > > virtio_blk_set_config_size(s, s->host_features); > > > > > > > > virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, > s->config_size); > > > > > > > > > > > > > > > > I think here virtio-blk/net should check the vhost-vdpa features > here > > > > > > > > and fail if they are not the same? > > > > > > > > > > > > > > The vhost feature bit code in QEMU is complicated and I can't respond > > > > > > > without investing too much time studying it :). > > > > > > > > > > > > > > > This looks better than overriding the config_size with what vhost-vdpa > > > > > > > > provides since it can override the features that the cli tries > to > > > > > > > > enable. > > > > > > > > > > > > > > I'm thinking about the generic --device vhost-vdpa idea. QEMU should > not > > > > > > > require knowledge of the device feature bits in that case, so it > cannot > > > > > > > calculate the configuration space size. > > > > > > > > > > > > In this case, it looks to me the config size could be deduced from > > > > > > VHOST_VDPA_GET_FEATURES? > > > > > > > > > > I think we're talking about different things, see below... > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > - Max virtqueue size: ok, VHOST_VDPA_GET_VRING_NUM > > > > > > > > > > > - Number of virtqueues: probe using VHOST_GET_VRING_BASE? > > > > > > > > > > > > > > > > > > > > I'm not sure whether or not we need this and it seems not necessary > > > > > > > > > > since it can be deduced from the config space and features. > > > > > > > > > > > > > > > > > > It can only be deduced in a device-specific way (net, blk, etc). > I can't > > > > > > > > > think of a way to detect the number of virtqueues for an arbitrary > > > > > > > > > VIRTIO device from the features bits and configuration space > contents. > > > > > > > > > > > > > > > > Yes, I'm not against this idea but it looks to me it works even > without this. > > > > > > > > > > > > > > > > Modern PCI has num_queues but we don't have things like this in > MMIO > > > > > > > > and legacy PCI. > > > > > > > > > > > > > > Even if the VIRTIO hardware interface doesn't expose this information > to > > > > > > > the guest, QEMU's VirtIODevice API needs it. Device emulation code > must > > > > > > > call virtio_add_queue() to expose virtqueues to the guest. > > > > > > > > > > > > We don't need this for current multiqueue virtio-net with vhost-vdpa > > > > > > since the queue num were deduced from the VHOST_VDPA_GET_CONFIG during > > > > > > the initialization of vhost-vdpa backend. > > > > > > > > > > > > If we are talking about generic vhost-vdpa-pci, we don't need > > > > > > virtio_add_queue() in this case. > > > > > > > > > > When I say --device vhost-vdpa I mean a VirtIODevice in QEMU that takes > > > > > any /dev/vhost-vdpa-N and exposes the device to the guest (over > > > > > virtio-pci, virtio-mmio, or virtio-ccw). It's generic because it has > no > > > > > knowledge of specific device types. This means new device types can be > > > > > added without modifying QEMU. 
> > > > > > > > > > I think the model you are describing is not generic because it relies > on > > > > > knowledge of specific device types (net, blk, scsi, etc) so it can > > > > > interpret feature bits and configuration space fields. > > > > > > > > Yes, but what I meant is that in this case qemu can simply relay the > > > > set/get config to vhost-vdpa. And the guest driver can enumerate the > > > > number of queues correctly depending on his own knowledge. > > > > > > That requires changes to how virtqueues are managed by > > > hw/virtio/virtio.c because today the code assumes QEMU knows the number > > > of virtqueues. virtio_add_queue() must be called by device emulation > > > before the guest driver can configure a virtqueue. > > > > Right. > > > > > > > > > > > > > > > When you originally said "it would be even better to make it not > > > > > specific to device type" I thought you meant a generic --device > > > > > vhost-vdpa and that's what I've been describing, but in your recent > > > > > replies I guess you have a different model in mind. > > > > > > > > > > Are there reasons why the generic model won't work? > > > > > > > > I think not. > > > > > > > > One thing comes to my mind is that since we provide num_queues via > > > > modern virtio-pci, this is probably another call for having the API > > > > you described. > > > > > > > > For the general vhost-vdpa backend, the only thing that may block us > > > > is the migration. If we want to make vhost-vdpa type independent, we > > > > need first investigate the independent migration facility in virtio > > > > spec which is still suspicious. > > > > > > Yes, definitely. > > > > > > Another challenge with migration is that the generic vhost-vdpa vmstate > > > probably won't be compatible with QEMU's virtio-net/blk/scsi/etc > > > vmstates. It would be nice if it was possible to migrate between QEMU > > > and vDPA device models since they both implement the same device types. > > > > > > Maybe the solution is for QEMU's virtio device models to switch to the > > > new VIRTIO save/load data format once that has been defined in the spec. > > > Then the QEMU VirtIODevice vmstate would be: > > > 1. QEMU-specific VirtIODevice state (virtqueue state, etc) > > > 2. VIRTIO standard device save/load data (virtio-net mac table, etc) > > > > Right. The question is that do we expect the exact byte stream format > > defined in the spec? It looks to me it's sufficient to define each > > state that is required for the live migration and leave the byte > > stream format to be implementation specific. If we manage to do this, > > there's still a chance that we can live migration between those two. > > Yes. I think the pros/cons of translation are better compatibility but > more complex code. Not sure if maintaining a QEMU-specific save/load > format in addition to the standard VIRTIO format is desirable in the > long term. > > > > > > > It's still not clear to me how much of the VIRTIO device save/load data > > > is implementation-specific. I think the next step forward is to review > > > the QEMU vmstates for virtio-net, virtio-gpu, etc to figure out whether > > > we can really standardize the save/load data. > > > > Yes, and it should not be hard to have a general load and save based > > on key/value pairs which could be defined in the spec. Ideally, it > > should be more than enough to enumerate the keys based on the > > negotiated features. 
(But as discussed, virtio-fs and other stateful > > devices seem more complicated and a lot of spec work seems like a > > requirement before support this). > > Great, I'm glad we had a chance to discuss this. It has helped me > understand the direction things are heading in. > > Migration isn't a dependency for what Longpeng is doing in this patch > series. The generic --device vdpa-vhost can already be implemented today > without live migration support. Adding the vhost_vdpa ioctls we > discussed would be nice although it seems possible (but hacky) for QEMU > to probe using existing ioctls too. > > Longpeng: Do you want to generalize this patch into a --device vdpa-host > that supports all device types? > Yes, I already told Jason that I'll start to develop the device next week :) > Stefan
On Mon, Dec 20, 2021 at 09:17:40AM +0000, Longpeng (Mike, Cloud Infrastructure Service Product Dept.) wrote: > > -----Original Message----- > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com] > > Sent: Monday, December 20, 2021 4:11 PM > > To: Jason Wang <jasowang@redhat.com> > > Cc: Michael S. Tsirkin <mst@redhat.com>; Longpeng (Mike, Cloud Infrastructure > > Service Product Dept.) <longpeng2@huawei.com>; parav@nvidia.com; > > xieyongji@bytedance.com; sgarzare@redhat.com; Yechuan <yechuan@huawei.com>; > > Gonglei (Arei) <arei.gonglei@huawei.com>; qemu-devel@nongnu.org; Dr. David > > Alan Gilbert <dgilbert@redhat.com> > > Subject: Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support > > > > On Mon, Dec 20, 2021 at 10:48:09AM +0800, Jason Wang wrote: > > > On Fri, Dec 17, 2021 at 4:35 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > Longpeng: Do you want to generalize this patch into a --device vdpa-host > > that supports all device types? > > > > Yes, I already told Jason that I'll start to develop the device next week :) That's great! Stefan
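For contrast with the net-specific realize function in the patch below, a type-independent realize path might look roughly like the following. This is purely a sketch of the idea discussed above, not Longpeng's actual follow-up work: the VHostVdpaDevice type, the VHOST_VDPA_DEVICE() cast macro, the vdpa_probe_*() helpers (wrapping the probing shown earlier), and the omitted QOM boilerplate and handle_output callback are all assumed, and the number-of-virtqueues probe is exactly the open question raised in the thread.

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-vdpa.h"

/* Hypothetical instance state for a device with no per-type knowledge. */
typedef struct VHostVdpaDevice {
    VirtIODevice parent_obj;
    char *vdpa_dev;            /* path to /dev/vhost-vdpa-N */
    struct vhost_dev dev;
    struct vhost_vdpa vdpa;
    VirtQueue **virtqs;
    uint32_t vdev_id;          /* from VHOST_VDPA_GET_DEVICE_ID */
    uint32_t config_size;      /* probed, see the earlier sketch */
    uint16_t num_queues;       /* the open question in this thread */
    uint16_t queue_size;
} VHostVdpaDevice;

static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VHostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev);   /* assumed cast macro */
    int i, ret;

    s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR);
    if (s->vdpa.device_fd == -1) {
        error_setg(errp, "open %s failed: %s", s->vdpa_dev, strerror(errno));
        return;
    }

    /* Everything below comes from the device, not from per-type knowledge. */
    s->vdev_id = vdpa_probe_device_id(s->vdpa.device_fd);
    s->config_size = vdpa_probe_config_size(s->vdpa.device_fd);
    s->num_queues = vdpa_probe_num_queues(s->vdpa.device_fd);

    virtio_init(vdev, "vhost-vdpa", s->vdev_id, s->config_size);

    s->dev.nvqs = s->num_queues;
    s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs);
    s->virtqs = g_new0(VirtQueue *, s->dev.nvqs);
    for (i = 0; i < s->dev.nvqs; i++) {
        /* No per-type queue layout: every probed queue is exposed as-is. */
        s->virtqs[i] = virtio_add_queue(vdev, s->queue_size,
                                        vhost_vdpa_device_handle_output);
    }

    ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
    if (ret < 0) {
        error_setg(errp, "vhost-vdpa: vhost initialization failed: %s",
                   strerror(-ret));
    }
}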
diff --git a/hw/net/meson.build b/hw/net/meson.build
index bdf71f1..139ebc4 100644
--- a/hw/net/meson.build
+++ b/hw/net/meson.build
@@ -44,6 +44,7 @@ specific_ss.add(when: 'CONFIG_XILINX_ETHLITE', if_true: files('xilinx_ethlite.c'
 
 softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('net_rx_pkt.c'))
 specific_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net.c'))
+specific_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: files('vhost-vdpa-net.c'))
 
 softmmu_ss.add(when: ['CONFIG_VIRTIO_NET', 'CONFIG_VHOST_NET'], if_true: files('vhost_net.c'), if_false: files('vhost_net-stub.c'))
 softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost_net-stub.c'))
diff --git a/hw/net/vhost-vdpa-net.c b/hw/net/vhost-vdpa-net.c
new file mode 100644
index 0000000..48b99f9
--- /dev/null
+++ b/hw/net/vhost-vdpa-net.c
@@ -0,0 +1,338 @@
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-vdpa-net.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+#include "net/vhost-vdpa.h"
+
+static void vhost_vdpa_net_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+
+    memcpy(config, &s->netcfg, sizeof(struct virtio_net_config));
+}
+
+static void vhost_vdpa_net_set_config(VirtIODevice *vdev, const uint8_t *config)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    struct virtio_net_config *netcfg = (struct virtio_net_config *)config;
+    int ret;
+
+    ret = vhost_dev_set_config(&s->dev, (uint8_t *)netcfg, 0, sizeof(*netcfg),
+                               VHOST_SET_CONFIG_TYPE_MASTER);
+    if (ret) {
+        error_report("set device config space failed");
+        return;
+    }
+}
+
+static uint64_t vhost_vdpa_net_get_features(VirtIODevice *vdev,
+                                            uint64_t features,
+                                            Error **errp)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+
+    virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
+    virtio_add_feature(&features, VIRTIO_NET_F_MAC);
+    virtio_add_feature(&features, VIRTIO_NET_F_GSO);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ECN);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_UFO);
+    virtio_add_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE);
+    virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
+    virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
+    virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);
+    virtio_add_feature(&features, VIRTIO_NET_F_HOST_UFO);
+    virtio_add_feature(&features, VIRTIO_NET_F_MRG_RXBUF);
+    virtio_add_feature(&features, VIRTIO_NET_F_STATUS);
+    virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VQ);
+    virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX);
+    virtio_add_feature(&features, VIRTIO_NET_F_CTRL_VLAN);
+    virtio_add_feature(&features, VIRTIO_NET_F_CTRL_RX_EXTRA);
+    virtio_add_feature(&features, VIRTIO_NET_F_CTRL_MAC_ADDR);
+    virtio_add_feature(&features, VIRTIO_NET_F_MQ);
+
+    return vhost_get_features(&s->dev, vdpa_feature_bits, features);
+}
+
+static int vhost_vdpa_net_start(VirtIODevice *vdev, Error **errp)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int i, ret;
+
+    if (!k->set_guest_notifiers) {
+        error_setg(errp, "binding does not support guest notifiers");
+        return -ENOSYS;
+    }
+
+    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error enabling host notifiers");
+        return ret;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error binding guest notifier");
+        goto err_host_notifiers;
+    }
+
+    s->dev.acked_features = vdev->guest_features;
+
+    ret = vhost_dev_start(&s->dev, vdev);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error starting vhost");
+        goto err_guest_notifiers;
+    }
+    s->started = true;
+
+    /* guest_notifier_mask/pending not used yet, so just unmask
+     * everything here. virtio-pci will do the right thing by
+     * enabling/disabling irqfd.
+     */
+    for (i = 0; i < s->dev.nvqs; i++) {
+        vhost_virtqueue_mask(&s->dev, vdev, i, false);
+    }
+
+    return ret;
+
+err_guest_notifiers:
+    k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+err_host_notifiers:
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+    return ret;
+}
+
+static void vhost_vdpa_net_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    Error *local_err = NULL;
+    int i, ret;
+
+    if (!vdev->start_on_kick) {
+        return;
+    }
+
+    if (s->dev.started) {
+        return;
+    }
+
+    /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
+     * vhost here instead of waiting for .set_status().
+     */
+    ret = vhost_vdpa_net_start(vdev, &local_err);
+    if (ret < 0) {
+        error_reportf_err(local_err, "vhost-vdpa-net: start failed: ");
+        return;
+    }
+
+    /* Kick right away to begin processing requests already in vring */
+    for (i = 0; i < s->dev.nvqs; i++) {
+        VirtQueue *kick_vq = virtio_get_queue(vdev, i);
+
+        if (!virtio_queue_get_desc_addr(vdev, i)) {
+            continue;
+        }
+        event_notifier_set(virtio_queue_get_host_notifier(kick_vq));
+    }
+}
+
+static void vhost_vdpa_net_stop(VirtIODevice *vdev)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int ret;
+
+    if (!s->started) {
+        return;
+    }
+    s->started = false;
+
+    if (!k->set_guest_notifiers) {
+        return;
+    }
+
+    vhost_dev_stop(&s->dev, vdev);
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+    if (ret < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", ret);
+        return;
+    }
+
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+}
+
+static void vhost_vdpa_net_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    bool should_start = virtio_device_started(vdev, status);
+    Error *local_err = NULL;
+    int ret;
+
+    if (!vdev->vm_running) {
+        should_start = false;
+    }
+
+    if (s->started == should_start) {
+        return;
+    }
+
+    if (should_start) {
+        ret = vhost_vdpa_net_start(vdev, &local_err);
+        if (ret < 0) {
+            error_reportf_err(local_err, "vhost-vdpa-net: start failed: ");
+        }
+    } else {
+        vhost_vdpa_net_stop(vdev);
+    }
+}
+
+static void vhost_vdpa_net_unrealize(VHostVdpaNet *s)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    int i;
+
+    for (i = 0; i < s->queue_pairs * 2; i++) {
+        virtio_delete_queue(s->virtqs[i]);
+    }
+    /* ctrl vq */
+    virtio_delete_queue(s->virtqs[i]);
+
+    g_free(s->virtqs);
+    virtio_cleanup(vdev);
+}
+
+static void vhost_vdpa_net_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+    int i, ret;
+
+    s->vdpa.device_fd = qemu_open_old(s->vdpa_dev, O_RDWR);
+    if (s->vdpa.device_fd == -1) {
+        error_setg(errp, "vhost-vdpa-net: open %s failed: %s",
+                   s->vdpa_dev, strerror(errno));
+        return;
+    }
+
+    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET,
+                sizeof(struct virtio_net_config));
+
+    s->dev.nvqs = s->queue_pairs * 2 + 1;
+    s->dev.vqs = g_new0(struct vhost_virtqueue, s->dev.nvqs);
+    s->dev.vq_index = 0;
+    s->dev.vq_index_end = s->dev.nvqs;
+    s->dev.backend_features = 0;
+    s->started = false;
+
+    s->virtqs = g_new0(VirtQueue *, s->dev.nvqs);
+    for (i = 0; i < s->dev.nvqs; i++) {
+        s->virtqs[i] = virtio_add_queue(vdev, s->queue_size,
+                                        vhost_vdpa_net_handle_output);
+    }
+
+    ret = vhost_dev_init(&s->dev, &s->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+    if (ret < 0) {
+        error_setg(errp, "vhost-vdpa-net: vhost initialization failed: %s",
+                   strerror(-ret));
+        goto init_err;
+    }
+
+    ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->netcfg,
+                               sizeof(struct virtio_net_config), NULL);
+    if (ret < 0) {
+        error_setg(errp, "vhost-vdpa-net: get network config failed");
+        goto config_err;
+    }
+
+    return;
+config_err:
+    vhost_dev_cleanup(&s->dev);
+init_err:
+    vhost_vdpa_net_unrealize(s);
+    close(s->vdpa.device_fd);
+}
+
+static void vhost_vdpa_net_device_unrealize(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostVdpaNet *s = VHOST_VDPA_NET(vdev);
+
+    virtio_set_status(vdev, 0);
+    vhost_dev_cleanup(&s->dev);
+    vhost_vdpa_net_unrealize(s);
+    close(s->vdpa.device_fd);
+}
+
+static const VMStateDescription vmstate_vhost_vdpa_net = {
+    .name = "vhost-vdpa-net",
+    .minimum_version_id = 1,
+    .version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_VIRTIO_DEVICE,
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static void vhost_vdpa_net_instance_init(Object *obj)
+{
+    VHostVdpaNet *s = VHOST_VDPA_NET(obj);
+
+    device_add_bootindex_property(obj, &s->bootindex, "bootindex",
+                                  "/ethernet-phy@0,0", DEVICE(obj));
+}
+
+static Property vhost_vdpa_net_properties[] = {
+    DEFINE_PROP_STRING("vdpa-dev", VHostVdpaNet, vdpa_dev),
+    DEFINE_PROP_UINT16("queue-pairs", VHostVdpaNet, queue_pairs,
+                       VHOST_VDPA_NET_AUTO_QUEUE_PAIRS),
+    DEFINE_PROP_UINT32("queue-size", VHostVdpaNet, queue_size,
+                       VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_vdpa_net_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, vhost_vdpa_net_properties);
+    dc->vmsd = &vmstate_vhost_vdpa_net;
+    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+    vdc->realize = vhost_vdpa_net_device_realize;
+    vdc->unrealize = vhost_vdpa_net_device_unrealize;
+    vdc->get_config = vhost_vdpa_net_get_config;
+    vdc->set_config = vhost_vdpa_net_set_config;
+    vdc->get_features = vhost_vdpa_net_get_features;
+    vdc->set_status = vhost_vdpa_net_set_status;
+}
+
+static const TypeInfo vhost_vdpa_net_info = {
+    .name = TYPE_VHOST_VDPA_NET,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VHostVdpaNet),
+    .instance_init = vhost_vdpa_net_instance_init,
+    .class_init = vhost_vdpa_net_class_init,
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&vhost_vdpa_net_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index c144d42..50dba2e 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -68,3 +68,8 @@ config VHOST_USER_RNG
     bool
     default y
     depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_NET
+    bool
+    default y if VIRTIO_PCI
+    depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 521f7d6..3089222 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -34,6 +34,7 @@ virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VDPA_NET', if_true: files('vhost-vdpa-net-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
 virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
diff --git a/hw/virtio/vhost-vdpa-net-pci.c b/hw/virtio/vhost-vdpa-net-pci.c
new file mode 100644
index 0000000..84199a8
--- /dev/null
+++ b/hw/virtio/vhost-vdpa-net-pci.c
@@ -0,0 +1,118 @@
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vhost-vdpa-net.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "virtio-pci.h"
+#include "qom/object.h"
+#include "net/vhost-vdpa.h"
+
+typedef struct VHostVdpaNetPCI VHostVdpaNetPCI;
+
+#define TYPE_VHOST_VDPA_NET_PCI "vhost-vdpa-net-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostVdpaNetPCI, VHOST_VDPA_NET_PCI,
+                         TYPE_VHOST_VDPA_NET_PCI)
+
+struct VHostVdpaNetPCI {
+    VirtIOPCIProxy parent_obj;
+    VHostVdpaNet vdev;
+};
+
+static Property vhost_vdpa_net_pci_properties[] = {
+    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+                       DEV_NVECTORS_UNSPECIFIED),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static int vhost_vdpa_net_get_queue_pairs(VHostVdpaNetPCI *dev, Error **errp)
+{
+    int device_fd, queue_pairs;
+    int has_cvq;
+
+    device_fd = qemu_open_old(dev->vdev.vdpa_dev, O_RDWR);
+    if (device_fd == -1) {
+        error_setg(errp, "vhost-vdpa-net: open %s failed: %s",
+                   dev->vdev.vdpa_dev, strerror(errno));
+        return -1;
+    }
+
+    queue_pairs = vhost_vdpa_get_max_queue_pairs(device_fd, &has_cvq, errp);
+    if (queue_pairs < 0) {
+        error_setg(errp, "vhost-vdpa-net: get queue pairs failed: %s",
+                   strerror(errno));
+        goto out;
+    }
+
+    if (!has_cvq) {
+        error_setg(errp, "vhost-vdpa-net: not support ctrl vq");
+    }
+
+out:
+    close(device_fd);
+    return queue_pairs;
+}
+
+static void vhost_vdpa_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+
+    if (dev->vdev.queue_pairs == VHOST_VDPA_NET_AUTO_QUEUE_PAIRS) {
+        dev->vdev.queue_pairs = vhost_vdpa_net_get_queue_pairs(dev, errp);
+        if (*errp) {
+            return;
+        }
+    }
+
+    if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+        vpci_dev->nvectors = dev->vdev.queue_pairs * 2 + 1;
+    }
+
+    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vdpa_net_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+    device_class_set_props(dc, vhost_vdpa_net_pci_properties);
+    k->realize = vhost_vdpa_net_pci_realize;
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_NET;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_NETWORK_ETHERNET;
+}
+
+static void vhost_vdpa_net_pci_instance_init(Object *obj)
+{
+    VHostVdpaNetPCI *dev = VHOST_VDPA_NET_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_VDPA_NET);
+    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+                              "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vdpa_net_pci_info = {
+    .base_name = TYPE_VHOST_VDPA_NET_PCI,
+    .generic_name = "vhost-vdpa-net-pci",
+    .transitional_name = "vhost-vdpa-net-pci-transitional",
+    .non_transitional_name = "vhost-vdpa-net-pci-non-transitional",
+    .instance_size = sizeof(VHostVdpaNetPCI),
+    .instance_init = vhost_vdpa_net_pci_instance_init,
+    .class_init = vhost_vdpa_net_pci_class_init,
+};
+
+static void vhost_vdpa_net_pci_register(void)
+{
+    virtio_pci_types_register(&vhost_vdpa_net_pci_info);
+}
+
+type_init(vhost_vdpa_net_pci_register)
diff --git a/include/hw/virtio/vhost-vdpa-net.h b/include/hw/virtio/vhost-vdpa-net.h
new file mode 100644
index 0000000..63bf3a6
--- /dev/null
+++ b/include/hw/virtio/vhost-vdpa-net.h
@@ -0,0 +1,31 @@
+#ifndef VHOST_VDPA_NET_H
+#define VHOST_VDPA_NET_H
+
+#include "standard-headers/linux/virtio_blk.h"
+#include "hw/block/block.h"
+#include "chardev/char-fe.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-vdpa.h"
+#include "hw/virtio/virtio-net.h"
+#include "qom/object.h"
+
+#define TYPE_VHOST_VDPA_NET "vhost-vdpa-net"
+OBJECT_DECLARE_SIMPLE_TYPE(VHostVdpaNet, VHOST_VDPA_NET)
+
+struct VHostVdpaNet {
+    VirtIODevice parent_obj;
+    int32_t bootindex;
+    struct virtio_net_config netcfg;
+    uint16_t queue_pairs;
+    uint32_t queue_size;
+    struct vhost_dev dev;
+    VirtQueue **virtqs;
+    struct vhost_vdpa vdpa;
+    char *vdpa_dev;
+    bool started;
+};
+
+#define VHOST_VDPA_NET_AUTO_QUEUE_PAIRS UINT16_MAX
+#define VHOST_VDPA_NET_QUEUE_DEFAULT_SIZE 256
+
+#endif
diff --git a/include/net/vhost-vdpa.h b/include/net/vhost-vdpa.h
index b81f9a6..f029972 100644
--- a/include/net/vhost-vdpa.h
+++ b/include/net/vhost-vdpa.h
@@ -18,4 +18,6 @@ struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState *nc);
 
 extern const int vdpa_feature_bits[];
 
+int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp);
+
 #endif /* VHOST_VDPA_H */
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 25dd6dd..8ee6ba5 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -219,7 +219,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     return nc;
 }
 
-static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
+int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
 {
     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
     g_autofree struct vhost_vdpa_config *config = NULL;