From patchwork Mon Nov 2 22:24:33 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Michael S. Tsirkin" X-Patchwork-Id: 57157 Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id nA2MRD51019413 for ; Mon, 2 Nov 2009 22:27:13 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756960AbZKBW1G (ORCPT ); Mon, 2 Nov 2009 17:27:06 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1756899AbZKBW1G (ORCPT ); Mon, 2 Nov 2009 17:27:06 -0500 Received: from mx1.redhat.com ([209.132.183.28]:15794 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756886AbZKBW1E (ORCPT ); Mon, 2 Nov 2009 17:27:04 -0500 Received: from int-mx01.intmail.prod.int.phx2.redhat.com (int-mx01.intmail.prod.int.phx2.redhat.com [10.5.11.11]) by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id nA2MR26V032392; Mon, 2 Nov 2009 17:27:02 -0500 Received: from redhat.com (vpn-6-135.tlv.redhat.com [10.35.6.135]) by int-mx01.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with SMTP id nA2MQuvF001544; Mon, 2 Nov 2009 17:26:57 -0500 Date: Tue, 3 Nov 2009 00:24:33 +0200 From: "Michael S. 
Tsirkin" To: avi@redhat.com, kvm@vger.kernel.org, virtualization@lists.linux-foundation.org Cc: gregory.haskins@gmail.com Subject: [PATCHv4 6/6] qemu-kvm: vhost-net implementation Message-ID: <20091102222433.GG15153@redhat.com> References: MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.19 (2009-01-05) X-Scanned-By: MIMEDefang 2.67 on 10.5.11.11 Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org diff --git a/Makefile.target b/Makefile.target index acee285..0d8e688 100644 --- a/Makefile.target +++ b/Makefile.target @@ -160,7 +160,8 @@ obj-y = vl.o monitor.o pci.o isa_mmio.o machine.o \ gdbstub.o gdbstub-xml.o # virtio has to be here due to weird dependency between PCI and virtio-net. # need to fix this properly -obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o +obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o \ + vhost_net.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o # MSI-X depends on kvm for interrupt injection, # so moved it from Makefile.hw to Makefile.target for now diff --git a/hw/vhost_net.c b/hw/vhost_net.c new file mode 100644 index 0000000..bc179ab --- /dev/null +++ b/hw/vhost_net.c @@ -0,0 +1,251 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "net.h" +#include "qemu-kvm.h" + +#include "vhost_net.h" + +static int vhost_virtqueue_init(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + struct VirtQueue *q, + unsigned idx) +{ + target_phys_addr_t s, l; + int r; + struct vhost_vring_addr addr = { + .index = idx, + }; + struct vhost_vring_file file = { + .index = idx, + }; + struct vhost_vring_state size = { + .index = idx, + }; + + size.num = q->vring.num; + r = ioctl(dev->control, VHOST_SET_VRING_NUM, &size); + if (r) + return -errno; + + file.fd = vq->kick = eventfd(0, 0); + r = ioctl(dev->control, 
VHOST_SET_VRING_KICK, &file); + if (r) + return -errno; + file.fd = vq->call = eventfd(0, 0); + r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + if (r) + return -errno; + + s = l = sizeof(struct vring_desc) * q->vring.num; + vq->desc = cpu_physical_memory_map(q->vring.desc, &l, 0); + if (!vq->desc || l != s) + return -ENOMEM; + addr.user_addr = (u_int64_t)(unsigned long)vq->desc; + r = ioctl(dev->control, VHOST_SET_VRING_DESC, &addr); + if (r < 0) + return -errno; + s = l = offsetof(struct vring_avail, ring) + + sizeof(u_int64_t) * q->vring.num; + vq->avail = cpu_physical_memory_map(q->vring.avail, &l, 0); + if (!vq->avail || l != s) + return -ENOMEM; + addr.user_addr = (u_int64_t)(unsigned long)vq->avail; + r = ioctl(dev->control, VHOST_SET_VRING_AVAIL, &addr); + if (r < 0) + return -errno; + s = l = offsetof(struct vring_used, ring) + + sizeof(struct vring_used_elem) * q->vring.num; + vq->used = cpu_physical_memory_map(q->vring.used, &l, 1); + if (!vq->used || l != s) + return -ENOMEM; + addr.user_addr = (u_int64_t)(unsigned long)vq->used; + r = ioctl(dev->control, VHOST_SET_VRING_USED, &addr); + if (r < 0) + return -errno; + + r = vdev->binding->irqfd(vdev->binding_opaque, q->vector, vq->call); + if (r < 0) + return -errno; + + r = vdev->binding->queuefd(vdev->binding_opaque, idx, vq->kick); + if (r < 0) + return -errno; + + return 0; +} + +static int vhost_dev_init(struct vhost_dev *hdev) +{ + uint64_t features; + int r; + hdev->control = open("/dev/vhost-net", O_RDWR); + if (hdev->control < 0) + return -errno; + r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); + if (r < 0) + return -errno; + + r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); + if (r < 0) + return -errno; + hdev->features = features; + return 0; +} + +static void vhost_dev_cleanup(struct vhost_dev *hdev) +{ + close(hdev->control); +} + +static int vhost_dev_start(struct vhost_dev *hdev, + VirtIODevice *vdev) +{ + int i, r, n = 0; + struct vhost_memory *mem; + + r = 
ioctl(hdev->control, VHOST_ACK_FEATURES, &hdev->acked_features); + if (r < 0) + return -errno; + + for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) { + if (!slots[i].len || (slots[i].flags & KVM_MEM_LOG_DIRTY_PAGES)) { + continue; + } + ++n; + } + + mem = qemu_mallocz(offsetof(struct vhost_memory, regions) + + n * sizeof(struct vhost_memory_region)); + if (!mem) + return -ENOMEM; + mem->nregions = n; + n = 0; + for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) { + if (!slots[i].len || (slots[i].flags & KVM_MEM_LOG_DIRTY_PAGES)) { + continue; + } + mem->regions[n].guest_phys_addr = slots[i].phys_addr; + mem->regions[n].memory_size = slots[i].len; + mem->regions[n].userspace_addr = slots[i].userspace_addr; + ++n; + } + + r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, mem); + if (r < 0) + return -errno; + + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_init(hdev, + vdev, + hdev->vqs + i, + vdev->vq + i, + i); + if (r < 0) + return r; + } + + return 0; +} + +unsigned vhost_net_get_features(struct vhost_net *net) +{ + unsigned features = 0; + if (net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) + features |= VIRTIO_F_NOTIFY_ON_EMPTY; + if (net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) + features |= VIRTIO_RING_F_INDIRECT_DESC; + return features; +} + +void vhost_net_ack_features(struct vhost_net *net, unsigned features) +{ + net->dev.acked_features = net->dev.backend_features; + if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) + net->dev.acked_features |= VIRTIO_F_NOTIFY_ON_EMPTY; + if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) + net->dev.acked_features |= VIRTIO_RING_F_INDIRECT_DESC; +} + +static int vhost_net_get_fd(VLANClientState *backend, + unsigned long long *backend_features) +{ + int r; + r = raw_get_fd(backend); + if (r >= 0) { + *backend_features = (1 << VHOST_NET_F_VIRTIO_NET_HDR); + return r; + } + r = tap_get_fd(backend); + if (r >= 0) { + *backend_features = 0; + return r; + } + fprintf(stderr, "vhost requires raw socket or tap 
backend\n"); + return -EBADFD; +} + +int vhost_net_init(struct vhost_net *net, VLANClientState *backend) +{ + int r; + + if (!backend) { + fprintf(stderr, "vhost requires backend to be setup\n"); + return -EINVAL; + } + r = vhost_net_get_fd(backend, &net->dev.backend_features); + if (r < 0) + return r; + net->backend = r; + + r = vhost_dev_init(&net->dev); + if (r < 0) + return r; + if (~net->dev.features & net->dev.backend_features) { + fprintf(stderr, "vhost lacks feature mask %llu for backend\n", + ~net->dev.features & net->dev.backend_features); + vhost_dev_cleanup(&net->dev); + return -EINVAL; + } + + /* Set sane init value. Override when guest acks. */ + vhost_net_ack_features(net, 0); + return 0; +} + +int vhost_net_start(struct vhost_net *net, + VirtIODevice *dev) +{ + struct vhost_vring_file file = { }; + int r; + + net->dev.nvqs = 2; + net->dev.vqs = net->vqs; + r = vhost_dev_start(&net->dev, dev); + if (r < 0) + return r; + + /* Stop polling backend from qemu. */ + qemu_set_fd_handler(net->backend, NULL, NULL, NULL); + file.fd = net->backend; + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + if (r < 0) { + /* TODO: cleanup on error. 
*/ + return -errno; + } + } + return 0; +} diff --git a/hw/vhost_net.h b/hw/vhost_net.h new file mode 100644 index 0000000..65720e1 --- /dev/null +++ b/hw/vhost_net.h @@ -0,0 +1,38 @@ +#ifndef VHOST_NET_H +#define VHOST_NET_H + +#include "hw/virtio.h" + +struct vhost_virtqueue { + int kick; + int call; + void *desc; + void *avail; + void *used; +}; + +struct vhost_dev { + int control; + struct vhost_virtqueue *vqs; + int nvqs; + unsigned long long features; + unsigned long long acked_features; + unsigned long long backend_features; +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; + int backend; +}; + +int vhost_net_init(struct vhost_net *net, + VLANClientState *backend); + +int vhost_net_start(struct vhost_net *net, + VirtIODevice *dev); + +unsigned vhost_net_get_features(struct vhost_net *net); +void vhost_net_ack_features(struct vhost_net *net, unsigned features); + +#endif diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 2e51a6a..3b0b947 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -19,6 +19,8 @@ #include "qemu-kvm.h" #endif +#include "vhost_net.h" + #define TAP_VNET_HDR #define VIRTIO_NET_VM_VERSION 10 @@ -56,6 +58,8 @@ typedef struct VirtIONet uint8_t *macs; } mac_table; uint32_t *vlans; + int vhost_device; + struct vhost_net vhost; } VirtIONet; /* TODO @@ -127,16 +131,10 @@ static void virtio_net_reset(VirtIODevice *vdev) static uint32_t virtio_net_get_features(VirtIODevice *vdev) { - uint32_t features = (1 << VIRTIO_NET_F_MAC) | - (1 << VIRTIO_NET_F_MRG_RXBUF) | - (1 << VIRTIO_NET_F_STATUS) | - (1 << VIRTIO_NET_F_CTRL_VQ) | - (1 << VIRTIO_NET_F_CTRL_RX) | - (1 << VIRTIO_NET_F_CTRL_VLAN) | - (1 << VIRTIO_NET_F_CTRL_RX_EXTRA); + uint32_t features = 0; + VirtIONet *n = to_virtio_net(vdev); #ifdef TAP_VNET_HDR - VirtIONet *n = to_virtio_net(vdev); VLANClientState *host = n->vc->vlan->first_client; if (tap_has_vnet_hdr(host)) { @@ -149,12 +147,23 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev) features 
|= (1 << VIRTIO_NET_F_HOST_TSO4); features |= (1 << VIRTIO_NET_F_HOST_TSO6); features |= (1 << VIRTIO_NET_F_HOST_ECN); - features |= (1 << VIRTIO_NET_F_MRG_RXBUF); /* Kernel can't actually handle UFO in software currently. */ } #endif - return features | virtio_common_features(); + if (n->vhost_device) + features |= (1 << VIRTIO_NET_F_MAC) | vhost_net_get_features(&n->vhost); + else + features |= virtio_common_features() | + (1 << VIRTIO_NET_F_MAC) | + (1 << VIRTIO_NET_F_MRG_RXBUF) | + (1 << VIRTIO_NET_F_STATUS) | + (1 << VIRTIO_NET_F_CTRL_VQ) | + (1 << VIRTIO_NET_F_CTRL_RX) | + (1 << VIRTIO_NET_F_CTRL_VLAN) | + (1 << VIRTIO_NET_F_CTRL_RX_EXTRA); + + return features; } static uint32_t virtio_net_bad_features(VirtIODevice *vdev) @@ -175,11 +184,15 @@ static uint32_t virtio_net_bad_features(VirtIODevice *vdev) static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) { VirtIONet *n = to_virtio_net(vdev); + /* vhost net supports no features */ #ifdef TAP_VNET_HDR VLANClientState *host = n->vc->vlan->first_client; #endif n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)); + if (n->vhost_device) { + vhost_net_ack_features(&n->vhost, features); + } #ifdef TAP_VNET_HDR if (!tap_has_vnet_hdr(host) || !host->set_offload) @@ -351,6 +364,9 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) static int do_virtio_net_can_receive(VirtIONet *n, int bufsize) { + if (n->vhost_device) + return 0; + if (!virtio_queue_ready(n->rx_vq) || !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) return 0; @@ -411,6 +427,7 @@ static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count) while (offset < count && i < iovcnt) { int len = MIN(iov[i].iov_len, count - offset); memcpy(iov[i].iov_base, buf + offset, len); + offset += len; i++; } @@ -611,6 +628,8 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) #else int has_vnet_hdr = 0; #endif + if (n->vhost_device) + return; if (!(n->vdev.status & 
VIRTIO_CONFIG_S_DRIVER_OK)) return; @@ -810,6 +829,8 @@ static void virtio_net_cleanup(VLANClientState *vc) { VirtIONet *n = vc->opaque; + /* TODO: vhost device cleanup */ + qemu_purge_queued_packets(vc); unregister_savevm("virtio-net", n); @@ -823,6 +844,21 @@ static void virtio_net_cleanup(VLANClientState *vc) virtio_cleanup(&n->vdev); } +static void virtio_net_driver_ok(VirtIODevice *vdev) +{ + VirtIONet *n = to_virtio_net(vdev); + int r; + + if (!n->vhost_device) + return; + + r = vhost_net_start(&n->vhost, vdev); + if (r) { + fprintf(stderr, "\nvhost_net_init returned %d\n", r); + exit(-r); + } +} + VirtIODevice *virtio_net_init(DeviceState *dev) { VirtIONet *n; @@ -831,6 +867,15 @@ VirtIODevice *virtio_net_init(DeviceState *dev) n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET, sizeof(struct virtio_net_config), sizeof(VirtIONet)); + n->vhost_device = dev->nd->vhost_device; + if (n->vhost_device) { + int r = vhost_net_init(&n->vhost, dev->nd->vlan->first_client); + if (r) { + fprintf(stderr, "Unable to initialize vhost device: %d\n", r); + virtio_cleanup(&n->vdev); + return NULL; + } + } n->vdev.get_config = virtio_net_get_config; n->vdev.set_config = virtio_net_set_config; @@ -838,6 +883,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev) n->vdev.set_features = virtio_net_set_features; n->vdev.bad_features = virtio_net_bad_features; n->vdev.reset = virtio_net_reset; + n->vdev.driver_ok = virtio_net_driver_ok; n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx); n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx); n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl); @@ -864,7 +910,6 @@ VirtIODevice *virtio_net_init(DeviceState *dev) n->vdev.nvectors = 3; else n->vdev.nvectors = dev->nd->nvectors; - register_savevm("virtio-net", virtio_net_id++, VIRTIO_NET_VM_VERSION, virtio_net_save, virtio_net_load, n); diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 0716f6f..b7f073b 100644 --- a/hw/virtio-pci.c 
+++ b/hw/virtio-pci.c @@ -15,11 +15,13 @@ #include +#include #include "virtio.h" #include "pci.h" #include "sysemu.h" #include "msix.h" #include "net.h" +#include "qemu-kvm.h" /* from Linux's linux/virtio_pci.h */ @@ -199,6 +201,8 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) vdev->status = val & 0xFF; if (vdev->status == 0) virtio_pci_reset(&proxy->pci_dev.qdev); + if ((val & VIRTIO_CONFIG_S_DRIVER_OK) && vdev->driver_ok) + vdev->driver_ok(vdev); break; case VIRTIO_MSI_CONFIG_VECTOR: msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); @@ -373,12 +377,48 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, msix_write_config(pci_dev, address, val, len); } +static int virtio_pci_irqfd(void * opaque, uint16_t vector, int fd) +{ + VirtIOPCIProxy *proxy = opaque; + struct kvm_irqfd call = { }; + int r; + + if (vector >= proxy->pci_dev.msix_entries_nr) + return -EINVAL; + if (!proxy->pci_dev.msix_entry_used[vector]) + return -ENOENT; + call.fd = fd; + call.gsi = proxy->pci_dev.msix_irq_entries[vector].gsi; + r = kvm_vm_ioctl(kvm_state, KVM_IRQFD, &call); + if (r < 0) + return r; + return 0; +} + +static int virtio_pci_queuefd(void * opaque, int n, int fd) +{ + VirtIOPCIProxy *proxy = opaque; + struct kvm_ioeventfd kick = { + .datamatch = n, + .addr = proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, + .len = 2, + .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO, + .fd = fd, + }; + int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); + if (r < 0) + return r; + return 0; +} + static const VirtIOBindings virtio_pci_bindings = { .notify = virtio_pci_notify, .save_config = virtio_pci_save_config, .load_config = virtio_pci_load_config, .save_queue = virtio_pci_save_queue, .load_queue = virtio_pci_load_queue, + .irqfd = virtio_pci_irqfd, + .queuefd = virtio_pci_queuefd, }; static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev, diff --git a/hw/virtio.c b/hw/virtio.c index 337ff27..cc5c205 100644 --- 
a/hw/virtio.c +++ b/hw/virtio.c @@ -54,24 +54,6 @@ typedef struct VRingUsed VRingUsedElem ring[0]; } VRingUsed; -typedef struct VRing -{ - unsigned int num; - target_phys_addr_t desc; - target_phys_addr_t avail; - target_phys_addr_t used; -} VRing; - -struct VirtQueue -{ - VRing vring; - target_phys_addr_t pa; - uint16_t last_avail_idx; - int inuse; - uint16_t vector; - void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); -}; - #define VIRTIO_PCI_QUEUE_MAX 16 /* virt queue functions */ @@ -401,7 +383,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) sg->iov_base = cpu_physical_memory_map(vring_desc_addr(desc_pa, i), &len, is_write); - if (sg->iov_base == NULL || len != sg->iov_len) { fprintf(stderr, "virtio: trying to map MMIO memory\n"); exit(1); diff --git a/hw/virtio.h b/hw/virtio.h index 799e608..12792da 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -54,15 +54,34 @@ struct VirtQueue; +typedef struct VRing +{ + unsigned int num; + target_phys_addr_t desc; + target_phys_addr_t avail; + target_phys_addr_t used; +} VRing; + +typedef struct VirtQueue VirtQueue; +struct VirtIODevice; +typedef struct VirtIODevice VirtIODevice; + +struct VirtQueue +{ + VRing vring; + target_phys_addr_t pa; + uint16_t last_avail_idx; + int inuse; + uint16_t vector; + void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); +}; + static inline target_phys_addr_t vring_align(target_phys_addr_t addr, unsigned long align) { return (addr + align - 1) & ~(align - 1); } -typedef struct VirtQueue VirtQueue; -typedef struct VirtIODevice VirtIODevice; - #define VIRTQUEUE_MAX_SIZE 1024 typedef struct VirtQueueElement @@ -81,6 +100,8 @@ typedef struct { void (*save_queue)(void * opaque, int n, QEMUFile *f); int (*load_config)(void * opaque, QEMUFile *f); int (*load_queue)(void * opaque, int n, QEMUFile *f); + int (*irqfd)(void * opaque, uint16_t vector, int fd); + int (*queuefd)(void * opaque, int n, int fd); } VirtIOBindings; #define VIRTIO_PCI_QUEUE_MAX 16 @@ -104,6 +125,7 @@ 
struct VirtIODevice void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); void (*reset)(VirtIODevice *vdev); + void (*driver_ok)(VirtIODevice *vdev); VirtQueue *vq; const VirtIOBindings *binding; void *binding_opaque; diff --git a/kvm/include/linux/vhost.h b/kvm/include/linux/vhost.h new file mode 100644 index 0000000..aa4ff24 --- /dev/null +++ b/kvm/include/linux/vhost.h @@ -0,0 +1,126 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include + +#include +#include +#include + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; /* Pass -1 to unbind from file. */ + +}; + +struct vhost_vring_addr { + unsigned int index; + unsigned int padding; + __u64 user_addr; +}; + +struct vhost_memory_region { + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; + __u64 flags_padding; /* No flags are currently specified. */ +}; + +/* All region addresses and sizes must be 4K aligned. */ +#define VHOST_PAGE_SIZE 0x1000 + +struct vhost_memory { + __u32 nregions; + __u32 padding; + struct vhost_memory_region regions[0]; +}; + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. 
Further calls to + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Write logging setup. */ +/* Memory writes can optionally be logged by setting bit at an offset + * (calculated from the physical address) from specified log base. + * The bit is set using an atomic 32 bit operation. */ +/* Set base address for logging. */ +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Specify an eventfd file descriptor to signal on log write. */ +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) + +/* Ring setup. These parameters can not be modified while ring is running + * (bound to a device). */ +/* Set number of descriptors in ring */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Start of array of descriptors (virtually contiguous) */ +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Used structure address. Must be 32 bit aligned */ +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) +/* Available structure address. Must be 16 bit aligned */ +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) + +/* Logging support. Can be modified while ring is running. */ +/* Log writes to used structure, at offset calculated from specified address. + * Address must be 32 bit aligned. Pass 0x1 to disable logging. 
*/ +#define VHOST_SET_VRING_LOG _IOW(VHOST_VIRTIO, 0x18, struct vhost_vring_addr) +#define VHOST_VRING_LOG_DISABLE (0x1) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. */ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have been used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) + +/* VHOST_NET specific defines */ + +/* Attach virtio net ring to a raw socket, or tap device. + * The socket must be already bound to an ethernet device; this device will be + * used for transmit. Pass fd -1 to unbind from the socket and the transmit + * device. This can be used to stop the ring (e.g. for migration). */ +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* Feature bits */ +/* Log all write descriptors. Can be changed while device is active. */ +#define VHOST_F_LOG_ALL 26 +/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. 
*/ +#define VHOST_NET_F_VIRTIO_NET_HDR 27 + +#endif diff --git a/net.c b/net.c index 9168460..5d98e90 100644 --- a/net.c +++ b/net.c @@ -2767,6 +2767,9 @@ static int net_init_nic(QemuOpts *opts, Monitor *mon) if (qemu_opt_get(opts, "addr")) { nd->devaddr = qemu_strdup(qemu_opt_get(opts, "addr")); } + if (qemu_opt_get(opts, "vhost")) { + nd->vhost_device = qemu_opt_get_bool(opts, "vhost", 0); + } nd->macaddr[0] = 0x52; nd->macaddr[1] = 0x54; @@ -3182,6 +3185,10 @@ static struct { .name = "vectors", .type = QEMU_OPT_NUMBER, .help = "number of MSI-x vectors, 0 to disable MSI-X", + }, { + .name = "vhost", + .type = QEMU_OPT_BOOL, + .help = "enable vhost backend", }, { /* end of list */ } }, diff --git a/net.h b/net.h index 932b50d..adcd5c6 100644 --- a/net.h +++ b/net.h @@ -115,6 +115,7 @@ struct NICInfo { int used; int bootable; int nvectors; + int vhost_device; }; extern int nb_nics; diff --git a/qemu-kvm.c b/qemu-kvm.c index 62ca050..a547975 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -150,14 +150,6 @@ static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi) DPRINTF("Invalid GSI %d\n"); } -struct slot_info { - unsigned long phys_addr; - unsigned long len; - unsigned long userspace_addr; - unsigned flags; - int logging_count; -}; - struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS]; static void init_slots(void) diff --git a/qemu-kvm.h b/qemu-kvm.h index d6748c7..2ab6c33 100644 --- a/qemu-kvm.h +++ b/qemu-kvm.h @@ -1240,6 +1240,15 @@ int kvm_ioctl(KVMState *s, int type, ...); int kvm_vm_ioctl(KVMState *s, int type, ...); int kvm_check_extension(KVMState *s, unsigned int ext); +struct slot_info { + unsigned long phys_addr; + unsigned long len; + unsigned long userspace_addr; + unsigned flags; + int logging_count; +}; + +extern struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS]; #endif #endif