From patchwork Wed Apr 7 00:36:28 2010
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Badari Pulavarty
X-Patchwork-Id: 90903
Subject: [RFC] vhost-blk implementation (v2)
From: Badari Pulavarty
To: kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
	qemu-devel@nongnu.org
In-Reply-To: <20100405195912.GA17589@infradead.org>
References: <1269306023.7931.72.camel@badari-desktop>
	<4BA891E2.9040500@redhat.com>
	<20100324200502.GB22272@infradead.org>
	<4BAB7AA8.8030509@shiftmail.org>
	<20100405195912.GA17589@infradead.org>
Date: Tue, 06 Apr 2010 17:36:28 -0700
Message-Id: <1270600588.28348.36.camel@badari-desktop>
X-Mailer: Evolution 2.22.3.1
X-Mailing-List: kvm@vger.kernel.org

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+/*
+ * virtio-block server in host kernel.
+ * Inspired by vhost-net and shamelessly ripped code from it :)
+ */
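+
+/*
+ * Request flow: handle_blk() pulls descriptors off the virtqueue and
+ * handoff_io() turns each one into a vhost_blk_io.  Flushes are queued
+ * to the workqueue immediately; reads and writes are kept sector-sorted
+ * on read_queue/write_queue until the ring is drained, at which point
+ * start_io() coalesces contiguous requests and hands them to the
+ * workqueue.  handle_io_work() then performs the actual
+ * vfs_readv/vfs_writev/vfs_fsync on the backing file, writes the status
+ * byte back to the guest and signals the used ring.
+ */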
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+        struct vhost_dev dev;
+        struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+        struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+        struct list_head list;
+        struct work_struct work;
+        struct vhost_blk *blk;
+        struct file *file;
+        int head;
+        uint32_t type;
+        uint32_t nvecs;
+        uint64_t sector;
+        uint64_t len;
+        struct iovec iov[0];
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);
+static LIST_HEAD(read_queue);
+
+static void handle_io_work(struct work_struct *work)
+{
+        struct vhost_blk_io *vbio, *entry;
+        struct vhost_virtqueue *vq;
+        struct vhost_blk *blk;
+        struct list_head single, *head, *node, *tmp;
+        int need_free, ret = 0;
+        loff_t pos;
+        uint8_t status = 0;
+
+        vbio = container_of(work, struct vhost_blk_io, work);
+        blk = vbio->blk;
+        vq = &blk->dev.vqs[0];
+        pos = vbio->sector << SECTOR_SHIFT;
+
+        use_mm(blk->dev.mm);
+        if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+                ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+        } else if (vbio->type & VIRTIO_BLK_T_OUT) {
+                ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+        } else {
+                ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+        }
+        status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+        if (vbio->head != -1) {
+                INIT_LIST_HEAD(&single);
+                list_add(&vbio->list, &single);
+                head = &single;
+                need_free = 0;
+        } else {
+                head = &vbio->list;
+                need_free = 1;
+        }
+        list_for_each_entry(entry, head, list) {
+                copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+        }
+        mutex_lock(&vq->mutex);
+        list_for_each_safe(node, tmp, head) {
+                entry = list_entry(node, struct vhost_blk_io, list);
+                vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+                list_del(node);
+                kfree(entry);
+        }
+        mutex_unlock(&vq->mutex);
+        unuse_mm(blk->dev.mm);
+        if (need_free)
+                kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+        struct vhost_blk_io *vbio;
+        int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+
+        vbio = kmalloc(size, GFP_KERNEL);
+        if (vbio) {
+                INIT_WORK(&vbio->work, handle_io_work);
+                INIT_LIST_HEAD(&vbio->list);
+        }
+        return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+        struct vhost_blk_io *vbio, *entry, *tmp;
+        int nvecs = 0;
+        int entries = 0;
+
+        list_for_each_entry(entry, queue, list) {
+                nvecs += entry->nvecs;
+                entries++;
+        }
+
+        if (entries == 1) {
+                vbio = list_first_entry(queue, struct vhost_blk_io, list);
+                list_del(&vbio->list);
+                queue_work(vblk_workqueue, &vbio->work);
+                return;
+        }
+
+        vbio = allocate_vbio(nvecs);
+        if (!vbio) {
+                /* Unable to allocate memory - submit IOs individually */
+                list_for_each_entry_safe(entry, tmp, queue, list) {
+                        list_del_init(&entry->list);
+                        queue_work(vblk_workqueue, &entry->work);
+                }
+                return;
+        }
+
+        entry = list_first_entry(queue, struct vhost_blk_io, list);
+        vbio->nvecs = nvecs;
+        vbio->blk = entry->blk;
+        vbio->file = entry->file;
+        vbio->type = entry->type;
+        vbio->sector = entry->sector;
+        vbio->head = -1;
+        vbio->len = 0;
+        nvecs = 0;
+
+        list_for_each_entry(entry, queue, list) {
+                memcpy(&vbio->iov[nvecs], entry->iov, entry->nvecs * sizeof(struct iovec));
+                nvecs += entry->nvecs;
+                vbio->len += entry->len;
+        }
+        list_replace_init(queue, &vbio->list);
+        queue_work(vblk_workqueue, &vbio->work);
+}
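+
+/*
+ * Walk a sector-sorted queue and cut it into runs of requests whose
+ * sectors are contiguous (previous sector + length reaches the next
+ * sector).  Each run is passed to merge_and_handoff_work(), which
+ * combines it into a single vhost_blk_io before queueing it on the
+ * workqueue.
+ */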
+static void start_io(struct list_head *queue)
+{
+        struct list_head start;
+        struct vhost_blk_io *vbio = NULL, *entry;
+
+        if (list_empty(queue))
+                return;
+
+        list_for_each_entry(entry, queue, list) {
+                if (!vbio) {
+                        vbio = entry;
+                        continue;
+                }
+                if (vbio->sector + (vbio->len >> SECTOR_SHIFT) == entry->sector) {
+                        vbio = entry;
+                } else {
+                        INIT_LIST_HEAD(&start);
+                        list_cut_position(&start, queue, &vbio->list);
+                        merge_and_handoff_work(&start);
+                        vbio = entry;
+                }
+        }
+        if (!list_empty(queue))
+                merge_and_handoff_work(queue);
+}
+
+static uint64_t calculate_len(struct iovec *iov, int nvecs)
+{
+        uint64_t len = 0;
+        int i;
+
+        for (i = 0; i < nvecs; i++)
+                len += iov[i].iov_len;
+        return len;
+}
+
+static void insert_to_queue(struct vhost_blk_io *vbio, struct list_head *queue)
+{
+        struct vhost_blk_io *entry;
+
+        list_for_each_entry(entry, queue, list) {
+                if (entry->sector > vbio->sector)
+                        break;
+        }
+        list_add_tail(&vbio->list, &entry->list);
+}
+
+static int handoff_io(struct vhost_blk *blk, int head,
+                        uint32_t type, uint64_t sector,
+                        struct iovec *iov, int nvecs)
+{
+        struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+        struct vhost_blk_io *vbio;
+
+        vbio = allocate_vbio(nvecs + 1);
+        if (!vbio)
+                return -ENOMEM;
+
+        vbio->blk = blk;
+        vbio->head = head;
+        vbio->file = vq->private_data;
+        vbio->type = type;
+        vbio->sector = sector;
+        vbio->nvecs = nvecs;
+        vbio->len = calculate_len(iov, nvecs);
+        memcpy(vbio->iov, iov, (nvecs + 1) * sizeof(struct iovec));
+
+        if (vbio->type & VIRTIO_BLK_T_FLUSH) {
+#if 0
+                /* Sync called - do I need to submit IOs in the queue ? */
+                start_io(&read_queue);
+                start_io(&write_queue);
+#endif
+                queue_work(vblk_workqueue, &vbio->work);
+        } else if (vbio->type & VIRTIO_BLK_T_OUT) {
+                insert_to_queue(vbio, &write_queue);
+        } else {
+                insert_to_queue(vbio, &read_queue);
+        }
+        return 0;
+}
+
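+/*
+ * Each virtio-blk request is laid out as a 16-byte virtio_blk_outhdr
+ * in iov[0], followed by the data buffers, with a single status byte
+ * in the last descriptor.  Only the data iovecs are counted in nvecs;
+ * the status iovec is carried along so handle_io_work() can complete
+ * the request.
+ */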
+static void handle_blk(struct vhost_blk *blk)
+{
+        struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+        unsigned head, out, in;
+        struct virtio_blk_outhdr hdr;
+        int nvecs;
+
+        use_mm(blk->dev.mm);
+        mutex_lock(&vq->mutex);
+
+        vhost_disable_notify(vq);
+
+        for (;;) {
+                head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+                                         ARRAY_SIZE(vq->iov),
+                                         &out, &in, NULL, NULL);
+                if (head == vq->num) {
+                        if (unlikely(vhost_enable_notify(vq))) {
+                                vhost_disable_notify(vq);
+                                continue;
+                        }
+                        start_io(&read_queue);
+                        start_io(&write_queue);
+                        break;
+                }
+
+                BUG_ON(vq->iov[0].iov_len != 16);
+
+                if (copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr)) {
+                        vhost_discard_vq_desc(vq);
+                        continue;
+                }
+
+                nvecs = out - 1;
+                if (hdr.type == VIRTIO_BLK_T_IN)
+                        nvecs = in - 1;
+
+                BUG_ON(vq->iov[nvecs+1].iov_len != 1);
+                if (handoff_io(blk, head, hdr.type, hdr.sector, &vq->iov[1], nvecs) < 0) {
+                        vhost_discard_vq_desc(vq);
+                        continue;
+                }
+        }
+        mutex_unlock(&vq->mutex);
+        unuse_mm(blk->dev.mm);
+}
+
+static void vhost_blk_flush(struct vhost_blk *n)
+{
+        vhost_poll_flush(n->poll);
+        vhost_poll_flush(&n->dev.vqs[0].poll);
+}
+
+static void handle_blk_kick(struct work_struct *work)
+{
+        struct vhost_virtqueue *vq;
+        struct vhost_blk *blk;
+
+        vq = container_of(work, struct vhost_virtqueue, poll.work);
+        blk = container_of(vq->dev, struct vhost_blk, dev);
+        handle_blk(blk);
+}
+
+static void handle_rq_blk(struct work_struct *work)
+{
+        struct vhost_blk *blk;
+
+        blk = container_of(work, struct vhost_blk, poll[0].work);
+        handle_blk(blk);
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *f)
+{
+        struct vhost_blk *n = kmalloc(sizeof *n, GFP_KERNEL);
+        int r;
+
+        if (!n)
+                return -ENOMEM;
+        n->vqs[0].handle_kick = handle_blk_kick;
+        r = vhost_dev_init(&n->dev, n->vqs, VHOST_BLK_VQ_MAX);
+        if (r < 0) {
+                kfree(n);
+                return r;
+        }
+
+        vhost_poll_init(n->poll, handle_rq_blk, POLLOUT|POLLIN);
+        f->private_data = n;
+        return 0;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+        struct vhost_blk *n = f->private_data;
+
+        if (n->vqs[0].private_data)
+                fput(n->vqs[0].private_data);
+        kfree(n);
+        return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int fd)
+{
+        struct file *file;
+        struct vhost_virtqueue *vq;
+
+        if (index >= VHOST_BLK_VQ_MAX)
+                return -ENOBUFS;
+
+        file = fget(fd);
+        if (!file)
+                return -EBADF;
+
+        vq = n->vqs + index;
+        mutex_lock(&vq->mutex);
+        rcu_assign_pointer(vq->private_data, file);
+        mutex_unlock(&vq->mutex);
+        return 0;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+        struct vhost_blk *n = f->private_data;
+        void __user *argp = (void __user *)arg;
+        struct vhost_vring_file backend;
+        int r;
+
+        switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+                if (copy_from_user(&backend, argp, sizeof backend))
+                        return -EFAULT;
+                return vhost_blk_set_backend(n, backend.index, backend.fd);
+        default:
+                mutex_lock(&n->dev.mutex);
+                r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+                vhost_blk_flush(n);
+                mutex_unlock(&n->dev.mutex);
+                return r;
+        }
+}
+
+static const struct file_operations vhost_blk_fops = {
+        .owner          = THIS_MODULE,
+        .release        = vhost_blk_release,
+        .open           = vhost_blk_open,
+        .unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+        .minor = 234,
+        .name  = "vhost-blk",
+        .fops  = &vhost_blk_fops,
+};
+
+static int vhost_blk_init(void)
+{
+        int r = vhost_init();
+
+        if (r)
+                goto err_init;
+
+        vblk_workqueue = create_workqueue("vblk");
+        if (!vblk_workqueue) {
+                r = -ENOMEM;
+                goto err_vblk;
+        }
+
+        r = misc_register(&vhost_blk_misc);
+        if (r)
+                goto err_reg;
+        return 0;
+err_reg:
+        destroy_workqueue(vblk_workqueue);
+err_vblk:
+        vhost_cleanup();
+err_init:
+        return r;
+}
+module_init(vhost_blk_init);
+
+static void vhost_blk_exit(void)
+{
+        misc_deregister(&vhost_blk_misc);
+        destroy_workqueue(vblk_workqueue);
+        vhost_cleanup();
+}
+module_exit(vhost_blk_exit);
+
+MODULE_VERSION("0.0.2");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio blk");

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html