@@ -231,18 +231,48 @@ void vhost_poll_stop(struct vhost_poll *poll)
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);
-void vhost_work_dev_flush(struct vhost_dev *dev)
+static void vhost_work_queue_on(struct vhost_dev *dev, struct vhost_work *work,
+ int worker_id)
+{
+ if (!dev->workers)
+ return;
+
+ if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+ /* We can only add the work to the list after we're
+ * sure it was not in the list.
+ * test_and_set_bit() implies a memory barrier.
+ */
+ llist_add(&work->node, &dev->workers[worker_id]->work_list);
+ wake_up_process(dev->workers[worker_id]->task);
+ }
+}
+
+void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+ vhost_work_queue_on(dev, work, 0);
+}
+EXPORT_SYMBOL_GPL(vhost_work_queue);
+
+static void vhost_work_flush_on(struct vhost_dev *dev, int worker_id)
{
struct vhost_flush_struct flush;
- if (dev->worker) {
+ if (dev->workers) {
init_completion(&flush.wait_event);
vhost_work_init(&flush.work, vhost_flush_work);
- vhost_work_queue(dev, &flush.work);
+ vhost_work_queue_on(dev, &flush.work, worker_id);
wait_for_completion(&flush.wait_event);
}
}
+
+void vhost_work_dev_flush(struct vhost_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->num_workers; i++)
+ vhost_work_flush_on(dev, i);
+}
EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
/* Flush any work that has been scheduled. When calling this, don't hold any
@@ -253,26 +283,20 @@ void vhost_poll_flush(struct vhost_poll *poll)
}
EXPORT_SYMBOL_GPL(vhost_poll_flush);
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+/* A lockless hint for busy polling code to exit the loop */
+bool vhost_has_work(struct vhost_dev *dev)
{
- if (!dev->worker)
- return;
+ int i;
- if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
- /* We can only add the work to the list after we're
- * sure it was not in the list.
- * test_and_set_bit() implies a memory barrier.
- */
- llist_add(&work->node, &dev->work_list);
- wake_up_process(dev->worker);
+ if (!dev->workers)
+ return false;
+
+ for (i = 0; i < dev->num_workers; i++) {
+ if (!llist_empty(&dev->workers[i]->work_list))
+ return true;
}
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-/* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
-{
- return !llist_empty(&dev->work_list);
+ return false;
}
EXPORT_SYMBOL_GPL(vhost_has_work);
@@ -338,7 +362,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
static int vhost_worker(void *data)
{
- struct vhost_dev *dev = data;
+ struct vhost_worker *worker = data;
+ struct vhost_dev *dev = worker->dev;
struct vhost_work *work, *work_next;
struct llist_node *node;
@@ -352,8 +377,7 @@ static int vhost_worker(void *data)
__set_current_state(TASK_RUNNING);
break;
}
-
- node = llist_del_all(&dev->work_list);
+ node = llist_del_all(&worker->work_list);
if (!node)
schedule();
@@ -506,13 +530,13 @@ int vhost_dev_init(struct vhost_dev *dev,
dev->umem = NULL;
dev->iotlb = NULL;
dev->mm = NULL;
- dev->worker = NULL;
+ dev->workers = NULL;
+ dev->num_workers = 0;
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
dev->msg_handler = msg_handler;
- init_llist_head(&dev->work_list);
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
@@ -558,16 +582,28 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
s->ret = cgroup_attach_task_all(s->owner, current);
}
-static int vhost_attach_cgroups(struct vhost_dev *dev)
+static int vhost_attach_cgroups_on(struct vhost_dev *dev, int worker_id)
{
struct vhost_attach_cgroups_struct attach;
attach.owner = current;
vhost_work_init(&attach.work, vhost_attach_cgroups_work);
- vhost_work_queue(dev, &attach.work);
- vhost_work_dev_flush(dev);
+ vhost_work_queue_on(dev, &attach.work, worker_id);
+ vhost_work_flush_on(dev, worker_id);
return attach.ret;
}
+static int vhost_attach_cgroups(struct vhost_dev *dev, int first_worker)
+{
+ int i, ret = 0;
+
+ for (i = first_worker; i < dev->num_workers; i++) {
+ ret = vhost_attach_cgroups_on(dev, i);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
@@ -606,10 +642,117 @@ static void vhost_detach_mm(struct vhost_dev *dev)
dev->mm = NULL;
}
+static void vhost_workers_free(struct vhost_dev *dev)
+{
+ struct vhost_worker *worker;
+ int i;
+
+ if (!dev->workers)
+ return;
+
+ for (i = 0; i < dev->num_workers; i++) {
+ worker = dev->workers[i];
+
+ WARN_ON(!llist_empty(&worker->work_list));
+ kthread_stop(worker->task);
+ kfree(worker);
+ }
+
+ kfree(dev->workers);
+ dev->workers = NULL;
+ dev->num_workers = 0;
+}
+
+static int vhost_worker_create(struct vhost_dev *dev, int worker_id)
+{
+ struct vhost_worker *worker;
+ struct task_struct *task;
+ int ret;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (!worker)
+ return -ENOMEM;
+
+ init_llist_head(&worker->work_list);
+ worker->dev = dev;
+
+ task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto free_worker;
+ }
+
+ dev->workers[worker_id] = worker;
+ worker->task = task;
+ wake_up_process(task); /* avoid contributing to loadavg */
+ return 0;
+
+free_worker:
+ kfree(worker);
+ return ret;
+}
+
+/**
+ * vhost_workers_create - create vhost workers and attach to cgroup
+ * @dev: vhost device
+ * @new_num_workers: the total number of workers we want after this returns
+ *
+ * Caller must have the device mutex and have stopped operations that
+ * can access the workers array.
+ */
+int vhost_workers_create(struct vhost_dev *dev, int new_num_workers)
+{
+ struct vhost_worker **new_workers;
+ struct mm_struct *mm;
+ bool owner_match = true;
+ int i, err, start;
+
+ if (new_num_workers == dev->num_workers)
+ return 0;
+
+ if (new_num_workers < dev->num_workers)
+ return -EINVAL;
+
+ if (vhost_dev_has_owner(dev)) {
+ mm = get_task_mm(current);
+ if (mm != dev->mm)
+ owner_match = false;
+ mmput(mm);
+ if (!owner_match)
+ return -EBUSY;
+ }
+
+ new_workers = krealloc(dev->workers, new_num_workers * sizeof(*new_workers),
+ GFP_KERNEL);
+ if (!new_workers) {
+ err = -ENOMEM;
+ goto free_workers;
+ }
+ dev->workers = new_workers;
+
+ start = dev->num_workers;
+ for (i = start; i < new_num_workers; i++) {
+ err = vhost_worker_create(dev, i);
+ if (err)
+ goto free_workers;
+ dev->num_workers++;
+ }
+
+ err = vhost_attach_cgroups(dev, start);
+ if (err)
+ goto free_workers;
+
+ return 0;
+
+free_workers:
+ vhost_workers_free(dev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(vhost_workers_create);
+
/* Caller should have device mutex */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
- struct task_struct *worker;
int err;
/* Is there an owner already? */
@@ -622,27 +765,16 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
dev->kcov_handle = kcov_common_handle();
if (dev->use_worker) {
- worker = kthread_create(vhost_worker, dev,
- "vhost-%d", current->pid);
- if (IS_ERR(worker)) {
- err = PTR_ERR(worker);
- goto err_worker;
- }
-
- dev->worker = worker;
- wake_up_process(worker); /* avoid contributing to loadavg */
-
- err = vhost_attach_cgroups(dev);
+ /*
+ * All drivers that set use_worker=true use at least one
+ * worker. Drivers like vhost-scsi may override this later.
+ */
+ err = vhost_workers_create(dev, 1);
if (err)
- goto err_cgroup;
+ goto err_worker;
}
return 0;
-err_cgroup:
- if (dev->worker) {
- kthread_stop(dev->worker);
- dev->worker = NULL;
- }
err_worker:
vhost_detach_mm(dev);
dev->kcov_handle = 0;
@@ -735,12 +867,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
dev->iotlb = NULL;
vhost_clear_msg(dev);
wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
- WARN_ON(!llist_empty(&dev->work_list));
- if (dev->worker) {
- kthread_stop(dev->worker);
- dev->worker = NULL;
- dev->kcov_handle = 0;
- }
+ vhost_workers_free(dev);
+ dev->kcov_handle = 0;
vhost_detach_mm(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -25,6 +25,12 @@ struct vhost_work {
unsigned long flags;
};
+struct vhost_worker {
+ struct task_struct *task;
+ struct llist_head work_list;
+ struct vhost_dev *dev;
+};
+
/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
struct vhost_poll {
@@ -39,6 +45,7 @@ struct vhost_poll {
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
bool vhost_has_work(struct vhost_dev *dev);
+int vhost_workers_create(struct vhost_dev *dev, int new_num_workers);
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
__poll_t mask, struct vhost_dev *dev);
@@ -152,8 +159,8 @@ struct vhost_dev {
int nvqs;
int max_nvqs;
struct eventfd_ctx *log_ctx;
- struct llist_head work_list;
- struct task_struct *worker;
+ struct vhost_worker **workers;
+ int num_workers;
struct vhost_iotlb *umem;
struct vhost_iotlb *iotlb;
spinlock_t iotlb_lock;
@@ -175,6 +182,7 @@ int vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs,
int byte_weight, bool use_worker,
int (*msg_handler)(struct vhost_dev *dev,
struct vhost_iotlb_msg *msg));
+int vhost_vq_set_worker(struct vhost_virtqueue *vq, int worker_id);
long vhost_dev_set_owner(struct vhost_dev *dev);
bool vhost_dev_has_owner(struct vhost_dev *dev);
long vhost_dev_check_owner(struct vhost_dev *);

This is a prep patch to support multiple vhost worker threads per vhost
dev. This patch converts the code that had assumed a single worker
thread by:

1. Moving worker-related fields to a new struct vhost_worker.
2. Converting vhost.c code to use the new struct and assume we will
   have an array of workers.
3. Exporting a helper function that will be used in the last patch,
   when vhost-scsi is converted to use this new functionality.

Why do we need multiple worker threads? For vhost-scsi, we do the
initial submission and completion from the vhost worker thread, and
after adding 2 vqs this single thread becomes a bottleneck. With the
null_blk driver we max out at 360K IOPS when doing a random workload
like:

fio --direct=1 --rw=randrw --bs=4k --ioengine=libaio \
    --iodepth=VQ_QUEUE_DEPTH --numjobs=NUM_VQS --filename=/dev/sdXYZ

where NUM_VQS gets up to 8 (the number of cores per NUMA node on my
system) and VQ_QUEUE_DEPTH can be anywhere from 32 to 128.

With the patches in this set, we are able to get IOPS from a single LUN
up to 640K. And with some other changes I am working on to the LIO
locking, plus binding the worker threads to specific CPUs, we can get
this up to 880K.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/vhost/vhost.c | 232 +++++++++++++++++++++++++++++++++++++++-----------
 drivers/vhost/vhost.h |  12 ++-
 2 files changed, 190 insertions(+), 54 deletions(-)
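
For reviewers, a rough sketch of how a driver might consume the new API
once the whole set is applied. This is illustrative only and not part
of this patch: vhost_scsi_setup_workers() is a made-up name, and
vhost_vq_set_worker() is only declared here (a later patch in the set
implements it):

        static int vhost_scsi_setup_workers(struct vhost_dev *dev)
        {
                int i, ret;

                /*
                 * vhost_dev_set_owner() already created worker 0
                 * because use_worker is true; grow the array to one
                 * worker per vq.
                 */
                ret = vhost_workers_create(dev, dev->nvqs);
                if (ret)
                        return ret;

                /* Bind each vq to its own worker. */
                for (i = 0; i < dev->nvqs; i++) {
                        ret = vhost_vq_set_worker(dev->vqs[i], i);
                        if (ret)
                                return ret;
                }
                return 0;
        }

Existing drivers are unaffected by this prep patch, since
vhost_work_queue() still queues everything on worker 0, matching the
old single-worker behavior.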
This is a prep patch to support multiple vhost worker threads per vhost dev. This patch converts the code that had assumed a single worker thread by: 1. Moving worker related fields to a new struct vhost_worker. 2. Converting vhost.c code to use the new struct and assume we will have an array of workers. 3. It also exports a helper function that will be used in the last patch when vhost-scsi is converted to use this new functionality. Why do we need multiple worker threads? For vhost-scsi, we do the initial submission and completion from the vhost worker thread and after adding 2 vqs this single thread becomes a bottleneck. With the null_blk driver we max out at 360K IOPs when doing a random workload like: fio --direct=1 --rw=randrw --bs=4k --ioengine=libaio \ --iodepth=VQ_QUEUE_DEPTH --numjobs=NUM_VQS --filename /dev/sdXYZ where NUM_VQS gets up to 8 (number of cores per numa node on my system) and VQ_QUEUE_DEPTH can be anywhere from 32 to 128. With the patches in this set, we are able to get IOPs from a single LUN up to 640K. And, With some other changes I am working on to the LIO locking and binding worker threads to specific CPUs we can get this up to 880K Signed-off-by: Mike Christie <michael.christie@oracle.com> --- drivers/vhost/vhost.c | 232 +++++++++++++++++++++++++++++++++++++++----------- drivers/vhost/vhost.h | 12 ++- 2 files changed, 190 insertions(+), 54 deletions(-)