Message ID | 1466440308-4961-2-git-send-email-kwankhede@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 06/21/2016 12:31 AM, Kirti Wankhede wrote: > + > +static int mdev_add_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + return sysfs_create_groups(&dev->kobj, groups); > +} > + > +static void mdev_remove_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + sysfs_remove_groups(&dev->kobj, groups); > +} > + > +static struct mdev_device *find_mdev_device(struct parent_device *parent, > + uuid_le uuid, int instance) > +{ > + struct mdev_device *mdev = NULL, *p; > + > + list_for_each_entry(p, &parent->mdev_list, next) { > + if ((uuid_le_cmp(p->uuid, uuid) == 0) && > + (p->instance == instance)) { > + mdev = p; > + break; > + } > + } > + return mdev; > +} Hi Kirti, Neo, Thanks for the new version! We just found that find_mdev_device() is necessary for pdev driver to locate the mdev by VM identity and mdev instance. e.g. caller of your vfio-iommu exported API, vfio_pin_pages(), must have something to identify which address space it wants, that's a subfield of mdev. Do you mind to have it exported? Or is there any other way for this? -- Thanks, Jike > + > +/* Should be called holding parent_devices.list_lock */ > +static struct parent_device *find_parent_device(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + WARN_ON(!mutex_is_locked(&parent_devices.list_lock)); > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = p; > + break; > + } > + } > + return parent; > +} > + > +static void mdev_release_parent(struct kref *kref) > +{ > + struct parent_device *parent = container_of(kref, struct parent_device, > + ref); > + kfree(parent); > +} > + > +static > +inline struct parent_device *mdev_get_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_get(&parent->ref); > + > + return parent; > +} > + > +static inline void mdev_put_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_put(&parent->ref, mdev_release_parent); > +} > + > +static struct parent_device *mdev_get_parent_by_dev(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = mdev_get_parent(p); > + break; > + } > + } > + mutex_unlock(&parent_devices.list_lock); > + return parent; > +} > + > +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) > +{ > + struct parent_device *parent = mdev->parent; > + int ret; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->create) { > + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, > + mdev->instance, mdev_params); > + if (ret) > + goto create_ops_err; > + } > + > + ret = mdev_add_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); > +create_ops_err: > + mutex_unlock(&parent->ops_lock); > + return ret; > +} > + > +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) > +{ > + struct parent_device *parent = mdev->parent; > + int ret = 0; > + > + /* > + * If vendor driver doesn't return success that means vendor > + * driver doesn't support hot-unplug > + */ > + mutex_lock(&parent->ops_lock); > + if (parent->ops->destroy) { > + ret = parent->ops->destroy(parent->dev, mdev->uuid, > + mdev->instance); > + if (ret && !force) { > + ret = -EBUSY; > + goto destroy_ops_err; > + } > + } > + mdev_remove_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); > +destroy_ops_err: > + mutex_unlock(&parent->ops_lock); > + > + return ret; > +} > + > +static void mdev_release_device(struct kref *kref) > +{ > + struct mdev_device *mdev = container_of(kref, struct mdev_device, ref); > + struct parent_device *parent = mdev->parent; > + > + device_unregister(&mdev->dev); > + wake_up(&parent->release_done); > + mdev_put_parent(parent); > +} > + > +struct mdev_device *mdev_get_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_get(&mdev->ref); > + > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device); > + > +void mdev_put_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_put(&mdev->ref, mdev_release_device); > +} > +EXPORT_SYMBOL(mdev_put_device); > + > +/* > + * Find first mediated device from given uuid and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (uuid_le_cmp(p->uuid, uuid) == 0) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > + > +/* > + * Find mediated device from given iommu_group and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (!p->group) > + continue; > + > + if (iommu_group_id(p->group) == iommu_group_id(group)) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device_by_group); > + > +/* > + * mdev_register_device : Register a device > + * @dev: device structure representing parent device. > + * @ops: Parent device operation structure to be registered. > + * > + * Add device to list of registered parent devices. > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_device(struct device *dev, const struct parent_ops *ops) > +{ > + int ret = 0; > + struct parent_device *parent; > + > + if (!dev || !ops) > + return -EINVAL; > + > + mutex_lock(&parent_devices.list_lock); > + > + /* Check for duplicate */ > + parent = find_parent_device(dev); > + if (parent) { > + ret = -EEXIST; > + goto add_dev_err; > + } > + > + parent = kzalloc(sizeof(*parent), GFP_KERNEL); > + if (!parent) { > + ret = -ENOMEM; > + goto add_dev_err; > + } > + > + kref_init(&parent->ref); > + list_add(&parent->next, &parent_devices.dev_list); > + mutex_unlock(&parent_devices.list_lock); > + > + parent->dev = dev; > + parent->ops = ops; > + mutex_init(&parent->ops_lock); > + mutex_init(&parent->mdev_list_lock); > + INIT_LIST_HEAD(&parent->mdev_list); > + init_waitqueue_head(&parent->release_done); > + > + ret = mdev_create_sysfs_files(dev); > + if (ret) > + goto add_sysfs_error; > + > + ret = mdev_add_attribute_group(dev, ops->dev_attr_groups); > + if (ret) > + goto add_group_error; > + > + dev_info(dev, "MDEV: Registered\n"); > + return 0; > + > +add_group_error: > + mdev_remove_sysfs_files(dev); > +add_sysfs_error: > + mutex_lock(&parent_devices.list_lock); > + list_del(&parent->next); > + mutex_unlock(&parent_devices.list_lock); > + mdev_put_parent(parent); > + return ret; > + > +add_dev_err: > + mutex_unlock(&parent_devices.list_lock); > + return ret; > +} > +EXPORT_SYMBOL(mdev_register_device); > + > +/* > + * mdev_unregister_device : Unregister a parent device > + * @dev: device structure representing parent device. > + * > + * Remove device from list of registered parent devices. Give a chance to free > + * existing mediated devices for given device. > + */ > + > +void mdev_unregister_device(struct device *dev) > +{ > + struct parent_device *parent; > + struct mdev_device *mdev, *n; > + int ret; > + > + mutex_lock(&parent_devices.list_lock); > + parent = find_parent_device(dev); > + > + if (!parent) { > + mutex_unlock(&parent_devices.list_lock); > + return; > + } > + dev_info(dev, "MDEV: Unregistering\n"); > + > + /* > + * Remove parent from the list and remove create and destroy sysfs > + * files so that no new mediated device could be created for this parent > + */ > + list_del(&parent->next); > + mdev_remove_sysfs_files(dev); > + mutex_unlock(&parent_devices.list_lock); > + > + mutex_lock(&parent->ops_lock); > + mdev_remove_attribute_group(dev, > + parent->ops->dev_attr_groups); > + mutex_unlock(&parent->ops_lock); > + > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { > + mdev_device_destroy_ops(mdev, true); > + list_del(&mdev->next); > + mdev_put_device(mdev); > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + do { > + ret = wait_event_interruptible_timeout(parent->release_done, > + list_empty(&parent->mdev_list), HZ * 10); > + if (ret == -ERESTARTSYS) { > + dev_warn(dev, "Mediated devices are in use, task" > + " \"%s\" (%d) " > + "blocked until all are released", > + current->comm, task_pid_nr(current)); > + } > + } while (ret <= 0); > + > + mdev_put_parent(parent); > +} > +EXPORT_SYMBOL(mdev_unregister_device); > + > +/* > + * Functions required for mdev-sysfs > + */ > +static void mdev_device_release(struct device *dev) > +{ > + struct mdev_device *mdev = to_mdev_device(dev); > + > + dev_dbg(&mdev->dev, "MDEV: destroying\n"); > + kfree(mdev); > +} > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params) > +{ > + int ret; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) > + return -EINVAL; > + > + /* Check for duplicate */ > + mdev = find_mdev_device(parent, uuid, instance); > + if (mdev) { > + ret = -EEXIST; > + goto create_err; > + } > + > + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); > + if (!mdev) { > + ret = -ENOMEM; > + goto create_err; > + } > + > + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); > + mdev->instance = instance; > + mdev->parent = parent; > + mutex_init(&mdev->ops_lock); > + kref_init(&mdev->ref); > + > + mdev->dev.parent = dev; > + mdev->dev.bus = &mdev_bus_type; > + mdev->dev.release = mdev_device_release; > + dev_set_name(&mdev->dev, "%pUb-%d", uuid.b, instance); > + > + ret = device_register(&mdev->dev); > + if (ret) { > + put_device(&mdev->dev); > + goto create_err; > + } > + > + ret = mdev_device_create_ops(mdev, mdev_params); > + if (ret) > + goto create_failed; > + > + mutex_lock(&parent->mdev_list_lock); > + list_add(&mdev->next, &parent->mdev_list); > + mutex_unlock(&parent->mdev_list_lock); > + > + dev_dbg(&mdev->dev, "MDEV: created\n"); > + > + return ret; > + > +create_failed: > + device_unregister(&mdev->dev); > + > +create_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance) > +{ > + struct mdev_device *mdev; > + struct parent_device *parent; > + int ret; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) { > + ret = -EINVAL; > + goto destroy_err; > + } > + > + mdev = find_mdev_device(parent, uuid, instance); > + if (!mdev) { > + ret = -EINVAL; > + goto destroy_err; > + } > + > + ret = mdev_device_destroy_ops(mdev, false); > + if (ret) > + goto destroy_err; > + > + mdev_put_parent(parent); > + > + mutex_lock(&parent->mdev_list_lock); > + list_del(&mdev->next); > + mutex_unlock(&parent->mdev_list_lock); > + > + mdev_put_device(mdev); > + return ret; > + > +destroy_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +void mdev_device_supported_config(struct device *dev, char *str) > +{ > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + > + if (parent) { > + mutex_lock(&parent->ops_lock); > + if (parent->ops->supported_config) > + parent->ops->supported_config(parent->dev, str); > + mutex_unlock(&parent->ops_lock); > + mdev_put_parent(parent); > + } > +} > + > +int mdev_device_start(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->start) > + ret = parent->ops->start(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_start failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE); > + > + mdev_put_device(mdev); > + > + return ret; > +} > + > +int mdev_device_shutdown(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->shutdown) > + ret = parent->ops->shutdown(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_shutdown failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE); > + > + mdev_put_device(mdev); > + return ret; > +} > + > +static struct class mdev_class = { > + .name = MDEV_CLASS_NAME, > + .owner = THIS_MODULE, > + .class_attrs = mdev_class_attrs, > +}; > + > +static int __init mdev_init(void) > +{ > + int ret; > + > + mutex_init(&parent_devices.list_lock); > + INIT_LIST_HEAD(&parent_devices.dev_list); > + > + ret = class_register(&mdev_class); > + if (ret) { > + pr_err("Failed to register mdev class\n"); > + return ret; > + } > + > + ret = mdev_bus_register(); > + if (ret) { > + pr_err("Failed to register mdev bus\n"); > + class_unregister(&mdev_class); > + return ret; > + } > + > + return ret; > +} > + > +static void __exit mdev_exit(void) > +{ > + mdev_bus_unregister(); > + class_unregister(&mdev_class); > +} > + > +module_init(mdev_init) > +module_exit(mdev_exit) > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c > new file mode 100644 > index 000000000000..f1aed541111d > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_driver.c > @@ -0,0 +1,138 @@ > +/* > + * MDEV driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/device.h> > +#include <linux/iommu.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +static int mdev_attach_iommu(struct mdev_device *mdev) > +{ > + int ret; > + struct iommu_group *group; > + > + group = iommu_group_alloc(); > + if (IS_ERR(group)) { > + dev_err(&mdev->dev, "MDEV: failed to allocate group!\n"); > + return PTR_ERR(group); > + } > + > + ret = iommu_group_add_device(group, &mdev->dev); > + if (ret) { > + dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n"); > + goto attach_fail; > + } > + > + mdev->group = group; > + > + dev_info(&mdev->dev, "MDEV: group_id = %d\n", > + iommu_group_id(group)); > +attach_fail: > + iommu_group_put(group); > + return ret; > +} > + > +static void mdev_detach_iommu(struct mdev_device *mdev) > +{ > + iommu_group_remove_device(&mdev->dev); > + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); > +} > + > +static int mdev_probe(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + int ret; > + > + ret = mdev_attach_iommu(mdev); > + if (ret) { > + dev_err(dev, "Failed to attach IOMMU\n"); > + return ret; > + } > + > + if (drv && drv->probe) > + ret = drv->probe(dev); > + > + return ret; > +} > + > +static int mdev_remove(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + > + if (drv && drv->remove) > + drv->remove(dev); > + > + mdev_detach_iommu(mdev); > + > + return 0; > +} > + > +static int mdev_match(struct device *dev, struct device_driver *drv) > +{ > + struct mdev_driver *mdrv = to_mdev_driver(drv); > + > + if (mdrv && mdrv->match) > + return mdrv->match(dev); > + > + return 0; > +} > + > +struct bus_type mdev_bus_type = { > + .name = "mdev", > + .match = mdev_match, > + .probe = mdev_probe, > + .remove = mdev_remove, > +}; > +EXPORT_SYMBOL_GPL(mdev_bus_type); > + > +/* > + * mdev_register_driver - register a new MDEV driver > + * @drv: the driver to register > + * @owner: module owner of driver to be registered > + * > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) > +{ > + /* initialize common driver fields */ > + drv->driver.name = drv->name; > + drv->driver.bus = &mdev_bus_type; > + drv->driver.owner = owner; > + > + /* register with core */ > + return driver_register(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_register_driver); > + > +/* > + * mdev_unregister_driver - unregister MDEV driver > + * @drv: the driver to unregister > + * > + */ > +void mdev_unregister_driver(struct mdev_driver *drv) > +{ > + driver_unregister(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_unregister_driver); > + > +int mdev_bus_register(void) > +{ > + return bus_register(&mdev_bus_type); > +} > + > +void mdev_bus_unregister(void) > +{ > + bus_unregister(&mdev_bus_type); > +} > diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h > new file mode 100644 > index 000000000000..991d7f796169 > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_private.h > @@ -0,0 +1,33 @@ > +/* > + * Mediated device interal definitions > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef MDEV_PRIVATE_H > +#define MDEV_PRIVATE_H > + > +int mdev_bus_register(void); > +void mdev_bus_unregister(void); > + > +/* Function prototypes for mdev_sysfs */ > + > +extern struct class_attribute mdev_class_attrs[]; > + > +int mdev_create_sysfs_files(struct device *dev); > +void mdev_remove_sysfs_files(struct device *dev); > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params); > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); > +void mdev_device_supported_config(struct device *dev, char *str); > +int mdev_device_start(uuid_le uuid); > +int mdev_device_shutdown(uuid_le uuid); > + > +#endif /* MDEV_PRIVATE_H */ > diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c > new file mode 100644 > index 000000000000..48b66e40009e > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_sysfs.c > @@ -0,0 +1,300 @@ > +/* > + * File attributes for Mediated devices > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/sysfs.h> > +#include <linux/ctype.h> > +#include <linux/device.h> > +#include <linux/slab.h> > +#include <linux/uuid.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +/* Prototypes */ > +static ssize_t mdev_supported_types_show(struct device *dev, > + struct device_attribute *attr, > + char *buf); > +static DEVICE_ATTR_RO(mdev_supported_types); > + > +static ssize_t mdev_create_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_create); > + > +static ssize_t mdev_destroy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_destroy); > + > +/* Static functions */ > + > +#define UUID_CHAR_LENGTH 36 > +#define UUID_BYTE_LENGTH 16 > + > +#define SUPPORTED_TYPE_BUFFER_LENGTH 1024 > + > +static inline bool is_uuid_sep(char sep) > +{ > + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') > + return true; > + return false; > +} > + > +static int uuid_parse(const char *str, uuid_le *uuid) > +{ > + int i; > + > + if (strlen(str) < UUID_CHAR_LENGTH) > + return -EINVAL; > + > + for (i = 0; i < UUID_BYTE_LENGTH; i++) { > + if (!isxdigit(str[0]) || !isxdigit(str[1])) { > + pr_err("%s err", __func__); > + return -EINVAL; > + } > + > + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); > + str += 2; > + if (is_uuid_sep(*str)) > + str++; > + } > + > + return 0; > +} > + > +/* mdev sysfs Functions */ > +static ssize_t mdev_supported_types_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + char *str, *ptr; > + ssize_t n; > + > + str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL); > + if (!str) > + return -ENOMEM; > + > + ptr = str; > + mdev_device_supported_config(dev, str); > + > + n = sprintf(buf, "%s\n", str); > + kfree(ptr); > + > + return n; > +} > + > +static ssize_t mdev_create_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + char *str, *pstr; > + char *uuid_str, *instance_str, *mdev_params = NULL, *params = NULL; > + uuid_le uuid; > + uint32_t instance; > + int ret; > + > + pstr = str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!str) > + return -ENOMEM; > + > + uuid_str = strsep(&str, ":"); > + if (!uuid_str) { > + pr_err("mdev_create: Empty UUID string %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + if (!str) { > + pr_err("mdev_create: mdev instance not present %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + instance_str = strsep(&str, ":"); > + if (!instance_str) { > + pr_err("mdev_create: Empty instance string %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + ret = kstrtouint(instance_str, 0, &instance); > + if (ret) { > + pr_err("mdev_create: mdev instance parsing error %s\n", buf); > + goto create_error; > + } > + > + if (str) > + params = mdev_params = kstrdup(str, GFP_KERNEL); > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_create: UUID parse error %s\n", buf); > + goto create_error; > + } > + > + ret = mdev_device_create(dev, uuid, instance, mdev_params); > + if (ret) > + pr_err("mdev_create: Failed to create mdev device\n"); > + else > + ret = count; > + > +create_error: > + kfree(params); > + kfree(pstr); > + return ret; > +} > + > +static ssize_t mdev_destroy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *str, *pstr; > + uuid_le uuid; > + unsigned int instance; > + int ret; > + > + str = pstr = kstrndup(buf, count, GFP_KERNEL); > + > + if (!str) > + return -ENOMEM; > + > + uuid_str = strsep(&str, ":"); > + if (!uuid_str) { > + pr_err("mdev_destroy: Empty UUID string %s\n", buf); > + ret = -EINVAL; > + goto destroy_error; > + } > + > + if (str == NULL) { > + pr_err("mdev_destroy: instance not specified %s\n", buf); > + ret = -EINVAL; > + goto destroy_error; > + } > + > + ret = kstrtouint(str, 0, &instance); > + if (ret) { > + pr_err("mdev_destroy: instance parsing error %s\n", buf); > + goto destroy_error; > + } > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_destroy: UUID parse error %s\n", buf); > + goto destroy_error; > + } > + > + ret = mdev_device_destroy(dev, uuid, instance); > + if (ret == 0) > + ret = count; > + > +destroy_error: > + kfree(pstr); > + return ret; > +} > + > +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *ptr; > + uuid_le uuid; > + int ret; > + > + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!uuid_str) > + return -ENOMEM; > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_start: UUID parse error %s\n", buf); > + goto start_error; > + } > + > + ret = mdev_device_start(uuid); > + if (ret == 0) > + ret = count; > + > +start_error: > + kfree(ptr); > + return ret; > +} > + > +ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *ptr; > + uuid_le uuid; > + int ret; > + > + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!uuid_str) > + return -ENOMEM; > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_shutdown: UUID parse error %s\n", buf); > + goto shutdown_error; > + } > + > + ret = mdev_device_shutdown(uuid); > + if (ret == 0) > + ret = count; > + > +shutdown_error: > + kfree(ptr); > + return ret; > + > +} > + > +struct class_attribute mdev_class_attrs[] = { > + __ATTR_WO(mdev_start), > + __ATTR_WO(mdev_shutdown), > + __ATTR_NULL > +}; > + > +int mdev_create_sysfs_files(struct device *dev) > +{ > + int ret; > + > + ret = sysfs_create_file(&dev->kobj, > + &dev_attr_mdev_supported_types.attr); > + if (ret) { > + pr_err("Failed to create mdev_supported_types sysfs entry\n"); > + return ret; > + } > + > + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr); > + if (ret) { > + pr_err("Failed to create mdev_create sysfs entry\n"); > + goto create_sysfs_failed; > + } > + > + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr); > + if (ret) { > + pr_err("Failed to create mdev_destroy sysfs entry\n"); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); > + } else > + return ret; > + > +create_sysfs_failed: > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); > + return ret; > +} > + > +void mdev_remove_sysfs_files(struct device *dev) > +{ > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr); > +} > diff --git a/include/linux/mdev.h b/include/linux/mdev.h > new file mode 100644 > index 000000000000..31b6f8572cfa > --- /dev/null > +++ b/include/linux/mdev.h > @@ -0,0 +1,232 @@ > +/* > + * Mediated device definition > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef MDEV_H > +#define MDEV_H > + > +/* Common Data structures */ > + > +struct pci_region_info { > + uint64_t start; > + uint64_t size; > + uint32_t flags; /* VFIO region info flags */ > +}; > + > +enum mdev_emul_space { > + EMUL_CONFIG_SPACE, /* PCI configuration space */ > + EMUL_IO, /* I/O register space */ > + EMUL_MMIO /* Memory-mapped I/O space */ > +}; > + > +struct parent_device; > + > +/* > + * Mediated device > + */ > + > +struct mdev_device { > + struct device dev; > + struct parent_device *parent; > + struct iommu_group *group; > + void *iommu_data; > + uuid_le uuid; > + uint32_t instance; > + > + /* internal only */ > + struct kref ref; > + struct mutex ops_lock; > + struct list_head next; > +}; > + > + > +/** > + * struct parent_ops - Structure to be registered for each parent device to > + * register the device to mdev module. > + * > + * @owner: The module owner. > + * @dev_attr_groups: Default attributes of the parent device. > + * @mdev_attr_groups: Default attributes of the mediated device. > + * @supported_config: Called to get information about supported types. > + * @dev : device structure of parent device. > + * @config: should return string listing supported config > + * Returns integer: success (0) or error (< 0) > + * @create: Called to allocate basic resources in parent device's > + * driver for a particular mediated device > + * @dev: parent device structure on which mediated device > + * should be created > + * @uuid: VM's uuid for which VM it is intended to > + * @instance: mediated instance in that VM > + * @mdev_params: extra parameters required by parent > + * device's driver. > + * Returns integer: success (0) or error (< 0) > + * @destroy: Called to free resources in parent device's driver for a > + * a mediated device instance of that VM. > + * @dev: parent device structure to which this mediated > + * device points to. > + * @uuid: VM's uuid for which the mediated device belongs > + * @instance: mdev instance in that VM > + * Returns integer: success (0) or error (< 0) > + * If VM is running and destroy() is called that means the > + * mdev is being hotunpluged. Return error if VM is running > + * and driver doesn't support mediated device hotplug. > + * @start: Called to initiate mediated device initialization > + * process in parent device's driver when VM boots before > + * VMM starts > + * @uuid: VM's UUID which is booting. > + * Returns integer: success (0) or error (< 0) > + * @shutdown: Called to teardown mediated device related resources for > + * the VM > + * @uuid: VM's UUID which is shutting down . > + * Returns integer: success (0) or error (< 0) > + * @read: Read emulation callback > + * @mdev: mediated device structure > + * @buf: read buffer > + * @count: number of bytes to read > + * @address_space: specifies for which address space the > + * request is intended for - pci_config_space, IO register > + * space or MMIO space. > + * @addr: address. > + * Retuns number on bytes read on success or error. > + * @write: Write emulation callback > + * @mdev: mediated device structure > + * @buf: write buffer > + * @count: number of bytes to be written > + * @address_space: specifies for which address space the > + * request is intended for - pci_config_space, IO register > + * space or MMIO space. > + * @addr: address. > + * Retuns number on bytes written on success or error. > + * @set_irqs: Called to send about interrupts configuration > + * information that VMM sets. > + * @mdev: mediated device structure > + * @flags, index, start, count and *data : same as that of > + * struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API. > + * @get_region_info: Called to get VFIO region size and flags of mediated > + * device. > + * @mdev: mediated device structure > + * @region_index: VFIO region index > + * @region_info: output, returns size and flags of > + * requested region. > + * Returns integer: success (0) or error (< 0) > + * @validate_map_request: Validate remap pfn request > + * @mdev: mediated device structure > + * @virtaddr: target user address to start at > + * @pfn: parent address of kernel memory, vendor driver > + * can change if required. > + * @size: size of map area, vendor driver can change the > + * size of map area if desired. > + * @prot: page protection flags for this mapping, vendor > + * driver can change, if required. > + * Returns integer: success (0) or error (< 0) > + * > + * Parent device that support mediated device should be registered with mdev > + * module with parent_ops structure. > + */ > + > +struct parent_ops { > + struct module *owner; > + const struct attribute_group **dev_attr_groups; > + const struct attribute_group **mdev_attr_groups; > + > + int (*supported_config)(struct device *dev, char *config); > + int (*create)(struct device *dev, uuid_le uuid, > + uint32_t instance, char *mdev_params); > + int (*destroy)(struct device *dev, uuid_le uuid, > + uint32_t instance); > + int (*start)(uuid_le uuid); > + int (*shutdown)(uuid_le uuid); > + ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count, > + enum mdev_emul_space address_space, loff_t pos); > + ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count, > + enum mdev_emul_space address_space, loff_t pos); > + int (*set_irqs)(struct mdev_device *vdev, uint32_t flags, > + unsigned int index, unsigned int start, > + unsigned int count, void *data); > + int (*get_region_info)(struct mdev_device *vdev, int region_index, > + struct pci_region_info *region_info); > + int (*validate_map_request)(struct mdev_device *vdev, > + unsigned long virtaddr, > + unsigned long *pfn, unsigned long *size, > + pgprot_t *prot); > +}; > + > +/* > + * Parent Device > + */ > +struct parent_device { > + struct device *dev; > + const struct parent_ops *ops; > + > + /* internal */ > + struct kref ref; > + struct mutex ops_lock; > + struct list_head next; > + struct list_head mdev_list; > + struct mutex mdev_list_lock; > + wait_queue_head_t release_done; > +}; > + > +/** > + * struct mdev_driver - Mediated device driver > + * @name: driver name > + * @probe: called when new device created > + * @remove: called when device removed > + * @match: called when new device or driver is added for this bus. Return 1 if > + * given device can be handled by given driver and zero otherwise. > + * @driver: device driver structure > + * > + **/ > +struct mdev_driver { > + const char *name; > + int (*probe)(struct device *dev); > + void (*remove)(struct device *dev); > + int (*match)(struct device *dev); > + struct device_driver driver; > +}; > + > +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv) > +{ > + return drv ? container_of(drv, struct mdev_driver, driver) : NULL; > +} > + > +static inline struct mdev_device *to_mdev_device(struct device *dev) > +{ > + return dev ? container_of(dev, struct mdev_device, dev) : NULL; > +} > + > +static inline void *mdev_get_drvdata(struct mdev_device *mdev) > +{ > + return dev_get_drvdata(&mdev->dev); > +} > + > +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) > +{ > + dev_set_drvdata(&mdev->dev, data); > +} > + > +extern struct bus_type mdev_bus_type; > + > +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) > + > +extern int mdev_register_device(struct device *dev, > + const struct parent_ops *ops); > +extern void mdev_unregister_device(struct device *dev); > + > +extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); > +extern void mdev_unregister_driver(struct mdev_driver *drv); > + > +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev); > +extern void mdev_put_device(struct mdev_device *mdev); > + > +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group); > + > +#endif /* MDEV_H */ > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 20 Jun 2016 22:01:46 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > Design for Mediated Device Driver: > Main purpose of this driver is to provide a common interface for mediated > device management that can be used by differnt drivers of different > devices. > > This module provides a generic interface to create the device, add it to > mediated bus, add device to IOMMU group and then add it to vfio group. > > Below is the high Level block diagram, with Nvidia, Intel and IBM devices > as example, since these are the devices which are going to actively use > this module as of now. > > +---------------+ > | | > | +-----------+ | mdev_register_driver() +--------------+ > | | | +<------------------------+ __init() | > | | | | | | > | | mdev | +------------------------>+ |<-> VFIO user > | | bus | | probe()/remove() | vfio_mpci.ko | APIs > | | driver | | | | > | | | | +--------------+ > | | | | mdev_register_driver() +--------------+ > | | | +<------------------------+ __init() | > | | | | | | > | | | +------------------------>+ |<-> VFIO user > | +-----------+ | probe()/remove() | vfio_mccw.ko | APIs > | | | | > | MDEV CORE | +--------------+ > | MODULE | > | mdev.ko | > | +-----------+ | mdev_register_device() +--------------+ > | | | +<------------------------+ | > | | | | | nvidia.ko |<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | | Physical | | > | | device | | mdev_register_device() +--------------+ > | | interface | |<------------------------+ | > | | | | | i915.ko |<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | | | | > | | | | mdev_register_device() +--------------+ > | | | +<------------------------+ | > | | | | | ccw_device.ko|<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | +-----------+ | > +---------------+ > > Core driver provides two types of registration interfaces: > 1. Registration interface for mediated bus driver: > > /** > * struct mdev_driver - Mediated device's driver > * @name: driver name > * @probe: called when new device created > * @remove:called when device removed > * @match: called when new device or driver is added for this bus. > Return 1 if given device can be handled by given driver and > zero otherwise. > * @driver:device driver structure > * > **/ > struct mdev_driver { > const char *name; > int (*probe) (struct device *dev); > void (*remove) (struct device *dev); > int (*match)(struct device *dev); > struct device_driver driver; > }; > > int mdev_register_driver(struct mdev_driver *drv, struct module *owner); > void mdev_unregister_driver(struct mdev_driver *drv); > > Mediated device's driver for mdev should use this interface to register > with Core driver. With this, mediated devices driver for such devices is > responsible to add mediated device to VFIO group. > > 2. Physical device driver interface > This interface provides vendor driver the set APIs to manage physical > device related work in their own driver. APIs are : > - supported_config: provide supported configuration list by the vendor > driver > - create: to allocate basic resources in vendor driver for a mediated > device. > - destroy: to free resources in vendor driver when mediated device is > destroyed. > - start: to initiate mediated device initialization process from vendor > driver when VM boots and before QEMU starts. > - shutdown: to teardown mediated device resources during VM teardown. > - read : read emulation callback. > - write: write emulation callback. > - set_irqs: send interrupt configuration information that QEMU sets. > - get_region_info: to provide region size and its flags for the mediated > device. > - validate_map_request: to validate remap pfn request. > > This registration interface should be used by vendor drivers to register > each physical device to mdev core driver. > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Signed-off-by: Neo Jia <cjia@nvidia.com> > Change-Id: I73a5084574270b14541c529461ea2f03c292d510 > --- > drivers/vfio/Kconfig | 1 + > drivers/vfio/Makefile | 1 + > drivers/vfio/mdev/Kconfig | 11 + > drivers/vfio/mdev/Makefile | 5 + > drivers/vfio/mdev/mdev_core.c | 595 +++++++++++++++++++++++++++++++++++++++ > drivers/vfio/mdev/mdev_driver.c | 138 +++++++++ > drivers/vfio/mdev/mdev_private.h | 33 +++ > drivers/vfio/mdev/mdev_sysfs.c | 300 ++++++++++++++++++++ > include/linux/mdev.h | 232 +++++++++++++++ > 9 files changed, 1316 insertions(+) > create mode 100644 drivers/vfio/mdev/Kconfig > create mode 100644 drivers/vfio/mdev/Makefile > create mode 100644 drivers/vfio/mdev/mdev_core.c > create mode 100644 drivers/vfio/mdev/mdev_driver.c > create mode 100644 drivers/vfio/mdev/mdev_private.h > create mode 100644 drivers/vfio/mdev/mdev_sysfs.c > create mode 100644 include/linux/mdev.h > > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig > index da6e2ce77495..23eced02aaf6 100644 > --- a/drivers/vfio/Kconfig > +++ b/drivers/vfio/Kconfig > @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU > > source "drivers/vfio/pci/Kconfig" > source "drivers/vfio/platform/Kconfig" > +source "drivers/vfio/mdev/Kconfig" > source "virt/lib/Kconfig" > diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile > index 7b8a31f63fea..7c70753e54ab 100644 > --- a/drivers/vfio/Makefile > +++ b/drivers/vfio/Makefile > @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o > obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o > obj-$(CONFIG_VFIO_PCI) += pci/ > obj-$(CONFIG_VFIO_PLATFORM) += platform/ > +obj-$(CONFIG_MDEV) += mdev/ > diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig > new file mode 100644 > index 000000000000..951e2bb06a3f > --- /dev/null > +++ b/drivers/vfio/mdev/Kconfig > @@ -0,0 +1,11 @@ > + > +config MDEV > + tristate "Mediated device driver framework" > + depends on VFIO > + default n > + help > + MDEV provides a framework to virtualize device without SR-IOV cap > + See Documentation/mdev.txt for more details. Documentation pointer still doesn't exist. Perhaps this file would be a more appropriate place than the commit log for some of the information above. Every time I review this I'm struggling to figure out why this isn't VFIO_MDEV since it's really tied to vfio and difficult to evaluate it as some sort of standalone mediated device interface. I don't know the answer, but it always strikes me as a discontinuity. > + > + If you don't know what do here, say N. > + > diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile > new file mode 100644 > index 000000000000..2c6d11f7bc24 > --- /dev/null > +++ b/drivers/vfio/mdev/Makefile > @@ -0,0 +1,5 @@ > + > +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o > + > +obj-$(CONFIG_MDEV) += mdev.o > + > diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c > new file mode 100644 > index 000000000000..3c45ed2ae1e9 > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_core.c > @@ -0,0 +1,595 @@ > +/* > + * Mediated device Core Driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/device.h> > +#include <linux/kernel.h> > +#include <linux/fs.h> > +#include <linux/slab.h> > +#include <linux/cdev.h> > +#include <linux/sched.h> > +#include <linux/uuid.h> > +#include <linux/vfio.h> > +#include <linux/iommu.h> > +#include <linux/sysfs.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "NVIDIA Corporation" > +#define DRIVER_DESC "Mediated device Core Driver" > + > +#define MDEV_CLASS_NAME "mdev" > + > +static struct devices_list { > + struct list_head dev_list; > + struct mutex list_lock; > +} parent_devices; > + I imagine this is following the example of struct vfio in vfio.c but for this usage the following seems much easier: static LIST_HEAD(parent_list) static DEFINE_MUTEX(parent_list_lock); Then you can also remove the initialization from mdev_init(). > +static int mdev_add_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + return sysfs_create_groups(&dev->kobj, groups); > +} > + > +static void mdev_remove_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + sysfs_remove_groups(&dev->kobj, groups); > +} > + > +static struct mdev_device *find_mdev_device(struct parent_device *parent, > + uuid_le uuid, int instance) > +{ > + struct mdev_device *mdev = NULL, *p; > + > + list_for_each_entry(p, &parent->mdev_list, next) { > + if ((uuid_le_cmp(p->uuid, uuid) == 0) && > + (p->instance == instance)) { > + mdev = p; Locking here is still broken, the callers are create and destroy, which can still race each other and themselves. > + break; > + } > + } > + return mdev; > +} > + > +/* Should be called holding parent_devices.list_lock */ > +static struct parent_device *find_parent_device(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + WARN_ON(!mutex_is_locked(&parent_devices.list_lock)); > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = p; > + break; > + } > + } > + return parent; > +} > + > +static void mdev_release_parent(struct kref *kref) > +{ > + struct parent_device *parent = container_of(kref, struct parent_device, > + ref); > + kfree(parent); > +} > + > +static > +inline struct parent_device *mdev_get_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_get(&parent->ref); > + > + return parent; > +} > + > +static inline void mdev_put_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_put(&parent->ref, mdev_release_parent); > +} > + > +static struct parent_device *mdev_get_parent_by_dev(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = mdev_get_parent(p); > + break; > + } > + } > + mutex_unlock(&parent_devices.list_lock); > + return parent; > +} > + > +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) > +{ > + struct parent_device *parent = mdev->parent; > + int ret; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->create) { How would a parent_device without ops->create or ops->destroy useful? Perhaps mdev_register_driver() should enforce required ops. mdev.h should at least document which ops are optional if they really are optional. > + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, > + mdev->instance, mdev_params); > + if (ret) > + goto create_ops_err; > + } > + > + ret = mdev_add_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); An error here seems to put us in a bad place, the device is created but the attributes are broken, is it the caller's responsibility to destroy? Seems like we need a cleanup if this fails. > +create_ops_err: > + mutex_unlock(&parent->ops_lock); It seems like ops_lock isn't used so much as a lock as a serialization mechanism. Why? Where is this serialization per parent device documented? > + return ret; > +} > + > +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) > +{ > + struct parent_device *parent = mdev->parent; > + int ret = 0; > + > + /* > + * If vendor driver doesn't return success that means vendor > + * driver doesn't support hot-unplug > + */ > + mutex_lock(&parent->ops_lock); > + if (parent->ops->destroy) { > + ret = parent->ops->destroy(parent->dev, mdev->uuid, > + mdev->instance); > + if (ret && !force) { It seems this is not so much a 'force' but an ignore errors, we never actually force the mdev driver to destroy the device... which makes me wonder if there are leaks there. > + ret = -EBUSY; > + goto destroy_ops_err; > + } > + } > + mdev_remove_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); > +destroy_ops_err: > + mutex_unlock(&parent->ops_lock); > + > + return ret; > +} > + > +static void mdev_release_device(struct kref *kref) > +{ > + struct mdev_device *mdev = container_of(kref, struct mdev_device, ref); > + struct parent_device *parent = mdev->parent; > + > + device_unregister(&mdev->dev); > + wake_up(&parent->release_done); > + mdev_put_parent(parent); > +} > + > +struct mdev_device *mdev_get_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_get(&mdev->ref); > + > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device); > + > +void mdev_put_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_put(&mdev->ref, mdev_release_device); > +} > +EXPORT_SYMBOL(mdev_put_device); > + > +/* > + * Find first mediated device from given uuid and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); This lock ordering is something we'll need to keep in mind. > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (uuid_le_cmp(p->uuid, uuid) == 0) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > + > +/* > + * Find mediated device from given iommu_group and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (!p->group) > + continue; > + > + if (iommu_group_id(p->group) == iommu_group_id(group)) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device_by_group); > + > +/* > + * mdev_register_device : Register a device > + * @dev: device structure representing parent device. > + * @ops: Parent device operation structure to be registered. > + * > + * Add device to list of registered parent devices. > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_device(struct device *dev, const struct parent_ops *ops) > +{ > + int ret = 0; > + struct parent_device *parent; > + > + if (!dev || !ops) > + return -EINVAL; > + > + mutex_lock(&parent_devices.list_lock); > + > + /* Check for duplicate */ > + parent = find_parent_device(dev); > + if (parent) { > + ret = -EEXIST; > + goto add_dev_err; > + } > + > + parent = kzalloc(sizeof(*parent), GFP_KERNEL); > + if (!parent) { > + ret = -ENOMEM; > + goto add_dev_err; > + } > + > + kref_init(&parent->ref); > + list_add(&parent->next, &parent_devices.dev_list); > + mutex_unlock(&parent_devices.list_lock); find_parent_device() matches based on parent->dev, but we're dropping the list lock before we setup parent->dev. There are other ways to shorten the time this lock is held, but releasing it with an incomplete entry in the list is not the way I would choose. > + > + parent->dev = dev; > + parent->ops = ops; > + mutex_init(&parent->ops_lock); > + mutex_init(&parent->mdev_list_lock); > + INIT_LIST_HEAD(&parent->mdev_list); > + init_waitqueue_head(&parent->release_done); > + > + ret = mdev_create_sysfs_files(dev); > + if (ret) > + goto add_sysfs_error; > + > + ret = mdev_add_attribute_group(dev, ops->dev_attr_groups); > + if (ret) > + goto add_group_error; > + > + dev_info(dev, "MDEV: Registered\n"); > + return 0; > + > +add_group_error: > + mdev_remove_sysfs_files(dev); > +add_sysfs_error: > + mutex_lock(&parent_devices.list_lock); > + list_del(&parent->next); > + mutex_unlock(&parent_devices.list_lock); > + mdev_put_parent(parent); > + return ret; > + > +add_dev_err: > + mutex_unlock(&parent_devices.list_lock); > + return ret; > +} > +EXPORT_SYMBOL(mdev_register_device); > + > +/* > + * mdev_unregister_device : Unregister a parent device > + * @dev: device structure representing parent device. > + * > + * Remove device from list of registered parent devices. Give a chance to free > + * existing mediated devices for given device. > + */ > + > +void mdev_unregister_device(struct device *dev) > +{ > + struct parent_device *parent; > + struct mdev_device *mdev, *n; > + int ret; > + > + mutex_lock(&parent_devices.list_lock); > + parent = find_parent_device(dev); > + > + if (!parent) { > + mutex_unlock(&parent_devices.list_lock); > + return; > + } > + dev_info(dev, "MDEV: Unregistering\n"); > + > + /* > + * Remove parent from the list and remove create and destroy sysfs > + * files so that no new mediated device could be created for this parent > + */ > + list_del(&parent->next); > + mdev_remove_sysfs_files(dev); > + mutex_unlock(&parent_devices.list_lock); > + > + mutex_lock(&parent->ops_lock); > + mdev_remove_attribute_group(dev, > + parent->ops->dev_attr_groups); > + mutex_unlock(&parent->ops_lock); > + > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { > + mdev_device_destroy_ops(mdev, true); > + list_del(&mdev->next); > + mdev_put_device(mdev); > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + do { > + ret = wait_event_interruptible_timeout(parent->release_done, > + list_empty(&parent->mdev_list), HZ * 10); But we do a list_del for each mdev in mdev_list above, how could the list not be empty here? I think you're trying to wait for all the mdev devices to be released, but I don't think this does that. Isn't the list empty regardless? > + if (ret == -ERESTARTSYS) { > + dev_warn(dev, "Mediated devices are in use, task" > + " \"%s\" (%d) " > + "blocked until all are released", > + current->comm, task_pid_nr(current)); > + } > + } while (ret <= 0); > + > + mdev_put_parent(parent); > +} > +EXPORT_SYMBOL(mdev_unregister_device); > + > +/* > + * Functions required for mdev-sysfs > + */ > +static void mdev_device_release(struct device *dev) > +{ > + struct mdev_device *mdev = to_mdev_device(dev); > + > + dev_dbg(&mdev->dev, "MDEV: destroying\n"); > + kfree(mdev); > +} > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params) > +{ > + int ret; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) > + return -EINVAL; > + > + /* Check for duplicate */ > + mdev = find_mdev_device(parent, uuid, instance); But this doesn't actually prevent duplicates because we we're not holding any lock the guarantee that another racing process doesn't create the same {uuid,instance} between where we check and the below list_add. > + if (mdev) { > + ret = -EEXIST; > + goto create_err; > + } > + > + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); > + if (!mdev) { > + ret = -ENOMEM; > + goto create_err; > + } > + > + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); > + mdev->instance = instance; > + mdev->parent = parent; > + mutex_init(&mdev->ops_lock); > + kref_init(&mdev->ref); > + > + mdev->dev.parent = dev; > + mdev->dev.bus = &mdev_bus_type; > + mdev->dev.release = mdev_device_release; > + dev_set_name(&mdev->dev, "%pUb-%d", uuid.b, instance); > + > + ret = device_register(&mdev->dev); > + if (ret) { > + put_device(&mdev->dev); > + goto create_err; > + } > + > + ret = mdev_device_create_ops(mdev, mdev_params); > + if (ret) > + goto create_failed; > + > + mutex_lock(&parent->mdev_list_lock); > + list_add(&mdev->next, &parent->mdev_list); > + mutex_unlock(&parent->mdev_list_lock); > + > + dev_dbg(&mdev->dev, "MDEV: created\n"); > + > + return ret; > + > +create_failed: > + device_unregister(&mdev->dev); > + > +create_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance) > +{ > + struct mdev_device *mdev; > + struct parent_device *parent; > + int ret; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) { > + ret = -EINVAL; > + goto destroy_err; > + } > + > + mdev = find_mdev_device(parent, uuid, instance); > + if (!mdev) { > + ret = -EINVAL; > + goto destroy_err; > + } Likewise, without locking multiple callers can get here with the same mdev. > + > + ret = mdev_device_destroy_ops(mdev, false); > + if (ret) > + goto destroy_err; > + > + mdev_put_parent(parent); > + > + mutex_lock(&parent->mdev_list_lock); > + list_del(&mdev->next); > + mutex_unlock(&parent->mdev_list_lock); > + > + mdev_put_device(mdev); > + return ret; > + > +destroy_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +void mdev_device_supported_config(struct device *dev, char *str) > +{ > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + > + if (parent) { > + mutex_lock(&parent->ops_lock); > + if (parent->ops->supported_config) > + parent->ops->supported_config(parent->dev, str); > + mutex_unlock(&parent->ops_lock); > + mdev_put_parent(parent); > + } > +} > + > +int mdev_device_start(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->start) > + ret = parent->ops->start(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_start failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE); > + > + mdev_put_device(mdev); > + > + return ret; > +} > + > +int mdev_device_shutdown(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->shutdown) > + ret = parent->ops->shutdown(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_shutdown failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE); > + > + mdev_put_device(mdev); > + return ret; > +} > + > +static struct class mdev_class = { > + .name = MDEV_CLASS_NAME, > + .owner = THIS_MODULE, > + .class_attrs = mdev_class_attrs, > +}; > + > +static int __init mdev_init(void) > +{ > + int ret; > + > + mutex_init(&parent_devices.list_lock); > + INIT_LIST_HEAD(&parent_devices.dev_list); > + > + ret = class_register(&mdev_class); > + if (ret) { > + pr_err("Failed to register mdev class\n"); > + return ret; > + } > + > + ret = mdev_bus_register(); > + if (ret) { > + pr_err("Failed to register mdev bus\n"); > + class_unregister(&mdev_class); > + return ret; > + } > + > + return ret; > +} > + > +static void __exit mdev_exit(void) > +{ > + mdev_bus_unregister(); > + class_unregister(&mdev_class); > +} > + > +module_init(mdev_init) > +module_exit(mdev_exit) > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c > new file mode 100644 > index 000000000000..f1aed541111d > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_driver.c > @@ -0,0 +1,138 @@ > +/* > + * MDEV driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/device.h> > +#include <linux/iommu.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +static int mdev_attach_iommu(struct mdev_device *mdev) > +{ > + int ret; > + struct iommu_group *group; > + > + group = iommu_group_alloc(); > + if (IS_ERR(group)) { > + dev_err(&mdev->dev, "MDEV: failed to allocate group!\n"); > + return PTR_ERR(group); > + } > + > + ret = iommu_group_add_device(group, &mdev->dev); > + if (ret) { > + dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n"); > + goto attach_fail; > + } > + > + mdev->group = group; > + > + dev_info(&mdev->dev, "MDEV: group_id = %d\n", > + iommu_group_id(group)); > +attach_fail: > + iommu_group_put(group); > + return ret; > +} > + > +static void mdev_detach_iommu(struct mdev_device *mdev) > +{ > + iommu_group_remove_device(&mdev->dev); mdev->group = NULL; seems prudent > + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); > +} > + > +static int mdev_probe(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + int ret; > + > + ret = mdev_attach_iommu(mdev); > + if (ret) { > + dev_err(dev, "Failed to attach IOMMU\n"); > + return ret; > + } > + > + if (drv && drv->probe) > + ret = drv->probe(dev); > + > + return ret; > +} > + > +static int mdev_remove(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + > + if (drv && drv->remove) > + drv->remove(dev); > + > + mdev_detach_iommu(mdev); > + > + return 0; > +} > + > +static int mdev_match(struct device *dev, struct device_driver *drv) > +{ > + struct mdev_driver *mdrv = to_mdev_driver(drv); nit, drv above, mdrv here > + > + if (mdrv && mdrv->match) > + return mdrv->match(dev); > + > + return 0; > +} > + > +struct bus_type mdev_bus_type = { > + .name = "mdev", > + .match = mdev_match, > + .probe = mdev_probe, > + .remove = mdev_remove, > +}; > +EXPORT_SYMBOL_GPL(mdev_bus_type); > + > +/* > + * mdev_register_driver - register a new MDEV driver > + * @drv: the driver to register > + * @owner: module owner of driver to be registered > + * > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) > +{ > + /* initialize common driver fields */ > + drv->driver.name = drv->name; > + drv->driver.bus = &mdev_bus_type; > + drv->driver.owner = owner; > + > + /* register with core */ > + return driver_register(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_register_driver); > + > +/* > + * mdev_unregister_driver - unregister MDEV driver > + * @drv: the driver to unregister > + * > + */ > +void mdev_unregister_driver(struct mdev_driver *drv) > +{ > + driver_unregister(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_unregister_driver); > + > +int mdev_bus_register(void) > +{ > + return bus_register(&mdev_bus_type); > +} > + > +void mdev_bus_unregister(void) > +{ > + bus_unregister(&mdev_bus_type); > +} > diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h > new file mode 100644 > index 000000000000..991d7f796169 > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_private.h > @@ -0,0 +1,33 @@ > +/* > + * Mediated device interal definitions > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef MDEV_PRIVATE_H > +#define MDEV_PRIVATE_H > + > +int mdev_bus_register(void); > +void mdev_bus_unregister(void); > + > +/* Function prototypes for mdev_sysfs */ > + > +extern struct class_attribute mdev_class_attrs[]; > + > +int mdev_create_sysfs_files(struct device *dev); > +void mdev_remove_sysfs_files(struct device *dev); > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params); > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); > +void mdev_device_supported_config(struct device *dev, char *str); > +int mdev_device_start(uuid_le uuid); > +int mdev_device_shutdown(uuid_le uuid); nit, stop is start as startup is to shutdown. IOW, should this be mdev_device_stop()? > + > +#endif /* MDEV_PRIVATE_H */ > diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c > new file mode 100644 > index 000000000000..48b66e40009e > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_sysfs.c > @@ -0,0 +1,300 @@ > +/* > + * File attributes for Mediated devices > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/sysfs.h> > +#include <linux/ctype.h> > +#include <linux/device.h> > +#include <linux/slab.h> > +#include <linux/uuid.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +/* Prototypes */ > +static ssize_t mdev_supported_types_show(struct device *dev, > + struct device_attribute *attr, > + char *buf); > +static DEVICE_ATTR_RO(mdev_supported_types); > + > +static ssize_t mdev_create_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_create); > + > +static ssize_t mdev_destroy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_destroy); > + > +/* Static functions */ > + > +#define UUID_CHAR_LENGTH 36 > +#define UUID_BYTE_LENGTH 16 > + > +#define SUPPORTED_TYPE_BUFFER_LENGTH 1024 > + > +static inline bool is_uuid_sep(char sep) > +{ > + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') > + return true; > + return false; > +} > + > +static int uuid_parse(const char *str, uuid_le *uuid) > +{ > + int i; > + > + if (strlen(str) < UUID_CHAR_LENGTH) > + return -EINVAL; > + > + for (i = 0; i < UUID_BYTE_LENGTH; i++) { > + if (!isxdigit(str[0]) || !isxdigit(str[1])) { > + pr_err("%s err", __func__); > + return -EINVAL; > + } > + > + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); > + str += 2; > + if (is_uuid_sep(*str)) > + str++; > + } > + > + return 0; > +} > + > +/* mdev sysfs Functions */ > +static ssize_t mdev_supported_types_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + char *str, *ptr; > + ssize_t n; > + > + str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL); > + if (!str) > + return -ENOMEM; > + > + ptr = str; > + mdev_device_supported_config(dev, str); > + > + n = sprintf(buf, "%s\n", str); > + kfree(ptr); > + > + return n; > +} > + > +static ssize_t mdev_create_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + char *str, *pstr; > + char *uuid_str, *instance_str, *mdev_params = NULL, *params = NULL; > + uuid_le uuid; > + uint32_t instance; > + int ret; > + > + pstr = str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!str) > + return -ENOMEM; > + > + uuid_str = strsep(&str, ":"); > + if (!uuid_str) { > + pr_err("mdev_create: Empty UUID string %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + if (!str) { > + pr_err("mdev_create: mdev instance not present %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + instance_str = strsep(&str, ":"); > + if (!instance_str) { > + pr_err("mdev_create: Empty instance string %s\n", buf); > + ret = -EINVAL; > + goto create_error; > + } > + > + ret = kstrtouint(instance_str, 0, &instance); > + if (ret) { > + pr_err("mdev_create: mdev instance parsing error %s\n", buf); > + goto create_error; > + } > + > + if (str) > + params = mdev_params = kstrdup(str, GFP_KERNEL); > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_create: UUID parse error %s\n", buf); > + goto create_error; > + } > + > + ret = mdev_device_create(dev, uuid, instance, mdev_params); > + if (ret) > + pr_err("mdev_create: Failed to create mdev device\n"); > + else > + ret = count; > + > +create_error: > + kfree(params); > + kfree(pstr); > + return ret; > +} > + > +static ssize_t mdev_destroy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *str, *pstr; > + uuid_le uuid; > + unsigned int instance; > + int ret; > + > + str = pstr = kstrndup(buf, count, GFP_KERNEL); > + > + if (!str) > + return -ENOMEM; > + > + uuid_str = strsep(&str, ":"); > + if (!uuid_str) { > + pr_err("mdev_destroy: Empty UUID string %s\n", buf); > + ret = -EINVAL; > + goto destroy_error; > + } > + > + if (str == NULL) { > + pr_err("mdev_destroy: instance not specified %s\n", buf); > + ret = -EINVAL; > + goto destroy_error; > + } > + > + ret = kstrtouint(str, 0, &instance); > + if (ret) { > + pr_err("mdev_destroy: instance parsing error %s\n", buf); > + goto destroy_error; > + } > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_destroy: UUID parse error %s\n", buf); > + goto destroy_error; > + } > + > + ret = mdev_device_destroy(dev, uuid, instance); > + if (ret == 0) > + ret = count; > + > +destroy_error: > + kfree(pstr); > + return ret; > +} > + > +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *ptr; > + uuid_le uuid; > + int ret; > + > + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!uuid_str) > + return -ENOMEM; > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_start: UUID parse error %s\n", buf); > + goto start_error; > + } > + > + ret = mdev_device_start(uuid); > + if (ret == 0) > + ret = count; > + > +start_error: > + kfree(ptr); > + return ret; > +} > + > +ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr, > + const char *buf, size_t count) > +{ > + char *uuid_str, *ptr; > + uuid_le uuid; > + int ret; > + > + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); > + > + if (!uuid_str) > + return -ENOMEM; > + > + ret = uuid_parse(uuid_str, &uuid); > + if (ret) { > + pr_err("mdev_shutdown: UUID parse error %s\n", buf); > + goto shutdown_error; > + } > + > + ret = mdev_device_shutdown(uuid); > + if (ret == 0) > + ret = count; > + > +shutdown_error: > + kfree(ptr); > + return ret; > + > +} > + > +struct class_attribute mdev_class_attrs[] = { > + __ATTR_WO(mdev_start), > + __ATTR_WO(mdev_shutdown), > + __ATTR_NULL > +}; > + > +int mdev_create_sysfs_files(struct device *dev) > +{ > + int ret; > + > + ret = sysfs_create_file(&dev->kobj, > + &dev_attr_mdev_supported_types.attr); > + if (ret) { > + pr_err("Failed to create mdev_supported_types sysfs entry\n"); > + return ret; > + } > + > + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr); > + if (ret) { > + pr_err("Failed to create mdev_create sysfs entry\n"); > + goto create_sysfs_failed; > + } > + > + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr); > + if (ret) { > + pr_err("Failed to create mdev_destroy sysfs entry\n"); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); > + } else > + return ret; > + > +create_sysfs_failed: > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); > + return ret; > +} > + > +void mdev_remove_sysfs_files(struct device *dev) > +{ > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); > + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr); > +} > diff --git a/include/linux/mdev.h b/include/linux/mdev.h > new file mode 100644 > index 000000000000..31b6f8572cfa > --- /dev/null > +++ b/include/linux/mdev.h > @@ -0,0 +1,232 @@ > +/* > + * Mediated device definition > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef MDEV_H > +#define MDEV_H > + > +/* Common Data structures */ > + > +struct pci_region_info { > + uint64_t start; > + uint64_t size; > + uint32_t flags; /* VFIO region info flags */ > +}; > + > +enum mdev_emul_space { > + EMUL_CONFIG_SPACE, /* PCI configuration space */ > + EMUL_IO, /* I/O register space */ > + EMUL_MMIO /* Memory-mapped I/O space */ > +}; I'm still confused why this is needed, perhaps a description here would be useful so I can stop asking. Clearly config space is PCI only, so it's strange to have it in the common code. Everyone not on x86 will say I/O space is also strange. I can't keep it in my head why the read/write offsets aren't sufficient for the driver to figure out what type it is. > + > +struct parent_device; > + > +/* > + * Mediated device > + */ > + > +struct mdev_device { > + struct device dev; > + struct parent_device *parent; > + struct iommu_group *group; > + void *iommu_data; > + uuid_le uuid; > + uint32_t instance; > + > + /* internal only */ > + struct kref ref; > + struct mutex ops_lock; > + struct list_head next; > +}; > + > + > +/** > + * struct parent_ops - Structure to be registered for each parent device to > + * register the device to mdev module. > + * > + * @owner: The module owner. > + * @dev_attr_groups: Default attributes of the parent device. > + * @mdev_attr_groups: Default attributes of the mediated device. > + * @supported_config: Called to get information about supported types. > + * @dev : device structure of parent device. > + * @config: should return string listing supported config > + * Returns integer: success (0) or error (< 0) > + * @create: Called to allocate basic resources in parent device's > + * driver for a particular mediated device > + * @dev: parent device structure on which mediated device > + * should be created > + * @uuid: VM's uuid for which VM it is intended to > + * @instance: mediated instance in that VM > + * @mdev_params: extra parameters required by parent > + * device's driver. > + * Returns integer: success (0) or error (< 0) > + * @destroy: Called to free resources in parent device's driver for a > + * a mediated device instance of that VM. > + * @dev: parent device structure to which this mediated > + * device points to. > + * @uuid: VM's uuid for which the mediated device belongs > + * @instance: mdev instance in that VM > + * Returns integer: success (0) or error (< 0) > + * If VM is running and destroy() is called that means the > + * mdev is being hotunpluged. Return error if VM is running > + * and driver doesn't support mediated device hotplug. > + * @start: Called to initiate mediated device initialization > + * process in parent device's driver when VM boots before > + * VMM starts > + * @uuid: VM's UUID which is booting. > + * Returns integer: success (0) or error (< 0) > + * @shutdown: Called to teardown mediated device related resources for > + * the VM > + * @uuid: VM's UUID which is shutting down . > + * Returns integer: success (0) or error (< 0) > + * @read: Read emulation callback > + * @mdev: mediated device structure > + * @buf: read buffer > + * @count: number of bytes to read > + * @address_space: specifies for which address space the > + * request is intended for - pci_config_space, IO register > + * space or MMIO space. > + * @addr: address. > + * Retuns number on bytes read on success or error. > + * @write: Write emulation callback > + * @mdev: mediated device structure > + * @buf: write buffer > + * @count: number of bytes to be written > + * @address_space: specifies for which address space the > + * request is intended for - pci_config_space, IO register > + * space or MMIO space. > + * @addr: address. > + * Retuns number on bytes written on success or error. > + * @set_irqs: Called to send about interrupts configuration > + * information that VMM sets. > + * @mdev: mediated device structure > + * @flags, index, start, count and *data : same as that of > + * struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API. > + * @get_region_info: Called to get VFIO region size and flags of mediated > + * device. > + * @mdev: mediated device structure > + * @region_index: VFIO region index > + * @region_info: output, returns size and flags of > + * requested region. > + * Returns integer: success (0) or error (< 0) > + * @validate_map_request: Validate remap pfn request > + * @mdev: mediated device structure > + * @virtaddr: target user address to start at > + * @pfn: parent address of kernel memory, vendor driver > + * can change if required. > + * @size: size of map area, vendor driver can change the > + * size of map area if desired. > + * @prot: page protection flags for this mapping, vendor > + * driver can change, if required. > + * Returns integer: success (0) or error (< 0) > + * > + * Parent device that support mediated device should be registered with mdev > + * module with parent_ops structure. > + */ > + > +struct parent_ops { > + struct module *owner; > + const struct attribute_group **dev_attr_groups; > + const struct attribute_group **mdev_attr_groups; > + > + int (*supported_config)(struct device *dev, char *config); > + int (*create)(struct device *dev, uuid_le uuid, > + uint32_t instance, char *mdev_params); > + int (*destroy)(struct device *dev, uuid_le uuid, > + uint32_t instance); > + int (*start)(uuid_le uuid); > + int (*shutdown)(uuid_le uuid); > + ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count, > + enum mdev_emul_space address_space, loff_t pos); > + ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count, > + enum mdev_emul_space address_space, loff_t pos); > + int (*set_irqs)(struct mdev_device *vdev, uint32_t flags, > + unsigned int index, unsigned int start, > + unsigned int count, void *data); > + int (*get_region_info)(struct mdev_device *vdev, int region_index, > + struct pci_region_info *region_info); This can't be //pci_//region_info. How do you intend to support things like sparse mmap capabilities in the user REGION_INFO ioctl when such things are not part of the mediated device API? Seems like the driver should just return a buffer. > + int (*validate_map_request)(struct mdev_device *vdev, > + unsigned long virtaddr, > + unsigned long *pfn, unsigned long *size, > + pgprot_t *prot); > +}; > + > +/* > + * Parent Device > + */ > +struct parent_device { > + struct device *dev; > + const struct parent_ops *ops; > + > + /* internal */ > + struct kref ref; > + struct mutex ops_lock; > + struct list_head next; > + struct list_head mdev_list; > + struct mutex mdev_list_lock; > + wait_queue_head_t release_done; > +}; > + > +/** > + * struct mdev_driver - Mediated device driver > + * @name: driver name > + * @probe: called when new device created > + * @remove: called when device removed > + * @match: called when new device or driver is added for this bus. Return 1 if > + * given device can be handled by given driver and zero otherwise. > + * @driver: device driver structure > + * > + **/ > +struct mdev_driver { > + const char *name; > + int (*probe)(struct device *dev); > + void (*remove)(struct device *dev); > + int (*match)(struct device *dev); > + struct device_driver driver; > +}; > + > +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv) > +{ > + return drv ? container_of(drv, struct mdev_driver, driver) : NULL; > +} > + > +static inline struct mdev_device *to_mdev_device(struct device *dev) > +{ > + return dev ? container_of(dev, struct mdev_device, dev) : NULL; > +} > + > +static inline void *mdev_get_drvdata(struct mdev_device *mdev) > +{ > + return dev_get_drvdata(&mdev->dev); > +} > + > +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) > +{ > + dev_set_drvdata(&mdev->dev, data); > +} > + > +extern struct bus_type mdev_bus_type; > + > +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) > + > +extern int mdev_register_device(struct device *dev, > + const struct parent_ops *ops); > +extern void mdev_unregister_device(struct device *dev); > + > +extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); > +extern void mdev_unregister_driver(struct mdev_driver *drv); > + > +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev); > +extern void mdev_put_device(struct mdev_device *mdev); > + > +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group); > + > +#endif /* MDEV_H */ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Alex, Thanks for taking closer look. I'll incorporate all the nits you suggested. On 6/22/2016 3:00 AM, Alex Williamson wrote: > On Mon, 20 Jun 2016 22:01:46 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > ... >> + >> +config MDEV >> + tristate "Mediated device driver framework" >> + depends on VFIO >> + default n >> + help >> + MDEV provides a framework to virtualize device without SR-IOV cap >> + See Documentation/mdev.txt for more details. > > Documentation pointer still doesn't exist. Perhaps this file would be > a more appropriate place than the commit log for some of the > information above. > Sure, I'll add these details to documentation. > Every time I review this I'm struggling to figure out why this isn't > VFIO_MDEV since it's really tied to vfio and difficult to evaluate it > as some sort of standalone mediated device interface. I don't know > the answer, but it always strikes me as a discontinuity. > Ok. I'll change to VFIO_MDEV >> + >> +static struct mdev_device *find_mdev_device(struct parent_device *parent, >> + uuid_le uuid, int instance) >> +{ >> + struct mdev_device *mdev = NULL, *p; >> + >> + list_for_each_entry(p, &parent->mdev_list, next) { >> + if ((uuid_le_cmp(p->uuid, uuid) == 0) && >> + (p->instance == instance)) { >> + mdev = p; > > Locking here is still broken, the callers are create and destroy, which > can still race each other and themselves. > Fixed it. >> + >> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) >> +{ >> + struct parent_device *parent = mdev->parent; >> + int ret; >> + >> + mutex_lock(&parent->ops_lock); >> + if (parent->ops->create) { > > How would a parent_device without ops->create or ops->destroy useful? > Perhaps mdev_register_driver() should enforce required ops. mdev.h > should at least document which ops are optional if they really are > optional. Makes sense, adding check in mdev_register_driver() to mandate create and destroy in ops. I'll also update the comments in mdev.h for mandatory and optional ops. > >> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, >> + mdev->instance, mdev_params); >> + if (ret) >> + goto create_ops_err; >> + } >> + >> + ret = mdev_add_attribute_group(&mdev->dev, >> + parent->ops->mdev_attr_groups); > > An error here seems to put us in a bad place, the device is created but > the attributes are broken, is it the caller's responsibility to > destroy? Seems like we need a cleanup if this fails. > Right, adding cleanup here. >> +create_ops_err: >> + mutex_unlock(&parent->ops_lock); > > It seems like ops_lock isn't used so much as a lock as a serialization > mechanism. Why? Where is this serialization per parent device > documented? > parent->ops_lock is to serialize parent device callbacks to vendor driver, i.e supported_config(), create() and destroy(). mdev->ops_lock is to serialize mediated device related callbacks to vendor driver, i.e. start(), stop(), read(), write(), set_irqs(), get_region_info(), validate_map_request(). Its not documented, I'll add comments to mdev.h about these locks. >> + return ret; >> +} >> + >> +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) >> +{ >> + struct parent_device *parent = mdev->parent; >> + int ret = 0; >> + >> + /* >> + * If vendor driver doesn't return success that means vendor >> + * driver doesn't support hot-unplug >> + */ >> + mutex_lock(&parent->ops_lock); >> + if (parent->ops->destroy) { >> + ret = parent->ops->destroy(parent->dev, mdev->uuid, >> + mdev->instance); >> + if (ret && !force) { > > It seems this is not so much a 'force' but an ignore errors, we never > actually force the mdev driver to destroy the device... which makes me > wonder if there are leaks there. > Consider a case where VM is running or in teardown path and parent device in unbound from vendor driver, then vendor driver would call mdev_unregister_device() from its remove() call. Even if parent->ops->destroy() returns error that could also mean that hot-unplug is not supported but we have to destroy mdev device. remove() call doesn't honor error returned. In that case its a force removal. >> + >> +/* >> + * mdev_unregister_device : Unregister a parent device >> + * @dev: device structure representing parent device. >> + * >> + * Remove device from list of registered parent devices. Give a chance to free >> + * existing mediated devices for given device. >> + */ >> + >> +void mdev_unregister_device(struct device *dev) >> +{ >> + struct parent_device *parent; >> + struct mdev_device *mdev, *n; >> + int ret; >> + >> + mutex_lock(&parent_devices.list_lock); >> + parent = find_parent_device(dev); >> + >> + if (!parent) { >> + mutex_unlock(&parent_devices.list_lock); >> + return; >> + } >> + dev_info(dev, "MDEV: Unregistering\n"); >> + >> + /* >> + * Remove parent from the list and remove create and destroy sysfs >> + * files so that no new mediated device could be created for this parent >> + */ >> + list_del(&parent->next); >> + mdev_remove_sysfs_files(dev); >> + mutex_unlock(&parent_devices.list_lock); >> + >> + mutex_lock(&parent->ops_lock); >> + mdev_remove_attribute_group(dev, >> + parent->ops->dev_attr_groups); >> + mutex_unlock(&parent->ops_lock); >> + >> + mutex_lock(&parent->mdev_list_lock); >> + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { >> + mdev_device_destroy_ops(mdev, true); >> + list_del(&mdev->next); >> + mdev_put_device(mdev); >> + } >> + mutex_unlock(&parent->mdev_list_lock); >> + >> + do { >> + ret = wait_event_interruptible_timeout(parent->release_done, >> + list_empty(&parent->mdev_list), HZ * 10); > > But we do a list_del for each mdev in mdev_list above, how could the > list not be empty here? I think you're trying to wait for all the mdev > devices to be released, but I don't think this does that. Isn't the > list empty regardless? > Right, I do want to wait for all the mdev devices to be released. Moving list_del(&mdev->next) from the above for loop to mdev_release_device() so that mdev will be removed from list on last mdev_put_device(). >> + if (ret == -ERESTARTSYS) { >> + dev_warn(dev, "Mediated devices are in use, task" >> + " \"%s\" (%d) " >> + "blocked until all are released", >> + current->comm, task_pid_nr(current)); >> + } >> + } while (ret <= 0); >> + >> + mdev_put_parent(parent); >> +} >> +EXPORT_SYMBOL(mdev_unregister_device); >> + >> + >> +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, >> + char *mdev_params) >> +{ >> + int ret; >> + struct mdev_device *mdev; >> + struct parent_device *parent; >> + >> + parent = mdev_get_parent_by_dev(dev); >> + if (!parent) >> + return -EINVAL; >> + >> + /* Check for duplicate */ >> + mdev = find_mdev_device(parent, uuid, instance); > > But this doesn't actually prevent duplicates because we we're not > holding any lock the guarantee that another racing process doesn't > create the same {uuid,instance} between where we check and the below > list_add. > Oops I missed this race condition. Moving mutex_lock(&parent->mdev_list_lock); before find_mdev_device() in mdev_device_create() and mdev_device_destroy(). >> + >> +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, >> + char *mdev_params); >> +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); >> +void mdev_device_supported_config(struct device *dev, char *str); >> +int mdev_device_start(uuid_le uuid); >> +int mdev_device_shutdown(uuid_le uuid); > > nit, stop is start as startup is to shutdown. IOW, should this be > mdev_device_stop()? > Ok. Renaming mdev_device_shutdown() to mdev_device_stop(). >> + >> +struct pci_region_info { >> + uint64_t start; >> + uint64_t size; >> + uint32_t flags; /* VFIO region info flags */ >> +}; >> + >> +enum mdev_emul_space { >> + EMUL_CONFIG_SPACE, /* PCI configuration space */ >> + EMUL_IO, /* I/O register space */ >> + EMUL_MMIO /* Memory-mapped I/O space */ >> +}; > > > I'm still confused why this is needed, perhaps a description here would > be useful so I can stop asking. Clearly config space is PCI only, so > it's strange to have it in the common code. Everyone not on x86 will > say I/O space is also strange. I can't keep it in my head why the > read/write offsets aren't sufficient for the driver to figure out what > type it is. > > Now that VFIO_PCI_OFFSET_* macros are moved to vfio.h which vendor driver can also use, above enum could be removed from read/write. But again these macros are useful when parent device is PCI device. How would non-pci parent device differentiate IO ports and MMIO? >> + int (*get_region_info)(struct mdev_device *vdev, int region_index, >> + struct pci_region_info *region_info); > > This can't be //pci_//region_info. How do you intend to support things > like sparse mmap capabilities in the user REGION_INFO ioctl when such > things are not part of the mediated device API? Seems like the driver > should just return a buffer. > If not pci_region_info, can use vfio_region_info here, even to fetch sparce mmap capabilities from vendor driver? Thanks, Kirti. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 24 Jun 2016 23:24:58 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > Alex, > > Thanks for taking closer look. I'll incorporate all the nits you suggested. > > On 6/22/2016 3:00 AM, Alex Williamson wrote: > > On Mon, 20 Jun 2016 22:01:46 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > ... > >> + > >> +config MDEV > >> + tristate "Mediated device driver framework" > >> + depends on VFIO > >> + default n > >> + help > >> + MDEV provides a framework to virtualize device without SR-IOV cap > >> + See Documentation/mdev.txt for more details. > > > > Documentation pointer still doesn't exist. Perhaps this file would be > > a more appropriate place than the commit log for some of the > > information above. > > > > Sure, I'll add these details to documentation. > > > Every time I review this I'm struggling to figure out why this isn't > > VFIO_MDEV since it's really tied to vfio and difficult to evaluate it > > as some sort of standalone mediated device interface. I don't know > > the answer, but it always strikes me as a discontinuity. > > > > Ok. I'll change to VFIO_MDEV > > >> + > >> +static struct mdev_device *find_mdev_device(struct parent_device *parent, > >> + uuid_le uuid, int instance) > >> +{ > >> + struct mdev_device *mdev = NULL, *p; > >> + > >> + list_for_each_entry(p, &parent->mdev_list, next) { > >> + if ((uuid_le_cmp(p->uuid, uuid) == 0) && > >> + (p->instance == instance)) { > >> + mdev = p; > > > > Locking here is still broken, the callers are create and destroy, which > > can still race each other and themselves. > > > > Fixed it. > > >> + > >> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) > >> +{ > >> + struct parent_device *parent = mdev->parent; > >> + int ret; > >> + > >> + mutex_lock(&parent->ops_lock); > >> + if (parent->ops->create) { > > > > How would a parent_device without ops->create or ops->destroy useful? > > Perhaps mdev_register_driver() should enforce required ops. mdev.h > > should at least document which ops are optional if they really are > > optional. > > Makes sense, adding check in mdev_register_driver() to mandate create > and destroy in ops. I'll also update the comments in mdev.h for > mandatory and optional ops. > > > > >> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, > >> + mdev->instance, mdev_params); > >> + if (ret) > >> + goto create_ops_err; > >> + } > >> + > >> + ret = mdev_add_attribute_group(&mdev->dev, > >> + parent->ops->mdev_attr_groups); > > > > An error here seems to put us in a bad place, the device is created but > > the attributes are broken, is it the caller's responsibility to > > destroy? Seems like we need a cleanup if this fails. > > > > Right, adding cleanup here. > > >> +create_ops_err: > >> + mutex_unlock(&parent->ops_lock); > > > > It seems like ops_lock isn't used so much as a lock as a serialization > > mechanism. Why? Where is this serialization per parent device > > documented? > > > > parent->ops_lock is to serialize parent device callbacks to vendor > driver, i.e supported_config(), create() and destroy(). > mdev->ops_lock is to serialize mediated device related callbacks to > vendor driver, i.e. start(), stop(), read(), write(), set_irqs(), > get_region_info(), validate_map_request(). > Its not documented, I'll add comments to mdev.h about these locks. Should it be the mediated driver core's responsibility to do this? If a given mediated driver wants to serialize on their own, they can do that, but I don't see why we would impose that on every mediated driver. > >> + return ret; > >> +} > >> + > >> +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) > >> +{ > >> + struct parent_device *parent = mdev->parent; > >> + int ret = 0; > >> + > >> + /* > >> + * If vendor driver doesn't return success that means vendor > >> + * driver doesn't support hot-unplug > >> + */ > >> + mutex_lock(&parent->ops_lock); > >> + if (parent->ops->destroy) { > >> + ret = parent->ops->destroy(parent->dev, mdev->uuid, > >> + mdev->instance); > >> + if (ret && !force) { > > > > It seems this is not so much a 'force' but an ignore errors, we never > > actually force the mdev driver to destroy the device... which makes me > > wonder if there are leaks there. > > > > Consider a case where VM is running or in teardown path and parent > device in unbound from vendor driver, then vendor driver would call > mdev_unregister_device() from its remove() call. Even if > parent->ops->destroy() returns error that could also mean that > hot-unplug is not supported but we have to destroy mdev device. remove() > call doesn't honor error returned. In that case its a force removal. > > >> + > >> +/* > >> + * mdev_unregister_device : Unregister a parent device > >> + * @dev: device structure representing parent device. > >> + * > >> + * Remove device from list of registered parent devices. Give a chance to free > >> + * existing mediated devices for given device. > >> + */ > >> + > >> +void mdev_unregister_device(struct device *dev) > >> +{ > >> + struct parent_device *parent; > >> + struct mdev_device *mdev, *n; > >> + int ret; > >> + > >> + mutex_lock(&parent_devices.list_lock); > >> + parent = find_parent_device(dev); > >> + > >> + if (!parent) { > >> + mutex_unlock(&parent_devices.list_lock); > >> + return; > >> + } > >> + dev_info(dev, "MDEV: Unregistering\n"); > >> + > >> + /* > >> + * Remove parent from the list and remove create and destroy sysfs > >> + * files so that no new mediated device could be created for this parent > >> + */ > >> + list_del(&parent->next); > >> + mdev_remove_sysfs_files(dev); > >> + mutex_unlock(&parent_devices.list_lock); > >> + > >> + mutex_lock(&parent->ops_lock); > >> + mdev_remove_attribute_group(dev, > >> + parent->ops->dev_attr_groups); > >> + mutex_unlock(&parent->ops_lock); > >> + > >> + mutex_lock(&parent->mdev_list_lock); > >> + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { > >> + mdev_device_destroy_ops(mdev, true); > >> + list_del(&mdev->next); > >> + mdev_put_device(mdev); > >> + } > >> + mutex_unlock(&parent->mdev_list_lock); > >> + > >> + do { > >> + ret = wait_event_interruptible_timeout(parent->release_done, > >> + list_empty(&parent->mdev_list), HZ * 10); > > > > But we do a list_del for each mdev in mdev_list above, how could the > > list not be empty here? I think you're trying to wait for all the mdev > > devices to be released, but I don't think this does that. Isn't the > > list empty regardless? > > > > Right, I do want to wait for all the mdev devices to be released. Moving > list_del(&mdev->next) from the above for loop to mdev_release_device() > so that mdev will be removed from list on last mdev_put_device(). > > > >> + if (ret == -ERESTARTSYS) { > >> + dev_warn(dev, "Mediated devices are in use, task" > >> + " \"%s\" (%d) " > >> + "blocked until all are released", > >> + current->comm, task_pid_nr(current)); > >> + } > >> + } while (ret <= 0); > >> + > >> + mdev_put_parent(parent); > >> +} > >> +EXPORT_SYMBOL(mdev_unregister_device); > >> + > >> + > > > >> +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > >> + char *mdev_params) > >> +{ > >> + int ret; > >> + struct mdev_device *mdev; > >> + struct parent_device *parent; > >> + > >> + parent = mdev_get_parent_by_dev(dev); > >> + if (!parent) > >> + return -EINVAL; > >> + > >> + /* Check for duplicate */ > >> + mdev = find_mdev_device(parent, uuid, instance); > > > > But this doesn't actually prevent duplicates because we we're not > > holding any lock the guarantee that another racing process doesn't > > create the same {uuid,instance} between where we check and the below > > list_add. > > > > Oops I missed this race condition. Moving > mutex_lock(&parent->mdev_list_lock); > before find_mdev_device() in mdev_device_create() and > mdev_device_destroy(). > > > >> + > >> +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > >> + char *mdev_params); > >> +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); > >> +void mdev_device_supported_config(struct device *dev, char *str); > >> +int mdev_device_start(uuid_le uuid); > >> +int mdev_device_shutdown(uuid_le uuid); > > > > nit, stop is start as startup is to shutdown. IOW, should this be > > mdev_device_stop()? > > > > Ok. Renaming mdev_device_shutdown() to mdev_device_stop(). > > > >> + > >> +struct pci_region_info { > >> + uint64_t start; > >> + uint64_t size; > >> + uint32_t flags; /* VFIO region info flags */ > >> +}; > >> + > >> +enum mdev_emul_space { > >> + EMUL_CONFIG_SPACE, /* PCI configuration space */ > >> + EMUL_IO, /* I/O register space */ > >> + EMUL_MMIO /* Memory-mapped I/O space */ > >> +}; > > > > > > I'm still confused why this is needed, perhaps a description here would > > be useful so I can stop asking. Clearly config space is PCI only, so > > it's strange to have it in the common code. Everyone not on x86 will > > say I/O space is also strange. I can't keep it in my head why the > > read/write offsets aren't sufficient for the driver to figure out what > > type it is. > > > > > > Now that VFIO_PCI_OFFSET_* macros are moved to vfio.h which vendor > driver can also use, above enum could be removed from read/write. But > again these macros are useful when parent device is PCI device. How > would non-pci parent device differentiate IO ports and MMIO? Moving VFIO_PCI_OFFSET_* to vfio.h already worries me, the vfio api does not impose fixed offsets, it's simply an implementation detail of vfio-pci. We should be free to change that whenever we want and not break userspace. By moving it to vfio.h and potentially having external mediated drivers depend on those offset macros, they now become part of the kABI. So more and more, I'd prefer that reads/writes/mmaps get passed directly to the mediated driver, let them define which offset is which, the core is just a passthrough. For non-PCI devices, like platform devices, the indexes are implementation specific, the user really needs to know how to work with the specific device and how it defines device mmio to region indexes. > >> + int (*get_region_info)(struct mdev_device *vdev, int region_index, > >> + struct pci_region_info *region_info); > > > > This can't be //pci_//region_info. How do you intend to support things > > like sparse mmap capabilities in the user REGION_INFO ioctl when such > > things are not part of the mediated device API? Seems like the driver > > should just return a buffer. > > > > If not pci_region_info, can use vfio_region_info here, even to fetch > sparce mmap capabilities from vendor driver? Sure, you can use vfio_region_info, then it's just a pointer to a buffer allocated by the callee and the mediated core is just a passthrough, which is probably how it should be. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/21/2016 12:31 AM, Kirti Wankhede wrote: > Design for Mediated Device Driver: > Main purpose of this driver is to provide a common interface for mediated > device management that can be used by differnt drivers of different > devices. > > This module provides a generic interface to create the device, add it to > mediated bus, add device to IOMMU group and then add it to vfio group. > > Below is the high Level block diagram, with Nvidia, Intel and IBM devices > as example, since these are the devices which are going to actively use > this module as of now. > > +---------------+ > | | > | +-----------+ | mdev_register_driver() +--------------+ > | | | +<------------------------+ __init() | > | | | | | | > | | mdev | +------------------------>+ |<-> VFIO user > | | bus | | probe()/remove() | vfio_mpci.ko | APIs > | | driver | | | | > | | | | +--------------+ > | | | | mdev_register_driver() +--------------+ > | | | +<------------------------+ __init() | > | | | | | | > | | | +------------------------>+ |<-> VFIO user > | +-----------+ | probe()/remove() | vfio_mccw.ko | APIs > | | | | > | MDEV CORE | +--------------+ > | MODULE | > | mdev.ko | > | +-----------+ | mdev_register_device() +--------------+ > | | | +<------------------------+ | > | | | | | nvidia.ko |<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | | Physical | | > | | device | | mdev_register_device() +--------------+ > | | interface | |<------------------------+ | > | | | | | i915.ko |<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | | | | > | | | | mdev_register_device() +--------------+ > | | | +<------------------------+ | > | | | | | ccw_device.ko|<-> physical > | | | +------------------------>+ | device > | | | | callback +--------------+ > | +-----------+ | > +---------------+ > > Core driver provides two types of registration interfaces: > 1. Registration interface for mediated bus driver: > > /** > * struct mdev_driver - Mediated device's driver > * @name: driver name > * @probe: called when new device created > * @remove:called when device removed > * @match: called when new device or driver is added for this bus. > Return 1 if given device can be handled by given driver and > zero otherwise. > * @driver:device driver structure > * > **/ > struct mdev_driver { > const char *name; > int (*probe) (struct device *dev); > void (*remove) (struct device *dev); > int (*match)(struct device *dev); > struct device_driver driver; > }; > > int mdev_register_driver(struct mdev_driver *drv, struct module *owner); > void mdev_unregister_driver(struct mdev_driver *drv); > > Mediated device's driver for mdev should use this interface to register > with Core driver. With this, mediated devices driver for such devices is > responsible to add mediated device to VFIO group. > > 2. Physical device driver interface > This interface provides vendor driver the set APIs to manage physical > device related work in their own driver. APIs are : > - supported_config: provide supported configuration list by the vendor > driver > - create: to allocate basic resources in vendor driver for a mediated > device. > - destroy: to free resources in vendor driver when mediated device is > destroyed. > - start: to initiate mediated device initialization process from vendor > driver when VM boots and before QEMU starts. > - shutdown: to teardown mediated device resources during VM teardown. > - read : read emulation callback. > - write: write emulation callback. > - set_irqs: send interrupt configuration information that QEMU sets. > - get_region_info: to provide region size and its flags for the mediated > device. > - validate_map_request: to validate remap pfn request. > > This registration interface should be used by vendor drivers to register > each physical device to mdev core driver. > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Signed-off-by: Neo Jia <cjia@nvidia.com> > Change-Id: I73a5084574270b14541c529461ea2f03c292d510 > --- > drivers/vfio/Kconfig | 1 + > drivers/vfio/Makefile | 1 + > drivers/vfio/mdev/Kconfig | 11 + > drivers/vfio/mdev/Makefile | 5 + > drivers/vfio/mdev/mdev_core.c | 595 +++++++++++++++++++++++++++++++++++++++ > drivers/vfio/mdev/mdev_driver.c | 138 +++++++++ > drivers/vfio/mdev/mdev_private.h | 33 +++ > drivers/vfio/mdev/mdev_sysfs.c | 300 ++++++++++++++++++++ > include/linux/mdev.h | 232 +++++++++++++++ > 9 files changed, 1316 insertions(+) > create mode 100644 drivers/vfio/mdev/Kconfig > create mode 100644 drivers/vfio/mdev/Makefile > create mode 100644 drivers/vfio/mdev/mdev_core.c > create mode 100644 drivers/vfio/mdev/mdev_driver.c > create mode 100644 drivers/vfio/mdev/mdev_private.h > create mode 100644 drivers/vfio/mdev/mdev_sysfs.c > create mode 100644 include/linux/mdev.h > > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig > index da6e2ce77495..23eced02aaf6 100644 > --- a/drivers/vfio/Kconfig > +++ b/drivers/vfio/Kconfig > @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU > > source "drivers/vfio/pci/Kconfig" > source "drivers/vfio/platform/Kconfig" > +source "drivers/vfio/mdev/Kconfig" > source "virt/lib/Kconfig" > diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile > index 7b8a31f63fea..7c70753e54ab 100644 > --- a/drivers/vfio/Makefile > +++ b/drivers/vfio/Makefile > @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o > obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o > obj-$(CONFIG_VFIO_PCI) += pci/ > obj-$(CONFIG_VFIO_PLATFORM) += platform/ > +obj-$(CONFIG_MDEV) += mdev/ > diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig > new file mode 100644 > index 000000000000..951e2bb06a3f > --- /dev/null > +++ b/drivers/vfio/mdev/Kconfig > @@ -0,0 +1,11 @@ > + > +config MDEV > + tristate "Mediated device driver framework" > + depends on VFIO > + default n > + help > + MDEV provides a framework to virtualize device without SR-IOV cap > + See Documentation/mdev.txt for more details. > + > + If you don't know what do here, say N. > + > diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile > new file mode 100644 > index 000000000000..2c6d11f7bc24 > --- /dev/null > +++ b/drivers/vfio/mdev/Makefile > @@ -0,0 +1,5 @@ > + > +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o > + > +obj-$(CONFIG_MDEV) += mdev.o > + > diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c > new file mode 100644 > index 000000000000..3c45ed2ae1e9 > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_core.c > @@ -0,0 +1,595 @@ > +/* > + * Mediated device Core Driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/device.h> > +#include <linux/kernel.h> > +#include <linux/fs.h> > +#include <linux/slab.h> > +#include <linux/cdev.h> > +#include <linux/sched.h> > +#include <linux/uuid.h> > +#include <linux/vfio.h> > +#include <linux/iommu.h> > +#include <linux/sysfs.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "NVIDIA Corporation" > +#define DRIVER_DESC "Mediated device Core Driver" > + > +#define MDEV_CLASS_NAME "mdev" > + > +static struct devices_list { > + struct list_head dev_list; > + struct mutex list_lock; > +} parent_devices; > + > +static int mdev_add_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + return sysfs_create_groups(&dev->kobj, groups); > +} > + > +static void mdev_remove_attribute_group(struct device *dev, > + const struct attribute_group **groups) > +{ > + sysfs_remove_groups(&dev->kobj, groups); > +} > + better use device_add_groups() / device_remove_groups() instead? > +static struct mdev_device *find_mdev_device(struct parent_device *parent, > + uuid_le uuid, int instance) > +{ > + struct mdev_device *mdev = NULL, *p; > + > + list_for_each_entry(p, &parent->mdev_list, next) { > + if ((uuid_le_cmp(p->uuid, uuid) == 0) && > + (p->instance == instance)) { > + mdev = p; > + break; > + } > + } > + return mdev; > +} > + > +/* Should be called holding parent_devices.list_lock */ > +static struct parent_device *find_parent_device(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + WARN_ON(!mutex_is_locked(&parent_devices.list_lock)); This is a static function, do we really need this assert? > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = p; > + break; > + } > + } > + return parent; > +} > + > +static void mdev_release_parent(struct kref *kref) > +{ > + struct parent_device *parent = container_of(kref, struct parent_device, > + ref); > + kfree(parent); > +} > + > +static > +inline struct parent_device *mdev_get_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_get(&parent->ref); > + > + return parent; > +} > + > +static inline void mdev_put_parent(struct parent_device *parent) > +{ > + if (parent) > + kref_put(&parent->ref, mdev_release_parent); > +} > + > +static struct parent_device *mdev_get_parent_by_dev(struct device *dev) > +{ > + struct parent_device *parent = NULL, *p; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(p, &parent_devices.dev_list, next) { > + if (p->dev == dev) { > + parent = mdev_get_parent(p); > + break; > + } > + } Can directly call find_parent_device(). > + mutex_unlock(&parent_devices.list_lock); > + return parent; > +} > + > +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) > +{ > + struct parent_device *parent = mdev->parent; > + int ret; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->create) { > + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, > + mdev->instance, mdev_params); I think it is better if we pass @mdev to this callback, then the parent driver can do its specified operations and associate it with the instance, e.g, via mdev->private. > + if (ret) > + goto create_ops_err; > + } > + > + ret = mdev_add_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); > +create_ops_err: > + mutex_unlock(&parent->ops_lock); > + return ret; > +} > + > +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) > +{ > + struct parent_device *parent = mdev->parent; > + int ret = 0; > + > + /* > + * If vendor driver doesn't return success that means vendor > + * driver doesn't support hot-unplug > + */ > + mutex_lock(&parent->ops_lock); > + if (parent->ops->destroy) { > + ret = parent->ops->destroy(parent->dev, mdev->uuid, > + mdev->instance); > + if (ret && !force) { > + ret = -EBUSY; > + goto destroy_ops_err; > + } > + } > + mdev_remove_attribute_group(&mdev->dev, > + parent->ops->mdev_attr_groups); > +destroy_ops_err: > + mutex_unlock(&parent->ops_lock); > + > + return ret; > +} > + > +static void mdev_release_device(struct kref *kref) > +{ > + struct mdev_device *mdev = container_of(kref, struct mdev_device, ref); > + struct parent_device *parent = mdev->parent; > + > + device_unregister(&mdev->dev); > + wake_up(&parent->release_done); > + mdev_put_parent(parent); > +} > + > +struct mdev_device *mdev_get_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_get(&mdev->ref); > + > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device); > + > +void mdev_put_device(struct mdev_device *mdev) > +{ > + if (mdev) > + kref_put(&mdev->ref, mdev_release_device); > +} > +EXPORT_SYMBOL(mdev_put_device); > + > +/* > + * Find first mediated device from given uuid and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (uuid_le_cmp(p->uuid, uuid) == 0) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > + > +/* > + * Find mediated device from given iommu_group and increment refcount of > + * mediated device. Caller should call mdev_put_device() when the use of > + * mdev_device is done. > + */ > +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group) > +{ > + struct mdev_device *mdev = NULL, *p; > + struct parent_device *parent; > + > + mutex_lock(&parent_devices.list_lock); > + list_for_each_entry(parent, &parent_devices.dev_list, next) { > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry(p, &parent->mdev_list, next) { > + if (!p->group) > + continue; > + > + if (iommu_group_id(p->group) == iommu_group_id(group)) { > + mdev = mdev_get_device(p); > + break; > + } > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + if (mdev) > + break; > + } > + mutex_unlock(&parent_devices.list_lock); > + return mdev; > +} > +EXPORT_SYMBOL(mdev_get_device_by_group); > + > +/* > + * mdev_register_device : Register a device > + * @dev: device structure representing parent device. > + * @ops: Parent device operation structure to be registered. > + * > + * Add device to list of registered parent devices. > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_device(struct device *dev, const struct parent_ops *ops) > +{ > + int ret = 0; > + struct parent_device *parent; > + > + if (!dev || !ops) > + return -EINVAL; > + > + mutex_lock(&parent_devices.list_lock); > + > + /* Check for duplicate */ > + parent = find_parent_device(dev); > + if (parent) { > + ret = -EEXIST; > + goto add_dev_err; > + } > + > + parent = kzalloc(sizeof(*parent), GFP_KERNEL); > + if (!parent) { > + ret = -ENOMEM; > + goto add_dev_err; > + } > + > + kref_init(&parent->ref); > + list_add(&parent->next, &parent_devices.dev_list); > + mutex_unlock(&parent_devices.list_lock); It is not safe as Alex's already pointed it out. > + > + parent->dev = dev; > + parent->ops = ops; > + mutex_init(&parent->ops_lock); > + mutex_init(&parent->mdev_list_lock); > + INIT_LIST_HEAD(&parent->mdev_list); > + init_waitqueue_head(&parent->release_done); And no lock to protect these operations. > + > + ret = mdev_create_sysfs_files(dev); > + if (ret) > + goto add_sysfs_error; > + > + ret = mdev_add_attribute_group(dev, ops->dev_attr_groups); > + if (ret) > + goto add_group_error; > + > + dev_info(dev, "MDEV: Registered\n"); > + return 0; > + > +add_group_error: > + mdev_remove_sysfs_files(dev); > +add_sysfs_error: > + mutex_lock(&parent_devices.list_lock); > + list_del(&parent->next); > + mutex_unlock(&parent_devices.list_lock); > + mdev_put_parent(parent); > + return ret; > + > +add_dev_err: > + mutex_unlock(&parent_devices.list_lock); > + return ret; > +} > +EXPORT_SYMBOL(mdev_register_device); > + > +/* > + * mdev_unregister_device : Unregister a parent device > + * @dev: device structure representing parent device. > + * > + * Remove device from list of registered parent devices. Give a chance to free > + * existing mediated devices for given device. > + */ > + > +void mdev_unregister_device(struct device *dev) > +{ > + struct parent_device *parent; > + struct mdev_device *mdev, *n; > + int ret; > + > + mutex_lock(&parent_devices.list_lock); > + parent = find_parent_device(dev); > + > + if (!parent) { > + mutex_unlock(&parent_devices.list_lock); > + return; > + } > + dev_info(dev, "MDEV: Unregistering\n"); > + > + /* > + * Remove parent from the list and remove create and destroy sysfs > + * files so that no new mediated device could be created for this parent > + */ > + list_del(&parent->next); > + mdev_remove_sysfs_files(dev); > + mutex_unlock(&parent_devices.list_lock); > + find_parent_device() does not increase the refcount of the parent-device, after releasing the lock, is it still safe to use the device? > + mutex_lock(&parent->ops_lock); > + mdev_remove_attribute_group(dev, > + parent->ops->dev_attr_groups); Why mdev_remove_sysfs_files() and mdev_remove_attribute_group() are protected by different locks? > + mutex_unlock(&parent->ops_lock); > + > + mutex_lock(&parent->mdev_list_lock); > + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { > + mdev_device_destroy_ops(mdev, true); > + list_del(&mdev->next); > + mdev_put_device(mdev); > + } > + mutex_unlock(&parent->mdev_list_lock); > + > + do { > + ret = wait_event_interruptible_timeout(parent->release_done, > + list_empty(&parent->mdev_list), HZ * 10); > + if (ret == -ERESTARTSYS) { > + dev_warn(dev, "Mediated devices are in use, task" > + " \"%s\" (%d) " > + "blocked until all are released", > + current->comm, task_pid_nr(current)); > + } > + } while (ret <= 0); > + > + mdev_put_parent(parent); > +} > +EXPORT_SYMBOL(mdev_unregister_device); > + > +/* > + * Functions required for mdev-sysfs > + */ > +static void mdev_device_release(struct device *dev) > +{ > + struct mdev_device *mdev = to_mdev_device(dev); > + > + dev_dbg(&mdev->dev, "MDEV: destroying\n"); > + kfree(mdev); > +} > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params) > +{ > + int ret; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) > + return -EINVAL; > + > + /* Check for duplicate */ > + mdev = find_mdev_device(parent, uuid, instance); > + if (mdev) { > + ret = -EEXIST; > + goto create_err; > + } > + > + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); > + if (!mdev) { > + ret = -ENOMEM; > + goto create_err; > + } > + > + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); > + mdev->instance = instance; > + mdev->parent = parent; > + mutex_init(&mdev->ops_lock); > + kref_init(&mdev->ref); > + > + mdev->dev.parent = dev; > + mdev->dev.bus = &mdev_bus_type; > + mdev->dev.release = mdev_device_release; > + dev_set_name(&mdev->dev, "%pUb-%d", uuid.b, instance); > + > + ret = device_register(&mdev->dev); > + if (ret) { > + put_device(&mdev->dev); > + goto create_err; > + } > + > + ret = mdev_device_create_ops(mdev, mdev_params); > + if (ret) > + goto create_failed; > + > + mutex_lock(&parent->mdev_list_lock); > + list_add(&mdev->next, &parent->mdev_list); > + mutex_unlock(&parent->mdev_list_lock); > + > + dev_dbg(&mdev->dev, "MDEV: created\n"); > + > + return ret; > + > +create_failed: > + device_unregister(&mdev->dev); > + > +create_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance) > +{ > + struct mdev_device *mdev; > + struct parent_device *parent; > + int ret; > + > + parent = mdev_get_parent_by_dev(dev); > + if (!parent) { > + ret = -EINVAL; > + goto destroy_err; > + } > + > + mdev = find_mdev_device(parent, uuid, instance); > + if (!mdev) { > + ret = -EINVAL; > + goto destroy_err; > + } > + > + ret = mdev_device_destroy_ops(mdev, false); > + if (ret) > + goto destroy_err; find_mdev_device() does not hold the refcount of mdev, is it safe? > + > + mdev_put_parent(parent); > + The refcount of parent-device is released, you can not continue to use it. > + mutex_lock(&parent->mdev_list_lock); > + list_del(&mdev->next); > + mutex_unlock(&parent->mdev_list_lock); > + > + mdev_put_device(mdev); > + return ret; > + > +destroy_err: > + mdev_put_parent(parent); > + return ret; > +} > + > +void mdev_device_supported_config(struct device *dev, char *str) > +{ > + struct parent_device *parent; > + > + parent = mdev_get_parent_by_dev(dev); > + > + if (parent) { > + mutex_lock(&parent->ops_lock); > + if (parent->ops->supported_config) > + parent->ops->supported_config(parent->dev, str); > + mutex_unlock(&parent->ops_lock); > + mdev_put_parent(parent); > + } > +} > + > +int mdev_device_start(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->start) > + ret = parent->ops->start(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_start failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE); > + > + mdev_put_device(mdev); > + > + return ret; > +} > + > +int mdev_device_shutdown(uuid_le uuid) > +{ > + int ret = 0; > + struct mdev_device *mdev; > + struct parent_device *parent; > + > + mdev = mdev_get_first_device_by_uuid(uuid); > + if (!mdev) > + return -EINVAL; > + > + parent = mdev->parent; > + > + mutex_lock(&parent->ops_lock); > + if (parent->ops->shutdown) > + ret = parent->ops->shutdown(mdev->uuid); > + mutex_unlock(&parent->ops_lock); > + > + if (ret) > + pr_err("mdev_shutdown failed %d\n", ret); > + else > + kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE); > + > + mdev_put_device(mdev); > + return ret; > +} > + > +static struct class mdev_class = { > + .name = MDEV_CLASS_NAME, > + .owner = THIS_MODULE, > + .class_attrs = mdev_class_attrs, These interfaces, start and shutdown, are based on UUID, how about if we want to operate on the specified instance? > +}; > + > +static int __init mdev_init(void) > +{ > + int ret; > + > + mutex_init(&parent_devices.list_lock); > + INIT_LIST_HEAD(&parent_devices.dev_list); > + > + ret = class_register(&mdev_class); > + if (ret) { > + pr_err("Failed to register mdev class\n"); > + return ret; > + } > + > + ret = mdev_bus_register(); > + if (ret) { > + pr_err("Failed to register mdev bus\n"); > + class_unregister(&mdev_class); > + return ret; > + } > + > + return ret; > +} > + > +static void __exit mdev_exit(void) > +{ > + mdev_bus_unregister(); > + class_unregister(&mdev_class); > +} Hmm, how to prevent if there are parent-devices existing when the module is being unloaded? > + > +module_init(mdev_init) > +module_exit(mdev_exit) > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c > new file mode 100644 > index 000000000000..f1aed541111d > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_driver.c > @@ -0,0 +1,138 @@ > +/* > + * MDEV driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/device.h> > +#include <linux/iommu.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +static int mdev_attach_iommu(struct mdev_device *mdev) > +{ > + int ret; > + struct iommu_group *group; > + > + group = iommu_group_alloc(); > + if (IS_ERR(group)) { > + dev_err(&mdev->dev, "MDEV: failed to allocate group!\n"); > + return PTR_ERR(group); > + } > + > + ret = iommu_group_add_device(group, &mdev->dev); > + if (ret) { > + dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n"); > + goto attach_fail; > + } > + > + mdev->group = group; > + > + dev_info(&mdev->dev, "MDEV: group_id = %d\n", > + iommu_group_id(group)); > +attach_fail: > + iommu_group_put(group); > + return ret; > +} > + > +static void mdev_detach_iommu(struct mdev_device *mdev) > +{ > + iommu_group_remove_device(&mdev->dev); > + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); > +} > + > +static int mdev_probe(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + int ret; > + > + ret = mdev_attach_iommu(mdev); > + if (ret) { > + dev_err(dev, "Failed to attach IOMMU\n"); > + return ret; > + } > + > + if (drv && drv->probe) > + ret = drv->probe(dev); If probe failed, need to deattache mdev? > + > + return ret; > +} > + > +static int mdev_remove(struct device *dev) > +{ > + struct mdev_driver *drv = to_mdev_driver(dev->driver); > + struct mdev_device *mdev = to_mdev_device(dev); > + > + if (drv && drv->remove) > + drv->remove(dev); > + > + mdev_detach_iommu(mdev); > + > + return 0; > +} > + > +static int mdev_match(struct device *dev, struct device_driver *drv) > +{ > + struct mdev_driver *mdrv = to_mdev_driver(drv); > + > + if (mdrv && mdrv->match) > + return mdrv->match(dev); > + > + return 0; > +} > + > +struct bus_type mdev_bus_type = { > + .name = "mdev", > + .match = mdev_match, > + .probe = mdev_probe, > + .remove = mdev_remove, > +}; > +EXPORT_SYMBOL_GPL(mdev_bus_type); > + > +/* > + * mdev_register_driver - register a new MDEV driver > + * @drv: the driver to register > + * @owner: module owner of driver to be registered > + * > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) > +{ > + /* initialize common driver fields */ > + drv->driver.name = drv->name; > + drv->driver.bus = &mdev_bus_type; > + drv->driver.owner = owner; > + > + /* register with core */ > + return driver_register(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_register_driver); > + > +/* > + * mdev_unregister_driver - unregister MDEV driver > + * @drv: the driver to unregister > + * > + */ > +void mdev_unregister_driver(struct mdev_driver *drv) > +{ > + driver_unregister(&drv->driver); > +} > +EXPORT_SYMBOL(mdev_unregister_driver); > + > +int mdev_bus_register(void) > +{ > + return bus_register(&mdev_bus_type); > +} > + > +void mdev_bus_unregister(void) > +{ > + bus_unregister(&mdev_bus_type); > +} > diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h > new file mode 100644 > index 000000000000..991d7f796169 > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_private.h > @@ -0,0 +1,33 @@ > +/* > + * Mediated device interal definitions > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef MDEV_PRIVATE_H > +#define MDEV_PRIVATE_H > + > +int mdev_bus_register(void); > +void mdev_bus_unregister(void); > + > +/* Function prototypes for mdev_sysfs */ > + > +extern struct class_attribute mdev_class_attrs[]; > + > +int mdev_create_sysfs_files(struct device *dev); > +void mdev_remove_sysfs_files(struct device *dev); > + > +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, > + char *mdev_params); > +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); > +void mdev_device_supported_config(struct device *dev, char *str); > +int mdev_device_start(uuid_le uuid); > +int mdev_device_shutdown(uuid_le uuid); > + > +#endif /* MDEV_PRIVATE_H */ > diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c > new file mode 100644 > index 000000000000..48b66e40009e > --- /dev/null > +++ b/drivers/vfio/mdev/mdev_sysfs.c > @@ -0,0 +1,300 @@ > +/* > + * File attributes for Mediated devices > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/sysfs.h> > +#include <linux/ctype.h> > +#include <linux/device.h> > +#include <linux/slab.h> > +#include <linux/uuid.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +/* Prototypes */ > +static ssize_t mdev_supported_types_show(struct device *dev, > + struct device_attribute *attr, > + char *buf); > +static DEVICE_ATTR_RO(mdev_supported_types); > + > +static ssize_t mdev_create_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_create); > + > +static ssize_t mdev_destroy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count); > +static DEVICE_ATTR_WO(mdev_destroy); > + > +/* Static functions */ > + > +#define UUID_CHAR_LENGTH 36 > +#define UUID_BYTE_LENGTH 16 > + > +#define SUPPORTED_TYPE_BUFFER_LENGTH 1024 > + > +static inline bool is_uuid_sep(char sep) > +{ > + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') > + return true; > + return false; > +} > + > +static int uuid_parse(const char *str, uuid_le *uuid) > +{ > + int i; > + > + if (strlen(str) < UUID_CHAR_LENGTH) > + return -EINVAL; > + > + for (i = 0; i < UUID_BYTE_LENGTH; i++) { > + if (!isxdigit(str[0]) || !isxdigit(str[1])) { > + pr_err("%s err", __func__); > + return -EINVAL; > + } > + > + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); > + str += 2; > + if (is_uuid_sep(*str)) > + str++; > + } > + > + return 0; > +} > + Can we use uuid_le_to_bin()? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/29/2016 09:51 PM, Xiao Guangrong wrote: > On 06/21/2016 12:31 AM, Kirti Wankhede wrote: >> + mutex_unlock(&parent_devices.list_lock); >> + return parent; >> +} >> + >> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) >> +{ >> + struct parent_device *parent = mdev->parent; >> + int ret; >> + >> + mutex_lock(&parent->ops_lock); >> + if (parent->ops->create) { >> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, >> + mdev->instance, mdev_params); > > I think it is better if we pass @mdev to this callback, then the parent driver > can do its specified operations and associate it with the instance, > e.g, via mdev->private. > Just noticed that mdev->driver_data is missing in v5, I'd like to have it back :) Yes either mdev need to be passed to parent driver (preferred), or find_mdev_device to be exported for parent driver (less preferred, but at least functional). -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/25/2016 1:10 AM, Alex Williamson wrote: > On Fri, 24 Jun 2016 23:24:58 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> Alex, >> >> Thanks for taking closer look. I'll incorporate all the nits you suggested. >> >> On 6/22/2016 3:00 AM, Alex Williamson wrote: >>> On Mon, 20 Jun 2016 22:01:46 +0530 >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>> ... >>>> +create_ops_err: >>>> + mutex_unlock(&parent->ops_lock); >>> >>> It seems like ops_lock isn't used so much as a lock as a serialization >>> mechanism. Why? Where is this serialization per parent device >>> documented? >>> >> >> parent->ops_lock is to serialize parent device callbacks to vendor >> driver, i.e supported_config(), create() and destroy(). >> mdev->ops_lock is to serialize mediated device related callbacks to >> vendor driver, i.e. start(), stop(), read(), write(), set_irqs(), >> get_region_info(), validate_map_request(). >> Its not documented, I'll add comments to mdev.h about these locks. > > Should it be the mediated driver core's responsibility to do this? If > a given mediated driver wants to serialize on their own, they can do > that, but I don't see why we would impose that on every mediated driver. > Ok. Removing these locks from here, so it would be mediated driver responsibility to serialize if they need to. >> >>>> + >>>> +struct pci_region_info { >>>> + uint64_t start; >>>> + uint64_t size; >>>> + uint32_t flags; /* VFIO region info flags */ >>>> +}; >>>> + >>>> +enum mdev_emul_space { >>>> + EMUL_CONFIG_SPACE, /* PCI configuration space */ >>>> + EMUL_IO, /* I/O register space */ >>>> + EMUL_MMIO /* Memory-mapped I/O space */ >>>> +}; >>> >>> >>> I'm still confused why this is needed, perhaps a description here would >>> be useful so I can stop asking. Clearly config space is PCI only, so >>> it's strange to have it in the common code. Everyone not on x86 will >>> say I/O space is also strange. I can't keep it in my head why the >>> read/write offsets aren't sufficient for the driver to figure out what >>> type it is. >>> >>> >> >> Now that VFIO_PCI_OFFSET_* macros are moved to vfio.h which vendor >> driver can also use, above enum could be removed from read/write. But >> again these macros are useful when parent device is PCI device. How >> would non-pci parent device differentiate IO ports and MMIO? > > Moving VFIO_PCI_OFFSET_* to vfio.h already worries me, the vfio api > does not impose fixed offsets, it's simply an implementation detail of > vfio-pci. We should be free to change that whenever we want and not > break userspace. By moving it to vfio.h and potentially having > external mediated drivers depend on those offset macros, they now become > part of the kABI. So more and more, I'd prefer that reads/writes/mmaps > get passed directly to the mediated driver, let them define which > offset is which, the core is just a passthrough. For non-PCI devices, > like platform devices, the indexes are implementation specific, the > user really needs to know how to work with the specific device and how > it defines device mmio to region indexes. > Ok. With this vfio_mpci looks simple. Thanks, Kirti. >>>> + int (*get_region_info)(struct mdev_device *vdev, int region_index, >>>> + struct pci_region_info *region_info); >>> >>> This can't be //pci_//region_info. How do you intend to support things >>> like sparse mmap capabilities in the user REGION_INFO ioctl when such >>> things are not part of the mediated device API? Seems like the driver >>> should just return a buffer. >>> >> >> If not pci_region_info, can use vfio_region_info here, even to fetch >> sparce mmap capabilities from vendor driver? > > Sure, you can use vfio_region_info, then it's just a pointer to a > buffer allocated by the callee and the mediated core is just a > passthrough, which is probably how it should be. Thanks, > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/29/2016 7:21 PM, Xiao Guangrong wrote: > > > On 06/21/2016 12:31 AM, Kirti Wankhede wrote: >> Design for Mediated Device Driver: ... >> +static int mdev_add_attribute_group(struct device *dev, >> + const struct attribute_group **groups) >> +{ >> + return sysfs_create_groups(&dev->kobj, groups); >> +} >> + >> +static void mdev_remove_attribute_group(struct device *dev, >> + const struct attribute_group **groups) >> +{ >> + sysfs_remove_groups(&dev->kobj, groups); >> +} >> + > > better use device_add_groups() / device_remove_groups() instead? > These are not exported from base module. They can't be used here. >> +} >> + >> +static int mdev_device_create_ops(struct mdev_device *mdev, char >> *mdev_params) >> +{ >> + struct parent_device *parent = mdev->parent; >> + int ret; >> + >> + mutex_lock(&parent->ops_lock); >> + if (parent->ops->create) { >> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, >> + mdev->instance, mdev_params); > > I think it is better if we pass @mdev to this callback, then the parent > driver > can do its specified operations and associate it with the instance, > e.g, via mdev->private. > Yes, actually I was also thinking of changing it to - ret = parent->ops->create(mdev->dev.parent, mdev->uuid, - mdev->instance, mdev_params); + ret = parent->ops->create(mdev, mdev_params); >> +int mdev_register_device(struct device *dev, const struct parent_ops >> *ops) >> +{ >> + int ret = 0; >> + struct parent_device *parent; >> + >> + if (!dev || !ops) >> + return -EINVAL; >> + >> + mutex_lock(&parent_devices.list_lock); >> + >> + /* Check for duplicate */ >> + parent = find_parent_device(dev); >> + if (parent) { >> + ret = -EEXIST; >> + goto add_dev_err; >> + } >> + >> + parent = kzalloc(sizeof(*parent), GFP_KERNEL); >> + if (!parent) { >> + ret = -ENOMEM; >> + goto add_dev_err; >> + } >> + >> + kref_init(&parent->ref); >> + list_add(&parent->next, &parent_devices.dev_list); >> + mutex_unlock(&parent_devices.list_lock); > > It is not safe as Alex's already pointed it out. > >> + >> + parent->dev = dev; >> + parent->ops = ops; >> + mutex_init(&parent->ops_lock); >> + mutex_init(&parent->mdev_list_lock); >> + INIT_LIST_HEAD(&parent->mdev_list); >> + init_waitqueue_head(&parent->release_done); > > And no lock to protect these operations. > As I replied to Alex also, yes I'm fixing it. >> +void mdev_unregister_device(struct device *dev) >> +{ >> + struct parent_device *parent; >> + struct mdev_device *mdev, *n; >> + int ret; >> + >> + mutex_lock(&parent_devices.list_lock); >> + parent = find_parent_device(dev); >> + >> + if (!parent) { >> + mutex_unlock(&parent_devices.list_lock); >> + return; >> + } >> + dev_info(dev, "MDEV: Unregistering\n"); >> + >> + /* >> + * Remove parent from the list and remove create and destroy sysfs >> + * files so that no new mediated device could be created for this >> parent >> + */ >> + list_del(&parent->next); >> + mdev_remove_sysfs_files(dev); >> + mutex_unlock(&parent_devices.list_lock); >> + > > find_parent_device() does not increase the refcount of the parent-device, > after releasing the lock, is it still safe to use the device? > Yes. In mdev_register_device(), kref_init() initialises refcount to 1 and then when mdev child is created refcount is incremented and on child mdev destroys refcount is decremented. So when all child mdev are destroyed, refcount will still be 1 until mdev_unregister_device() is called. So when no mdev device is created, mdev_register_device() hold parent's refcount and released from mdev_unregister_device(). >> + mutex_lock(&parent->ops_lock); >> + mdev_remove_attribute_group(dev, >> + parent->ops->dev_attr_groups); > > Why mdev_remove_sysfs_files() and mdev_remove_attribute_group() > are protected by different locks? > As mentioned in reply to Alex on another thread, removing these locks. >> + >> +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t >> instance) >> +{ >> + struct mdev_device *mdev; >> + struct parent_device *parent; >> + int ret; >> + >> + parent = mdev_get_parent_by_dev(dev); >> + if (!parent) { >> + ret = -EINVAL; >> + goto destroy_err; >> + } >> + >> + mdev = find_mdev_device(parent, uuid, instance); >> + if (!mdev) { >> + ret = -EINVAL; >> + goto destroy_err; >> + } >> + >> + ret = mdev_device_destroy_ops(mdev, false); >> + if (ret) >> + goto destroy_err; > > find_mdev_device() does not hold the refcount of mdev, is it safe? > Yes, this function is just to check duplicate entry or existence of mdev device. >> + >> + mdev_put_parent(parent); >> + > > The refcount of parent-device is released, you can not continue to > use it. > Removing these locks. >> + mutex_lock(&parent->mdev_list_lock); >> + list_del(&mdev->next); >> + mutex_unlock(&parent->mdev_list_lock); >> + >> + mdev_put_device(mdev); >> + return ret; >> + >> +destroy_err: >> + mdev_put_parent(parent); >> + return ret; >> +} >> + >> + >> +static struct class mdev_class = { >> + .name = MDEV_CLASS_NAME, >> + .owner = THIS_MODULE, >> + .class_attrs = mdev_class_attrs, > > These interfaces, start and shutdown, are based on UUID, how > about if we want to operate on the specified instance? > Do you mean hot-plug a device? >> +}; >> + >> +static int __init mdev_init(void) >> +{ >> + int ret; >> + >> + mutex_init(&parent_devices.list_lock); >> + INIT_LIST_HEAD(&parent_devices.dev_list); >> + >> + ret = class_register(&mdev_class); >> + if (ret) { >> + pr_err("Failed to register mdev class\n"); >> + return ret; >> + } >> + >> + ret = mdev_bus_register(); >> + if (ret) { >> + pr_err("Failed to register mdev bus\n"); >> + class_unregister(&mdev_class); >> + return ret; >> + } >> + >> + return ret; >> +} >> + >> +static void __exit mdev_exit(void) >> +{ >> + mdev_bus_unregister(); >> + class_unregister(&mdev_class); >> +} > > Hmm, how to prevent if there are parent-devices existing > when the module is being unloaded? > If parent device exits that means that other module is using mdev_register_device()/mdev_unregister_device() from their module. 'rmmod mdev' would fail until that module is unloaded. # rmmod mdev rmmod: ERROR: Module mdev is in use by: nvidia >> + >> +static int uuid_parse(const char *str, uuid_le *uuid) >> +{ >> + int i; >> + >> + if (strlen(str) < UUID_CHAR_LENGTH) >> + return -EINVAL; >> + >> + for (i = 0; i < UUID_BYTE_LENGTH; i++) { >> + if (!isxdigit(str[0]) || !isxdigit(str[1])) { >> + pr_err("%s err", __func__); >> + return -EINVAL; >> + } >> + >> + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); >> + str += 2; >> + if (is_uuid_sep(*str)) >> + str++; >> + } >> + >> + return 0; >> +} >> + > > Can we use uuid_le_to_bin()? I couldn't find this in kernel? Thanks, Kirti -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 6/30/2016 12:42 PM, Jike Song wrote: > On 06/29/2016 09:51 PM, Xiao Guangrong wrote: >> On 06/21/2016 12:31 AM, Kirti Wankhede wrote: >>> + mutex_unlock(&parent_devices.list_lock); >>> + return parent; >>> +} >>> + >>> +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) >>> +{ >>> + struct parent_device *parent = mdev->parent; >>> + int ret; >>> + >>> + mutex_lock(&parent->ops_lock); >>> + if (parent->ops->create) { >>> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, >>> + mdev->instance, mdev_params); >> >> I think it is better if we pass @mdev to this callback, then the parent driver >> can do its specified operations and associate it with the instance, >> e.g, via mdev->private. >> > > Just noticed that mdev->driver_data is missing in v5, I'd like to have it back :) > Actually, I added mdev_get_drvdata() and mdev_set_drvdata() but I missed earlier that mdev->dev->driver_data is used by vfio module to keep reference of vfio_device. So adding driver_data to struct mdev_device again and updating mdev_get_drvdata() and mdev_set_drvdata() as below. static inline void *mdev_get_drvdata(struct mdev_device *mdev) { - return dev_get_drvdata(&mdev->dev); + return mdev->driver_data; } static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) { - dev_set_drvdata(&mdev->dev, data); + mdev->driver_data = data; } > Yes either mdev need to be passed to parent driver (preferred), or find_mdev_device to > be exported for parent driver (less preferred, but at least functional). > Updating argument to create to have mdev. Thanks, Kirti. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/21/2016 12:31 AM, Kirti Wankhede wrote: > +/* > + * mdev_register_device : Register a device > + * @dev: device structure representing parent device. > + * @ops: Parent device operation structure to be registered. > + * > + * Add device to list of registered parent devices. > + * Returns a negative value on error, otherwise 0. > + */ > +int mdev_register_device(struct device *dev, const struct parent_ops *ops) > +{ > + int ret = 0; > + struct parent_device *parent; > + > + if (!dev || !ops) > + return -EINVAL; > + > + mutex_lock(&parent_devices.list_lock); > + > + /* Check for duplicate */ > + parent = find_parent_device(dev); > + if (parent) { > + ret = -EEXIST; > + goto add_dev_err; > + } > + > + parent = kzalloc(sizeof(*parent), GFP_KERNEL); > + if (!parent) { > + ret = -ENOMEM; > + goto add_dev_err; > + } > + > + kref_init(&parent->ref); > + list_add(&parent->next, &parent_devices.dev_list); > + mutex_unlock(&parent_devices.list_lock); > + > + parent->dev = dev; > + parent->ops = ops; > + mutex_init(&parent->ops_lock); > + mutex_init(&parent->mdev_list_lock); > + INIT_LIST_HEAD(&parent->mdev_list); > + init_waitqueue_head(&parent->release_done); > + > + ret = mdev_create_sysfs_files(dev); > + if (ret) > + goto add_sysfs_error; > + > + ret = mdev_add_attribute_group(dev, ops->dev_attr_groups); > + if (ret) > + goto add_group_error; > + > + dev_info(dev, "MDEV: Registered\n"); > + return 0; > + > +add_group_error: > + mdev_remove_sysfs_files(dev); > +add_sysfs_error: > + mutex_lock(&parent_devices.list_lock); > + list_del(&parent->next); > + mutex_unlock(&parent_devices.list_lock); > + mdev_put_parent(parent); > + return ret; > + > +add_dev_err: > + mutex_unlock(&parent_devices.list_lock); > + return ret; > +} > +EXPORT_SYMBOL(mdev_register_device); > + ... > +static int __init mdev_init(void) > +{ > + int ret; > + > + mutex_init(&parent_devices.list_lock); > + INIT_LIST_HEAD(&parent_devices.dev_list); > + > + ret = class_register(&mdev_class); > + if (ret) { > + pr_err("Failed to register mdev class\n"); > + return ret; > + } > + > + ret = mdev_bus_register(); > + if (ret) { > + pr_err("Failed to register mdev bus\n"); > + class_unregister(&mdev_class); > + return ret; > + } > + > + return ret; > +} > + > +static void __exit mdev_exit(void) > +{ > + mdev_bus_unregister(); > + class_unregister(&mdev_class); > +} > + > +module_init(mdev_init) > +module_exit(mdev_exit) Hi Kirti, I have a question about the order of initialization, phy_driver calls mdev_register_device in its __init function; mdev_register_device accesses parent_devices.list_lock; parent.list_lock is initialized in __init of mdev; The __init function of both phy driver and mdev are classified with module_init, if they are selected to be 'Y' in .config, it's possible that in mdev_register_device(), the mutex is still uninitialized. The problem here I think is both mdev and phy driver are actually *drivers*, so once they are builtin, the initialization order is hard to assume. Do you have any idea to avoid this? Thanks! -- Thanks, Jike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07/01/2016 02:51 AM, Kirti Wankhede wrote: > > > On 6/29/2016 7:21 PM, Xiao Guangrong wrote: >> >> >> On 06/21/2016 12:31 AM, Kirti Wankhede wrote: >>> Design for Mediated Device Driver: > ... >>> +static int mdev_add_attribute_group(struct device *dev, >>> + const struct attribute_group **groups) >>> +{ >>> + return sysfs_create_groups(&dev->kobj, groups); >>> +} >>> + >>> +static void mdev_remove_attribute_group(struct device *dev, >>> + const struct attribute_group **groups) >>> +{ >>> + sysfs_remove_groups(&dev->kobj, groups); >>> +} >>> + >> >> better use device_add_groups() / device_remove_groups() instead? >> > > These are not exported from base module. They can't be used here. Er, i did not realize it, sorry. > > >>> +} >>> + >>> +static int mdev_device_create_ops(struct mdev_device *mdev, char >>> *mdev_params) >>> +{ >>> + struct parent_device *parent = mdev->parent; >>> + int ret; >>> + >>> + mutex_lock(&parent->ops_lock); >>> + if (parent->ops->create) { >>> + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, >>> + mdev->instance, mdev_params); >> >> I think it is better if we pass @mdev to this callback, then the parent >> driver >> can do its specified operations and associate it with the instance, >> e.g, via mdev->private. >> > > Yes, actually I was also thinking of changing it to > > - ret = parent->ops->create(mdev->dev.parent, mdev->uuid, > - mdev->instance, mdev_params); > + ret = parent->ops->create(mdev, mdev_params); > Good. :) > >>> +int mdev_register_device(struct device *dev, const struct parent_ops >>> *ops) >>> +{ >>> + int ret = 0; >>> + struct parent_device *parent; >>> + >>> + if (!dev || !ops) >>> + return -EINVAL; >>> + >>> + mutex_lock(&parent_devices.list_lock); >>> + >>> + /* Check for duplicate */ >>> + parent = find_parent_device(dev); >>> + if (parent) { >>> + ret = -EEXIST; >>> + goto add_dev_err; >>> + } >>> + >>> + parent = kzalloc(sizeof(*parent), GFP_KERNEL); >>> + if (!parent) { >>> + ret = -ENOMEM; >>> + goto add_dev_err; >>> + } >>> + >>> + kref_init(&parent->ref); >>> + list_add(&parent->next, &parent_devices.dev_list); >>> + mutex_unlock(&parent_devices.list_lock); >> >> It is not safe as Alex's already pointed it out. >> >>> + >>> + parent->dev = dev; >>> + parent->ops = ops; >>> + mutex_init(&parent->ops_lock); >>> + mutex_init(&parent->mdev_list_lock); >>> + INIT_LIST_HEAD(&parent->mdev_list); >>> + init_waitqueue_head(&parent->release_done); >> >> And no lock to protect these operations. >> > > As I replied to Alex also, yes I'm fixing it. > >>> +void mdev_unregister_device(struct device *dev) >>> +{ >>> + struct parent_device *parent; >>> + struct mdev_device *mdev, *n; >>> + int ret; >>> + >>> + mutex_lock(&parent_devices.list_lock); >>> + parent = find_parent_device(dev); >>> + >>> + if (!parent) { >>> + mutex_unlock(&parent_devices.list_lock); >>> + return; >>> + } >>> + dev_info(dev, "MDEV: Unregistering\n"); >>> + >>> + /* >>> + * Remove parent from the list and remove create and destroy sysfs >>> + * files so that no new mediated device could be created for this >>> parent >>> + */ >>> + list_del(&parent->next); >>> + mdev_remove_sysfs_files(dev); >>> + mutex_unlock(&parent_devices.list_lock); >>> + >> >> find_parent_device() does not increase the refcount of the parent-device, >> after releasing the lock, is it still safe to use the device? >> > > Yes. In mdev_register_device(), kref_init() initialises refcount to 1 > and then when mdev child is created refcount is incremented and on child > mdev destroys refcount is decremented. So when all child mdev are > destroyed, refcount will still be 1 until mdev_unregister_device() is > called. So when no mdev device is created, mdev_register_device() hold > parent's refcount and released from mdev_unregister_device(). > > >>> + mutex_lock(&parent->ops_lock); >>> + mdev_remove_attribute_group(dev, >>> + parent->ops->dev_attr_groups); >> >> Why mdev_remove_sysfs_files() and mdev_remove_attribute_group() >> are protected by different locks? >> > > As mentioned in reply to Alex on another thread, removing these locks. > >>> + >>> +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t >>> instance) >>> +{ >>> + struct mdev_device *mdev; >>> + struct parent_device *parent; >>> + int ret; >>> + >>> + parent = mdev_get_parent_by_dev(dev); >>> + if (!parent) { >>> + ret = -EINVAL; >>> + goto destroy_err; >>> + } >>> + >>> + mdev = find_mdev_device(parent, uuid, instance); >>> + if (!mdev) { >>> + ret = -EINVAL; >>> + goto destroy_err; >>> + } >>> + >>> + ret = mdev_device_destroy_ops(mdev, false); >>> + if (ret) >>> + goto destroy_err; >> >> find_mdev_device() does not hold the refcount of mdev, is it safe? >> > > Yes, this function is just to check duplicate entry or existence of mdev > device. > Now, i am getting more confused, the caller of mdev_device_destroy(), e.g, mdev_destroy_store(), does not hold any lock, however, you was walking the list of parent->mdev_list (calling find_mdev_device()), is it really safe? And how to prevent this scenario? CPU 0 CPU 1 in mdev_device_destroy(): in mdev_unregister_device(): mdev = find_mdev_device(parent, uuid, instance); // no refcount hold mdev_device_destroy_ops(mdev, true); list_del(&mdev->next); mdev_put_device(mdev); // last refcount is gone. mdev = find_mdev_device(parent, uuid, instance); !!!!!! PANIC !!!!!! >>> + >>> + mdev_put_parent(parent); >>> + >> >> The refcount of parent-device is released, you can not continue to >> use it. >> > > Removing these locks. > >>> + mutex_lock(&parent->mdev_list_lock); >>> + list_del(&mdev->next); >>> + mutex_unlock(&parent->mdev_list_lock); >>> + >>> + mdev_put_device(mdev); >>> + return ret; >>> + >>> +destroy_err: >>> + mdev_put_parent(parent); >>> + return ret; >>> +} >>> + >>> + >>> +static struct class mdev_class = { >>> + .name = MDEV_CLASS_NAME, >>> + .owner = THIS_MODULE, >>> + .class_attrs = mdev_class_attrs, >> >> These interfaces, start and shutdown, are based on UUID, how >> about if we want to operate on the specified instance? >> > > Do you mean hot-plug a device? Hot-plug and hot-unplug, yes. > >>> +}; >>> + >>> +static int __init mdev_init(void) >>> +{ >>> + int ret; >>> + >>> + mutex_init(&parent_devices.list_lock); >>> + INIT_LIST_HEAD(&parent_devices.dev_list); >>> + >>> + ret = class_register(&mdev_class); >>> + if (ret) { >>> + pr_err("Failed to register mdev class\n"); >>> + return ret; >>> + } >>> + >>> + ret = mdev_bus_register(); >>> + if (ret) { >>> + pr_err("Failed to register mdev bus\n"); >>> + class_unregister(&mdev_class); >>> + return ret; >>> + } >>> + >>> + return ret; >>> +} >>> + >>> +static void __exit mdev_exit(void) >>> +{ >>> + mdev_bus_unregister(); >>> + class_unregister(&mdev_class); >>> +} >> >> Hmm, how to prevent if there are parent-devices existing >> when the module is being unloaded? >> > > If parent device exits that means that other module is using > mdev_register_device()/mdev_unregister_device() from their module. > 'rmmod mdev' would fail until that module is unloaded. > > # rmmod mdev > rmmod: ERROR: Module mdev is in use by: nvidia > So other module need to explicitly increase the refcount of mdev.ko? > > >>> + >>> +static int uuid_parse(const char *str, uuid_le *uuid) >>> +{ >>> + int i; >>> + >>> + if (strlen(str) < UUID_CHAR_LENGTH) >>> + return -EINVAL; >>> + >>> + for (i = 0; i < UUID_BYTE_LENGTH; i++) { >>> + if (!isxdigit(str[0]) || !isxdigit(str[1])) { >>> + pr_err("%s err", __func__); >>> + return -EINVAL; >>> + } >>> + >>> + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); >>> + str += 2; >>> + if (is_uuid_sep(*str)) >>> + str++; >>> + } >>> + >>> + return 0; >>> +} >>> + >> >> Can we use uuid_le_to_bin()? > > I couldn't find this in kernel? It is in lib/uuid.c, i am using kvm tree. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..23eced02aaf6 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "drivers/vfio/mdev/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 7b8a31f63fea..7c70753e54ab 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ +obj-$(CONFIG_MDEV) += mdev/ diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig new file mode 100644 index 000000000000..951e2bb06a3f --- /dev/null +++ b/drivers/vfio/mdev/Kconfig @@ -0,0 +1,11 @@ + +config MDEV + tristate "Mediated device driver framework" + depends on VFIO + default n + help + MDEV provides a framework to virtualize device without SR-IOV cap + See Documentation/mdev.txt for more details. + + If you don't know what do here, say N. + diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile new file mode 100644 index 000000000000..2c6d11f7bc24 --- /dev/null +++ b/drivers/vfio/mdev/Makefile @@ -0,0 +1,5 @@ + +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o + +obj-$(CONFIG_MDEV) += mdev.o + diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c new file mode 100644 index 000000000000..3c45ed2ae1e9 --- /dev/null +++ b/drivers/vfio/mdev/mdev_core.c @@ -0,0 +1,595 @@ +/* + * Mediated device Core Driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/cdev.h> +#include <linux/sched.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "Mediated device Core Driver" + +#define MDEV_CLASS_NAME "mdev" + +static struct devices_list { + struct list_head dev_list; + struct mutex list_lock; +} parent_devices; + +static int mdev_add_attribute_group(struct device *dev, + const struct attribute_group **groups) +{ + return sysfs_create_groups(&dev->kobj, groups); +} + +static void mdev_remove_attribute_group(struct device *dev, + const struct attribute_group **groups) +{ + sysfs_remove_groups(&dev->kobj, groups); +} + +static struct mdev_device *find_mdev_device(struct parent_device *parent, + uuid_le uuid, int instance) +{ + struct mdev_device *mdev = NULL, *p; + + list_for_each_entry(p, &parent->mdev_list, next) { + if ((uuid_le_cmp(p->uuid, uuid) == 0) && + (p->instance == instance)) { + mdev = p; + break; + } + } + return mdev; +} + +/* Should be called holding parent_devices.list_lock */ +static struct parent_device *find_parent_device(struct device *dev) +{ + struct parent_device *parent = NULL, *p; + + WARN_ON(!mutex_is_locked(&parent_devices.list_lock)); + list_for_each_entry(p, &parent_devices.dev_list, next) { + if (p->dev == dev) { + parent = p; + break; + } + } + return parent; +} + +static void mdev_release_parent(struct kref *kref) +{ + struct parent_device *parent = container_of(kref, struct parent_device, + ref); + kfree(parent); +} + +static +inline struct parent_device *mdev_get_parent(struct parent_device *parent) +{ + if (parent) + kref_get(&parent->ref); + + return parent; +} + +static inline void mdev_put_parent(struct parent_device *parent) +{ + if (parent) + kref_put(&parent->ref, mdev_release_parent); +} + +static struct parent_device *mdev_get_parent_by_dev(struct device *dev) +{ + struct parent_device *parent = NULL, *p; + + mutex_lock(&parent_devices.list_lock); + list_for_each_entry(p, &parent_devices.dev_list, next) { + if (p->dev == dev) { + parent = mdev_get_parent(p); + break; + } + } + mutex_unlock(&parent_devices.list_lock); + return parent; +} + +static int mdev_device_create_ops(struct mdev_device *mdev, char *mdev_params) +{ + struct parent_device *parent = mdev->parent; + int ret; + + mutex_lock(&parent->ops_lock); + if (parent->ops->create) { + ret = parent->ops->create(mdev->dev.parent, mdev->uuid, + mdev->instance, mdev_params); + if (ret) + goto create_ops_err; + } + + ret = mdev_add_attribute_group(&mdev->dev, + parent->ops->mdev_attr_groups); +create_ops_err: + mutex_unlock(&parent->ops_lock); + return ret; +} + +static int mdev_device_destroy_ops(struct mdev_device *mdev, bool force) +{ + struct parent_device *parent = mdev->parent; + int ret = 0; + + /* + * If vendor driver doesn't return success that means vendor + * driver doesn't support hot-unplug + */ + mutex_lock(&parent->ops_lock); + if (parent->ops->destroy) { + ret = parent->ops->destroy(parent->dev, mdev->uuid, + mdev->instance); + if (ret && !force) { + ret = -EBUSY; + goto destroy_ops_err; + } + } + mdev_remove_attribute_group(&mdev->dev, + parent->ops->mdev_attr_groups); +destroy_ops_err: + mutex_unlock(&parent->ops_lock); + + return ret; +} + +static void mdev_release_device(struct kref *kref) +{ + struct mdev_device *mdev = container_of(kref, struct mdev_device, ref); + struct parent_device *parent = mdev->parent; + + device_unregister(&mdev->dev); + wake_up(&parent->release_done); + mdev_put_parent(parent); +} + +struct mdev_device *mdev_get_device(struct mdev_device *mdev) +{ + if (mdev) + kref_get(&mdev->ref); + + return mdev; +} +EXPORT_SYMBOL(mdev_get_device); + +void mdev_put_device(struct mdev_device *mdev) +{ + if (mdev) + kref_put(&mdev->ref, mdev_release_device); +} +EXPORT_SYMBOL(mdev_put_device); + +/* + * Find first mediated device from given uuid and increment refcount of + * mediated device. Caller should call mdev_put_device() when the use of + * mdev_device is done. + */ +static struct mdev_device *mdev_get_first_device_by_uuid(uuid_le uuid) +{ + struct mdev_device *mdev = NULL, *p; + struct parent_device *parent; + + mutex_lock(&parent_devices.list_lock); + list_for_each_entry(parent, &parent_devices.dev_list, next) { + mutex_lock(&parent->mdev_list_lock); + list_for_each_entry(p, &parent->mdev_list, next) { + if (uuid_le_cmp(p->uuid, uuid) == 0) { + mdev = mdev_get_device(p); + break; + } + } + mutex_unlock(&parent->mdev_list_lock); + + if (mdev) + break; + } + mutex_unlock(&parent_devices.list_lock); + return mdev; +} + +/* + * Find mediated device from given iommu_group and increment refcount of + * mediated device. Caller should call mdev_put_device() when the use of + * mdev_device is done. + */ +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group) +{ + struct mdev_device *mdev = NULL, *p; + struct parent_device *parent; + + mutex_lock(&parent_devices.list_lock); + list_for_each_entry(parent, &parent_devices.dev_list, next) { + mutex_lock(&parent->mdev_list_lock); + list_for_each_entry(p, &parent->mdev_list, next) { + if (!p->group) + continue; + + if (iommu_group_id(p->group) == iommu_group_id(group)) { + mdev = mdev_get_device(p); + break; + } + } + mutex_unlock(&parent->mdev_list_lock); + + if (mdev) + break; + } + mutex_unlock(&parent_devices.list_lock); + return mdev; +} +EXPORT_SYMBOL(mdev_get_device_by_group); + +/* + * mdev_register_device : Register a device + * @dev: device structure representing parent device. + * @ops: Parent device operation structure to be registered. + * + * Add device to list of registered parent devices. + * Returns a negative value on error, otherwise 0. + */ +int mdev_register_device(struct device *dev, const struct parent_ops *ops) +{ + int ret = 0; + struct parent_device *parent; + + if (!dev || !ops) + return -EINVAL; + + mutex_lock(&parent_devices.list_lock); + + /* Check for duplicate */ + parent = find_parent_device(dev); + if (parent) { + ret = -EEXIST; + goto add_dev_err; + } + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) { + ret = -ENOMEM; + goto add_dev_err; + } + + kref_init(&parent->ref); + list_add(&parent->next, &parent_devices.dev_list); + mutex_unlock(&parent_devices.list_lock); + + parent->dev = dev; + parent->ops = ops; + mutex_init(&parent->ops_lock); + mutex_init(&parent->mdev_list_lock); + INIT_LIST_HEAD(&parent->mdev_list); + init_waitqueue_head(&parent->release_done); + + ret = mdev_create_sysfs_files(dev); + if (ret) + goto add_sysfs_error; + + ret = mdev_add_attribute_group(dev, ops->dev_attr_groups); + if (ret) + goto add_group_error; + + dev_info(dev, "MDEV: Registered\n"); + return 0; + +add_group_error: + mdev_remove_sysfs_files(dev); +add_sysfs_error: + mutex_lock(&parent_devices.list_lock); + list_del(&parent->next); + mutex_unlock(&parent_devices.list_lock); + mdev_put_parent(parent); + return ret; + +add_dev_err: + mutex_unlock(&parent_devices.list_lock); + return ret; +} +EXPORT_SYMBOL(mdev_register_device); + +/* + * mdev_unregister_device : Unregister a parent device + * @dev: device structure representing parent device. + * + * Remove device from list of registered parent devices. Give a chance to free + * existing mediated devices for given device. + */ + +void mdev_unregister_device(struct device *dev) +{ + struct parent_device *parent; + struct mdev_device *mdev, *n; + int ret; + + mutex_lock(&parent_devices.list_lock); + parent = find_parent_device(dev); + + if (!parent) { + mutex_unlock(&parent_devices.list_lock); + return; + } + dev_info(dev, "MDEV: Unregistering\n"); + + /* + * Remove parent from the list and remove create and destroy sysfs + * files so that no new mediated device could be created for this parent + */ + list_del(&parent->next); + mdev_remove_sysfs_files(dev); + mutex_unlock(&parent_devices.list_lock); + + mutex_lock(&parent->ops_lock); + mdev_remove_attribute_group(dev, + parent->ops->dev_attr_groups); + mutex_unlock(&parent->ops_lock); + + mutex_lock(&parent->mdev_list_lock); + list_for_each_entry_safe(mdev, n, &parent->mdev_list, next) { + mdev_device_destroy_ops(mdev, true); + list_del(&mdev->next); + mdev_put_device(mdev); + } + mutex_unlock(&parent->mdev_list_lock); + + do { + ret = wait_event_interruptible_timeout(parent->release_done, + list_empty(&parent->mdev_list), HZ * 10); + if (ret == -ERESTARTSYS) { + dev_warn(dev, "Mediated devices are in use, task" + " \"%s\" (%d) " + "blocked until all are released", + current->comm, task_pid_nr(current)); + } + } while (ret <= 0); + + mdev_put_parent(parent); +} +EXPORT_SYMBOL(mdev_unregister_device); + +/* + * Functions required for mdev-sysfs + */ +static void mdev_device_release(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + dev_dbg(&mdev->dev, "MDEV: destroying\n"); + kfree(mdev); +} + +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, + char *mdev_params) +{ + int ret; + struct mdev_device *mdev; + struct parent_device *parent; + + parent = mdev_get_parent_by_dev(dev); + if (!parent) + return -EINVAL; + + /* Check for duplicate */ + mdev = find_mdev_device(parent, uuid, instance); + if (mdev) { + ret = -EEXIST; + goto create_err; + } + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto create_err; + } + + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); + mdev->instance = instance; + mdev->parent = parent; + mutex_init(&mdev->ops_lock); + kref_init(&mdev->ref); + + mdev->dev.parent = dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + dev_set_name(&mdev->dev, "%pUb-%d", uuid.b, instance); + + ret = device_register(&mdev->dev); + if (ret) { + put_device(&mdev->dev); + goto create_err; + } + + ret = mdev_device_create_ops(mdev, mdev_params); + if (ret) + goto create_failed; + + mutex_lock(&parent->mdev_list_lock); + list_add(&mdev->next, &parent->mdev_list); + mutex_unlock(&parent->mdev_list_lock); + + dev_dbg(&mdev->dev, "MDEV: created\n"); + + return ret; + +create_failed: + device_unregister(&mdev->dev); + +create_err: + mdev_put_parent(parent); + return ret; +} + +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance) +{ + struct mdev_device *mdev; + struct parent_device *parent; + int ret; + + parent = mdev_get_parent_by_dev(dev); + if (!parent) { + ret = -EINVAL; + goto destroy_err; + } + + mdev = find_mdev_device(parent, uuid, instance); + if (!mdev) { + ret = -EINVAL; + goto destroy_err; + } + + ret = mdev_device_destroy_ops(mdev, false); + if (ret) + goto destroy_err; + + mdev_put_parent(parent); + + mutex_lock(&parent->mdev_list_lock); + list_del(&mdev->next); + mutex_unlock(&parent->mdev_list_lock); + + mdev_put_device(mdev); + return ret; + +destroy_err: + mdev_put_parent(parent); + return ret; +} + +void mdev_device_supported_config(struct device *dev, char *str) +{ + struct parent_device *parent; + + parent = mdev_get_parent_by_dev(dev); + + if (parent) { + mutex_lock(&parent->ops_lock); + if (parent->ops->supported_config) + parent->ops->supported_config(parent->dev, str); + mutex_unlock(&parent->ops_lock); + mdev_put_parent(parent); + } +} + +int mdev_device_start(uuid_le uuid) +{ + int ret = 0; + struct mdev_device *mdev; + struct parent_device *parent; + + mdev = mdev_get_first_device_by_uuid(uuid); + if (!mdev) + return -EINVAL; + + parent = mdev->parent; + + mutex_lock(&parent->ops_lock); + if (parent->ops->start) + ret = parent->ops->start(mdev->uuid); + mutex_unlock(&parent->ops_lock); + + if (ret) + pr_err("mdev_start failed %d\n", ret); + else + kobject_uevent(&mdev->dev.kobj, KOBJ_ONLINE); + + mdev_put_device(mdev); + + return ret; +} + +int mdev_device_shutdown(uuid_le uuid) +{ + int ret = 0; + struct mdev_device *mdev; + struct parent_device *parent; + + mdev = mdev_get_first_device_by_uuid(uuid); + if (!mdev) + return -EINVAL; + + parent = mdev->parent; + + mutex_lock(&parent->ops_lock); + if (parent->ops->shutdown) + ret = parent->ops->shutdown(mdev->uuid); + mutex_unlock(&parent->ops_lock); + + if (ret) + pr_err("mdev_shutdown failed %d\n", ret); + else + kobject_uevent(&mdev->dev.kobj, KOBJ_OFFLINE); + + mdev_put_device(mdev); + return ret; +} + +static struct class mdev_class = { + .name = MDEV_CLASS_NAME, + .owner = THIS_MODULE, + .class_attrs = mdev_class_attrs, +}; + +static int __init mdev_init(void) +{ + int ret; + + mutex_init(&parent_devices.list_lock); + INIT_LIST_HEAD(&parent_devices.dev_list); + + ret = class_register(&mdev_class); + if (ret) { + pr_err("Failed to register mdev class\n"); + return ret; + } + + ret = mdev_bus_register(); + if (ret) { + pr_err("Failed to register mdev bus\n"); + class_unregister(&mdev_class); + return ret; + } + + return ret; +} + +static void __exit mdev_exit(void) +{ + mdev_bus_unregister(); + class_unregister(&mdev_class); +} + +module_init(mdev_init) +module_exit(mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c new file mode 100644 index 000000000000..f1aed541111d --- /dev/null +++ b/drivers/vfio/mdev/mdev_driver.c @@ -0,0 +1,138 @@ +/* + * MDEV driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +static int mdev_attach_iommu(struct mdev_device *mdev) +{ + int ret; + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) { + dev_err(&mdev->dev, "MDEV: failed to allocate group!\n"); + return PTR_ERR(group); + } + + ret = iommu_group_add_device(group, &mdev->dev); + if (ret) { + dev_err(&mdev->dev, "MDEV: failed to add dev to group!\n"); + goto attach_fail; + } + + mdev->group = group; + + dev_info(&mdev->dev, "MDEV: group_id = %d\n", + iommu_group_id(group)); +attach_fail: + iommu_group_put(group); + return ret; +} + +static void mdev_detach_iommu(struct mdev_device *mdev) +{ + iommu_group_remove_device(&mdev->dev); + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); +} + +static int mdev_probe(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + ret = mdev_attach_iommu(mdev); + if (ret) { + dev_err(dev, "Failed to attach IOMMU\n"); + return ret; + } + + if (drv && drv->probe) + ret = drv->probe(dev); + + return ret; +} + +static int mdev_remove(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + + if (drv && drv->remove) + drv->remove(dev); + + mdev_detach_iommu(mdev); + + return 0; +} + +static int mdev_match(struct device *dev, struct device_driver *drv) +{ + struct mdev_driver *mdrv = to_mdev_driver(drv); + + if (mdrv && mdrv->match) + return mdrv->match(dev); + + return 0; +} + +struct bus_type mdev_bus_type = { + .name = "mdev", + .match = mdev_match, + .probe = mdev_probe, + .remove = mdev_remove, +}; +EXPORT_SYMBOL_GPL(mdev_bus_type); + +/* + * mdev_register_driver - register a new MDEV driver + * @drv: the driver to register + * @owner: module owner of driver to be registered + * + * Returns a negative value on error, otherwise 0. + */ +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +{ + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &mdev_bus_type; + drv->driver.owner = owner; + + /* register with core */ + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(mdev_register_driver); + +/* + * mdev_unregister_driver - unregister MDEV driver + * @drv: the driver to unregister + * + */ +void mdev_unregister_driver(struct mdev_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(mdev_unregister_driver); + +int mdev_bus_register(void) +{ + return bus_register(&mdev_bus_type); +} + +void mdev_bus_unregister(void) +{ + bus_unregister(&mdev_bus_type); +} diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h new file mode 100644 index 000000000000..991d7f796169 --- /dev/null +++ b/drivers/vfio/mdev/mdev_private.h @@ -0,0 +1,33 @@ +/* + * Mediated device interal definitions + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_PRIVATE_H +#define MDEV_PRIVATE_H + +int mdev_bus_register(void); +void mdev_bus_unregister(void); + +/* Function prototypes for mdev_sysfs */ + +extern struct class_attribute mdev_class_attrs[]; + +int mdev_create_sysfs_files(struct device *dev); +void mdev_remove_sysfs_files(struct device *dev); + +int mdev_device_create(struct device *dev, uuid_le uuid, uint32_t instance, + char *mdev_params); +int mdev_device_destroy(struct device *dev, uuid_le uuid, uint32_t instance); +void mdev_device_supported_config(struct device *dev, char *str); +int mdev_device_start(uuid_le uuid); +int mdev_device_shutdown(uuid_le uuid); + +#endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c new file mode 100644 index 000000000000..48b66e40009e --- /dev/null +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -0,0 +1,300 @@ +/* + * File attributes for Mediated devices + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +/* Prototypes */ +static ssize_t mdev_supported_types_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static DEVICE_ATTR_RO(mdev_supported_types); + +static ssize_t mdev_create_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +static DEVICE_ATTR_WO(mdev_create); + +static ssize_t mdev_destroy_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +static DEVICE_ATTR_WO(mdev_destroy); + +/* Static functions */ + +#define UUID_CHAR_LENGTH 36 +#define UUID_BYTE_LENGTH 16 + +#define SUPPORTED_TYPE_BUFFER_LENGTH 1024 + +static inline bool is_uuid_sep(char sep) +{ + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') + return true; + return false; +} + +static int uuid_parse(const char *str, uuid_le *uuid) +{ + int i; + + if (strlen(str) < UUID_CHAR_LENGTH) + return -EINVAL; + + for (i = 0; i < UUID_BYTE_LENGTH; i++) { + if (!isxdigit(str[0]) || !isxdigit(str[1])) { + pr_err("%s err", __func__); + return -EINVAL; + } + + uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); + str += 2; + if (is_uuid_sep(*str)) + str++; + } + + return 0; +} + +/* mdev sysfs Functions */ +static ssize_t mdev_supported_types_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + char *str, *ptr; + ssize_t n; + + str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL); + if (!str) + return -ENOMEM; + + ptr = str; + mdev_device_supported_config(dev, str); + + n = sprintf(buf, "%s\n", str); + kfree(ptr); + + return n; +} + +static ssize_t mdev_create_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char *str, *pstr; + char *uuid_str, *instance_str, *mdev_params = NULL, *params = NULL; + uuid_le uuid; + uint32_t instance; + int ret; + + pstr = str = kstrndup(buf, count, GFP_KERNEL); + + if (!str) + return -ENOMEM; + + uuid_str = strsep(&str, ":"); + if (!uuid_str) { + pr_err("mdev_create: Empty UUID string %s\n", buf); + ret = -EINVAL; + goto create_error; + } + + if (!str) { + pr_err("mdev_create: mdev instance not present %s\n", buf); + ret = -EINVAL; + goto create_error; + } + + instance_str = strsep(&str, ":"); + if (!instance_str) { + pr_err("mdev_create: Empty instance string %s\n", buf); + ret = -EINVAL; + goto create_error; + } + + ret = kstrtouint(instance_str, 0, &instance); + if (ret) { + pr_err("mdev_create: mdev instance parsing error %s\n", buf); + goto create_error; + } + + if (str) + params = mdev_params = kstrdup(str, GFP_KERNEL); + + ret = uuid_parse(uuid_str, &uuid); + if (ret) { + pr_err("mdev_create: UUID parse error %s\n", buf); + goto create_error; + } + + ret = mdev_device_create(dev, uuid, instance, mdev_params); + if (ret) + pr_err("mdev_create: Failed to create mdev device\n"); + else + ret = count; + +create_error: + kfree(params); + kfree(pstr); + return ret; +} + +static ssize_t mdev_destroy_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char *uuid_str, *str, *pstr; + uuid_le uuid; + unsigned int instance; + int ret; + + str = pstr = kstrndup(buf, count, GFP_KERNEL); + + if (!str) + return -ENOMEM; + + uuid_str = strsep(&str, ":"); + if (!uuid_str) { + pr_err("mdev_destroy: Empty UUID string %s\n", buf); + ret = -EINVAL; + goto destroy_error; + } + + if (str == NULL) { + pr_err("mdev_destroy: instance not specified %s\n", buf); + ret = -EINVAL; + goto destroy_error; + } + + ret = kstrtouint(str, 0, &instance); + if (ret) { + pr_err("mdev_destroy: instance parsing error %s\n", buf); + goto destroy_error; + } + + ret = uuid_parse(uuid_str, &uuid); + if (ret) { + pr_err("mdev_destroy: UUID parse error %s\n", buf); + goto destroy_error; + } + + ret = mdev_device_destroy(dev, uuid, instance); + if (ret == 0) + ret = count; + +destroy_error: + kfree(pstr); + return ret; +} + +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t count) +{ + char *uuid_str, *ptr; + uuid_le uuid; + int ret; + + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); + + if (!uuid_str) + return -ENOMEM; + + ret = uuid_parse(uuid_str, &uuid); + if (ret) { + pr_err("mdev_start: UUID parse error %s\n", buf); + goto start_error; + } + + ret = mdev_device_start(uuid); + if (ret == 0) + ret = count; + +start_error: + kfree(ptr); + return ret; +} + +ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t count) +{ + char *uuid_str, *ptr; + uuid_le uuid; + int ret; + + ptr = uuid_str = kstrndup(buf, count, GFP_KERNEL); + + if (!uuid_str) + return -ENOMEM; + + ret = uuid_parse(uuid_str, &uuid); + if (ret) { + pr_err("mdev_shutdown: UUID parse error %s\n", buf); + goto shutdown_error; + } + + ret = mdev_device_shutdown(uuid); + if (ret == 0) + ret = count; + +shutdown_error: + kfree(ptr); + return ret; + +} + +struct class_attribute mdev_class_attrs[] = { + __ATTR_WO(mdev_start), + __ATTR_WO(mdev_shutdown), + __ATTR_NULL +}; + +int mdev_create_sysfs_files(struct device *dev) +{ + int ret; + + ret = sysfs_create_file(&dev->kobj, + &dev_attr_mdev_supported_types.attr); + if (ret) { + pr_err("Failed to create mdev_supported_types sysfs entry\n"); + return ret; + } + + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr); + if (ret) { + pr_err("Failed to create mdev_create sysfs entry\n"); + goto create_sysfs_failed; + } + + ret = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr); + if (ret) { + pr_err("Failed to create mdev_destroy sysfs entry\n"); + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); + } else + return ret; + +create_sysfs_failed: + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); + return ret; +} + +void mdev_remove_sysfs_files(struct device *dev) +{ + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr); + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr); + sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr); +} diff --git a/include/linux/mdev.h b/include/linux/mdev.h new file mode 100644 index 000000000000..31b6f8572cfa --- /dev/null +++ b/include/linux/mdev.h @@ -0,0 +1,232 @@ +/* + * Mediated device definition + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_H +#define MDEV_H + +/* Common Data structures */ + +struct pci_region_info { + uint64_t start; + uint64_t size; + uint32_t flags; /* VFIO region info flags */ +}; + +enum mdev_emul_space { + EMUL_CONFIG_SPACE, /* PCI configuration space */ + EMUL_IO, /* I/O register space */ + EMUL_MMIO /* Memory-mapped I/O space */ +}; + +struct parent_device; + +/* + * Mediated device + */ + +struct mdev_device { + struct device dev; + struct parent_device *parent; + struct iommu_group *group; + void *iommu_data; + uuid_le uuid; + uint32_t instance; + + /* internal only */ + struct kref ref; + struct mutex ops_lock; + struct list_head next; +}; + + +/** + * struct parent_ops - Structure to be registered for each parent device to + * register the device to mdev module. + * + * @owner: The module owner. + * @dev_attr_groups: Default attributes of the parent device. + * @mdev_attr_groups: Default attributes of the mediated device. + * @supported_config: Called to get information about supported types. + * @dev : device structure of parent device. + * @config: should return string listing supported config + * Returns integer: success (0) or error (< 0) + * @create: Called to allocate basic resources in parent device's + * driver for a particular mediated device + * @dev: parent device structure on which mediated device + * should be created + * @uuid: VM's uuid for which VM it is intended to + * @instance: mediated instance in that VM + * @mdev_params: extra parameters required by parent + * device's driver. + * Returns integer: success (0) or error (< 0) + * @destroy: Called to free resources in parent device's driver for a + * a mediated device instance of that VM. + * @dev: parent device structure to which this mediated + * device points to. + * @uuid: VM's uuid for which the mediated device belongs + * @instance: mdev instance in that VM + * Returns integer: success (0) or error (< 0) + * If VM is running and destroy() is called that means the + * mdev is being hotunpluged. Return error if VM is running + * and driver doesn't support mediated device hotplug. + * @start: Called to initiate mediated device initialization + * process in parent device's driver when VM boots before + * VMM starts + * @uuid: VM's UUID which is booting. + * Returns integer: success (0) or error (< 0) + * @shutdown: Called to teardown mediated device related resources for + * the VM + * @uuid: VM's UUID which is shutting down . + * Returns integer: success (0) or error (< 0) + * @read: Read emulation callback + * @mdev: mediated device structure + * @buf: read buffer + * @count: number of bytes to read + * @address_space: specifies for which address space the + * request is intended for - pci_config_space, IO register + * space or MMIO space. + * @addr: address. + * Retuns number on bytes read on success or error. + * @write: Write emulation callback + * @mdev: mediated device structure + * @buf: write buffer + * @count: number of bytes to be written + * @address_space: specifies for which address space the + * request is intended for - pci_config_space, IO register + * space or MMIO space. + * @addr: address. + * Retuns number on bytes written on success or error. + * @set_irqs: Called to send about interrupts configuration + * information that VMM sets. + * @mdev: mediated device structure + * @flags, index, start, count and *data : same as that of + * struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API. + * @get_region_info: Called to get VFIO region size and flags of mediated + * device. + * @mdev: mediated device structure + * @region_index: VFIO region index + * @region_info: output, returns size and flags of + * requested region. + * Returns integer: success (0) or error (< 0) + * @validate_map_request: Validate remap pfn request + * @mdev: mediated device structure + * @virtaddr: target user address to start at + * @pfn: parent address of kernel memory, vendor driver + * can change if required. + * @size: size of map area, vendor driver can change the + * size of map area if desired. + * @prot: page protection flags for this mapping, vendor + * driver can change, if required. + * Returns integer: success (0) or error (< 0) + * + * Parent device that support mediated device should be registered with mdev + * module with parent_ops structure. + */ + +struct parent_ops { + struct module *owner; + const struct attribute_group **dev_attr_groups; + const struct attribute_group **mdev_attr_groups; + + int (*supported_config)(struct device *dev, char *config); + int (*create)(struct device *dev, uuid_le uuid, + uint32_t instance, char *mdev_params); + int (*destroy)(struct device *dev, uuid_le uuid, + uint32_t instance); + int (*start)(uuid_le uuid); + int (*shutdown)(uuid_le uuid); + ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count, + enum mdev_emul_space address_space, loff_t pos); + ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count, + enum mdev_emul_space address_space, loff_t pos); + int (*set_irqs)(struct mdev_device *vdev, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data); + int (*get_region_info)(struct mdev_device *vdev, int region_index, + struct pci_region_info *region_info); + int (*validate_map_request)(struct mdev_device *vdev, + unsigned long virtaddr, + unsigned long *pfn, unsigned long *size, + pgprot_t *prot); +}; + +/* + * Parent Device + */ +struct parent_device { + struct device *dev; + const struct parent_ops *ops; + + /* internal */ + struct kref ref; + struct mutex ops_lock; + struct list_head next; + struct list_head mdev_list; + struct mutex mdev_list_lock; + wait_queue_head_t release_done; +}; + +/** + * struct mdev_driver - Mediated device driver + * @name: driver name + * @probe: called when new device created + * @remove: called when device removed + * @match: called when new device or driver is added for this bus. Return 1 if + * given device can be handled by given driver and zero otherwise. + * @driver: device driver structure + * + **/ +struct mdev_driver { + const char *name; + int (*probe)(struct device *dev); + void (*remove)(struct device *dev); + int (*match)(struct device *dev); + struct device_driver driver; +}; + +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv) +{ + return drv ? container_of(drv, struct mdev_driver, driver) : NULL; +} + +static inline struct mdev_device *to_mdev_device(struct device *dev) +{ + return dev ? container_of(dev, struct mdev_device, dev) : NULL; +} + +static inline void *mdev_get_drvdata(struct mdev_device *mdev) +{ + return dev_get_drvdata(&mdev->dev); +} + +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) +{ + dev_set_drvdata(&mdev->dev, data); +} + +extern struct bus_type mdev_bus_type; + +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) + +extern int mdev_register_device(struct device *dev, + const struct parent_ops *ops); +extern void mdev_unregister_device(struct device *dev); + +extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); +extern void mdev_unregister_driver(struct mdev_driver *drv); + +extern struct mdev_device *mdev_get_device(struct mdev_device *mdev); +extern void mdev_put_device(struct mdev_device *mdev); + +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group); + +#endif /* MDEV_H */