diff mbox series

[v7,2/3] uacce: add uacce driver

Message ID 1572331216-9503-3-git-send-email-zhangfei.gao@linaro.org (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series Add uacce module for Accelerator | expand

Commit Message

Zhangfei Gao Oct. 29, 2019, 6:40 a.m. UTC
From: Kenneth Lee <liguozhu@hisilicon.com>

Uacce (Unified/User-space-access-intended Accelerator Framework) targets to
provide Shared Virtual Addressing (SVA) between accelerators and processes.
So accelerator can access any data structure of the main cpu.
This differs from the data sharing between cpu and io device, which share
data content rather than address.
Since unified address, hardware and user space of process can share the
same virtual address in the communication.

Uacce create a chrdev for every registration, the queue is allocated to
the process when the chrdev is opened. Then the process can access the
hardware resource by interact with the queue file. By mmap the queue
file space to user space, the process can directly put requests to the
hardware without syscall to the kernel space.

Signed-off-by: Kenneth Lee <liguozhu@hisilicon.com>
Signed-off-by: Zaibo Xu <xuzaibo@huawei.com>
Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
---
 Documentation/ABI/testing/sysfs-driver-uacce |  53 +++
 drivers/misc/Kconfig                         |   1 +
 drivers/misc/Makefile                        |   1 +
 drivers/misc/uacce/Kconfig                   |  13 +
 drivers/misc/uacce/Makefile                  |   2 +
 drivers/misc/uacce/uacce.c                   | 574 +++++++++++++++++++++++++++
 include/linux/uacce.h                        | 163 ++++++++
 include/uapi/misc/uacce/uacce.h              |  38 ++
 8 files changed, 845 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-uacce
 create mode 100644 drivers/misc/uacce/Kconfig
 create mode 100644 drivers/misc/uacce/Makefile
 create mode 100644 drivers/misc/uacce/uacce.c
 create mode 100644 include/linux/uacce.h
 create mode 100644 include/uapi/misc/uacce/uacce.h

Comments

Jonathan Cameron Oct. 31, 2019, 5:13 p.m. UTC | #1
On Tue, 29 Oct 2019 14:40:15 +0800
Zhangfei Gao <zhangfei.gao@linaro.org> wrote:

> From: Kenneth Lee <liguozhu@hisilicon.com>
> 
> Uacce (Unified/User-space-access-intended Accelerator Framework) targets to
> provide Shared Virtual Addressing (SVA) between accelerators and processes.
> So accelerator can access any data structure of the main cpu.
> This differs from the data sharing between cpu and io device, which share
> data content rather than address.
> Since unified address, hardware and user space of process can share the
> same virtual address in the communication.
> 
> Uacce create a chrdev for every registration, the queue is allocated to
> the process when the chrdev is opened. Then the process can access the
> hardware resource by interact with the queue file. By mmap the queue
> file space to user space, the process can directly put requests to the
> hardware without syscall to the kernel space.
> 
> Signed-off-by: Kenneth Lee <liguozhu@hisilicon.com>
> Signed-off-by: Zaibo Xu <xuzaibo@huawei.com>
> Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
> Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>

Great, much more compact.

I've not gone through this in detail yet but a few initial comments inline.

Thanks,

Jonathan

> ---
>  Documentation/ABI/testing/sysfs-driver-uacce |  53 +++
>  drivers/misc/Kconfig                         |   1 +
>  drivers/misc/Makefile                        |   1 +
>  drivers/misc/uacce/Kconfig                   |  13 +
>  drivers/misc/uacce/Makefile                  |   2 +
>  drivers/misc/uacce/uacce.c                   | 574 +++++++++++++++++++++++++++
>  include/linux/uacce.h                        | 163 ++++++++
>  include/uapi/misc/uacce/uacce.h              |  38 ++
>  8 files changed, 845 insertions(+)
>  create mode 100644 Documentation/ABI/testing/sysfs-driver-uacce
>  create mode 100644 drivers/misc/uacce/Kconfig
>  create mode 100644 drivers/misc/uacce/Makefile
>  create mode 100644 drivers/misc/uacce/uacce.c
>  create mode 100644 include/linux/uacce.h
>  create mode 100644 include/uapi/misc/uacce/uacce.h
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> new file mode 100644
> index 0000000..35699dc
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> @@ -0,0 +1,53 @@
> +What:           /sys/class/uacce/<dev_name>/id
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Id of the device.
> +
> +What:           /sys/class/uacce/<dev_name>/api
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Api of the device, used by application to match the correct driver
> +
> +What:           /sys/class/uacce/<dev_name>/flags
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
> +
> +What:           /sys/class/uacce/<dev_name>/available_instances
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Available instances left of the device
> +
> +What:           /sys/class/uacce/<dev_name>/algorithms
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Algorithms supported by this accelerator
How are they separated?  Userspace code needs to know that.
(comma, tab, newline?)

> +
> +What:           /sys/class/uacce/<dev_name>/qfrt_mmio_size

qfrt is not the most obvious naming ever.  Do we care about anything beyond
it being a region for this interface?  region_mmio_size maybe?

> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Page size of mmio region queue file

Size of page in this region, or number of pages in the region?

> +
> +What:           /sys/class/uacce/<dev_name>/qfrt_dus_size
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Page size of dus region queue file
> +
> +What:           /sys/class/uacce/<dev_name>/numa_distance
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Distance of device node to cpu node

I wonder if we should be doing this in here. There are other standard
ways of obtaining this for the device.  Follow parent and check node_id
there then use the /sys/bus/node path to find out the distances.

> +
> +What:           /sys/class/uacce/<dev_name>/node_id
> +Date:           Oct 2019
> +KernelVersion:  5.5
> +Contact:        linux-accelerators@lists.ozlabs.org
> +Description:    Id of the numa node
> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> index c55b637..929feb0 100644
> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -481,4 +481,5 @@ source "drivers/misc/cxl/Kconfig"
>  source "drivers/misc/ocxl/Kconfig"
>  source "drivers/misc/cardreader/Kconfig"
>  source "drivers/misc/habanalabs/Kconfig"
> +source "drivers/misc/uacce/Kconfig"
>  endmenu
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index c1860d3..9abf292 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -56,4 +56,5 @@ obj-$(CONFIG_OCXL)		+= ocxl/
>  obj-y				+= cardreader/
>  obj-$(CONFIG_PVPANIC)   	+= pvpanic.o
>  obj-$(CONFIG_HABANA_AI)		+= habanalabs/
> +obj-$(CONFIG_UACCE)		+= uacce/
>  obj-$(CONFIG_XILINX_SDFEC)	+= xilinx_sdfec.o
> diff --git a/drivers/misc/uacce/Kconfig b/drivers/misc/uacce/Kconfig
> new file mode 100644
> index 0000000..5e39b60
> --- /dev/null
> +++ b/drivers/misc/uacce/Kconfig
> @@ -0,0 +1,13 @@
> +config UACCE
> +	tristate "Accelerator Framework for User Land"
> +	depends on IOMMU_API
> +	help
> +	  UACCE provides interface for the user process to access the hardware
> +	  without interaction with the kernel space in data path.
> +
> +	  The user-space interface is described in
> +	  include/uapi/misc/uacce/uacce.h
> +
> +	  See Documentation/misc-devices/uacce.rst for more details.
> +
> +	  If you don't know what to do here, say N.

Pessimist :) Everyone should want uacce so don't put them off.  Having said
that perhaps for now it should be hidden and enabled on a driver by driver
basis?

> diff --git a/drivers/misc/uacce/Makefile b/drivers/misc/uacce/Makefile
> new file mode 100644
> index 0000000..5b4374e
> --- /dev/null
> +++ b/drivers/misc/uacce/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +obj-$(CONFIG_UACCE) += uacce.o
> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
> new file mode 100644
> index 0000000..2b6b038
> --- /dev/null
> +++ b/drivers/misc/uacce/uacce.c
> @@ -0,0 +1,574 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +#include <linux/compat.h>
> +#include <linux/dma-iommu.h>
> +#include <linux/module.h>
> +#include <linux/poll.h>
> +#include <linux/uacce.h>
> +
> +static struct class *uacce_class;
> +static dev_t uacce_devt;
> +static DEFINE_MUTEX(uacce_mutex);
> +static DEFINE_XARRAY_ALLOC(uacce_xa);
> +
> +static int uacce_start_queue(struct uacce_queue *q)
> +{
> +	int ret = -EINVAL;
> +
> +	mutex_lock(&uacce_mutex);
> +
> +	if (q->state != UACCE_Q_INIT)
> +		goto out_with_lock;
> +
> +	if (q->uacce->ops->start_queue) {
> +		ret = q->uacce->ops->start_queue(q);
> +		if (ret < 0)
> +			goto out_with_lock;
> +	}
> +
> +	q->state = UACCE_Q_STARTED;
out_with_lock:
> +	mutex_unlock(&uacce_mutex);
> +
return ret;
Though you'd need to handle ret a bit differently above...

> +	return 0;
> +
> +out_with_lock:
> +	mutex_unlock(&uacce_mutex);
> +	return ret;
> +}
> +
> +static int uacce_put_queue(struct uacce_queue *q)
> +{
> +	struct uacce_device *uacce = q->uacce;
> +
> +	mutex_lock(&uacce_mutex);
> +
> +	if (q->state == UACCE_Q_ZOMBIE)
> +		goto out;
> +
> +	if ((q->state == UACCE_Q_STARTED) && uacce->ops->stop_queue)
> +		uacce->ops->stop_queue(q);
> +
> +	if ((q->state == UACCE_Q_INIT || q->state == UACCE_Q_STARTED) &&
> +	     uacce->ops->put_queue)
> +		uacce->ops->put_queue(q);
> +
> +	q->state = UACCE_Q_ZOMBIE;
> +out:
> +	mutex_unlock(&uacce_mutex);
> +
> +	return 0;
> +}
> +
> +static long uacce_fops_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct uacce_queue *q = filep->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +
> +	switch (cmd) {
> +	case UACCE_CMD_START_Q:
> +		return uacce_start_queue(q);
> +
> +	case UACCE_CMD_PUT_Q:
> +		return uacce_put_queue(q);
> +
> +	default:
> +		if (!uacce->ops->ioctl)
> +			return -EINVAL;
> +
> +		return uacce->ops->ioctl(q, cmd, arg);
> +	}
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long uacce_fops_compat_ioctl(struct file *filep,
> +				   unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +
> +	return uacce_fops_unl_ioctl(filep, cmd, arg);
> +}
> +#endif
> +
> +static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle,
> +			  void *data)
> +{
> +	struct uacce_device *uacce = data;
> +	struct uacce_queue *q;
> +
> +	mutex_lock(&uacce->q_lock);
> +	list_for_each_entry(q, &uacce->qs, list) {
> +		if (q->pid == task_pid_nr(current))
> +			uacce_put_queue(q);
> +	}
> +	mutex_unlock(&uacce->q_lock);
> +
> +	return 0;
> +}
> +
> +static struct iommu_sva_ops uacce_sva_ops = {
> +	.mm_exit = uacce_sva_exit,
> +};
> +
> +static int uacce_fops_open(struct inode *inode, struct file *filep)
> +{
> +	struct iommu_sva *handle = NULL;
> +	struct uacce_device *uacce;
> +	struct uacce_queue *q;
> +	int ret = 0;
> +	int pasid = 0;
> +
> +	uacce = xa_load(&uacce_xa, iminor(inode));
> +	if (!uacce)
> +		return -ENODEV;
> +
> +	if (!try_module_get(uacce->pdev->driver->owner))
> +		return -ENODEV;
> +
> +	q = kzalloc(sizeof(struct uacce_queue), GFP_KERNEL);
> +	if (!q) {
> +		ret = -ENOMEM;
> +		goto out_with_module;
> +	}
> +
> +	if (uacce->flags & UACCE_DEV_SVA) {
> +		handle = iommu_sva_bind_device(uacce->pdev, current->mm, uacce);
> +		if (IS_ERR(handle))
> +			goto out_with_mem;
> +
> +		ret = iommu_sva_set_ops(handle, &uacce_sva_ops);
> +		if (ret)
> +			goto out_unbind;
> +
> +		pasid = iommu_sva_get_pasid(handle);
> +		if (pasid == IOMMU_PASID_INVALID)
> +			goto out_unbind;
> +	}
> +
> +	if (uacce->ops->get_queue) {
> +		ret = uacce->ops->get_queue(uacce, pasid, q);
> +		if (ret < 0)
> +			goto out_unbind;
> +	}
> +
> +	q->pid = task_pid_nr(current);
> +	q->pasid = pasid;
> +	q->handle = handle;
> +	q->uacce = uacce;
> +	memset(q->qfrs, 0, sizeof(q->qfrs));
> +	init_waitqueue_head(&q->wait);
> +	filep->private_data = q;
> +	q->state = UACCE_Q_INIT;
> +
> +	mutex_lock(&uacce->q_lock);
> +	list_add(&q->list, &uacce->qs);
> +	mutex_unlock(&uacce->q_lock);
> +
> +	return 0;
> +
> +out_unbind:
> +	if (uacce->flags & UACCE_DEV_SVA)
> +		iommu_sva_unbind_device(handle);
> +out_with_mem:
> +	kfree(q);
> +out_with_module:
> +	module_put(uacce->pdev->driver->owner);
> +	return ret;
> +}
> +
> +static int uacce_fops_release(struct inode *inode, struct file *filep)
> +{
> +	struct uacce_queue *q = filep->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +
> +	uacce_put_queue(q);
> +
> +	if (uacce->flags & UACCE_DEV_SVA)
> +		iommu_sva_unbind_device(q->handle);
> +
> +	mutex_lock(&uacce->q_lock);
> +	list_del(&q->list);
> +	mutex_unlock(&uacce->q_lock);
> +	kfree(q);
> +	module_put(uacce->pdev->driver->owner);
> +
> +	return 0;
> +}
> +
> +static void uacce_vma_close(struct vm_area_struct *vma)
> +{
> +	struct uacce_queue *q = vma->vm_private_data;
> +	enum uacce_qfrt type = 0;
> +
> +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
> +		type = vma->vm_pgoff;
> +
> +	kfree(q->qfrs[type]);
> +}
> +
> +static const struct vm_operations_struct uacce_vm_ops = {
> +	.close = uacce_vma_close,
> +};
> +
> +static struct uacce_qfile_region *
> +uacce_create_region(struct uacce_queue *q, struct vm_area_struct *vma,
> +		    enum uacce_qfrt type, unsigned int flags)
> +{
> +	struct uacce_device *uacce = q->uacce;
> +	struct uacce_qfile_region *qfr;
> +	int ret = -ENOMEM;
> +
> +	qfr = kzalloc(sizeof(*qfr), GFP_KERNEL);
> +	if (!qfr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	qfr->type = type;
> +	qfr->flags = flags;
> +
> +	if (vma->vm_flags & VM_READ)
> +		qfr->prot |= IOMMU_READ;
> +
> +	if (vma->vm_flags & VM_WRITE)
> +		qfr->prot |= IOMMU_WRITE;
> +
> +	if (flags & UACCE_QFRF_SELFMT) {
> +		if (!uacce->ops->mmap) {
> +			ret = -EINVAL;
> +			goto err_with_qfr;
> +		}
> +
> +		ret = uacce->ops->mmap(q, vma, qfr);
> +		if (ret)
> +			goto err_with_qfr;
> +		return qfr;
> +	}
> +
> +	return qfr;
> +
> +err_with_qfr:
> +	kfree(qfr);
> +	return ERR_PTR(ret);
> +}
> +
> +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> +{
> +	struct uacce_queue *q = filep->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +	struct uacce_qfile_region *qfr;
> +	enum uacce_qfrt type = 0;
> +	unsigned int flags = 0;
> +	int ret;
> +
> +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
> +		type = vma->vm_pgoff;
> +
> +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
> +	vma->vm_ops = &uacce_vm_ops;
> +	vma->vm_private_data = q;
> +
> +	mutex_lock(&uacce_mutex);
> +
> +	if (q->qfrs[type]) {
> +		ret = -EEXIST;
> +		goto out_with_lock;
> +	}
> +
> +	switch (type) {
> +	case UACCE_QFRT_MMIO:
> +		flags = UACCE_QFRF_SELFMT;
> +		break;
> +
> +	case UACCE_QFRT_DUS:
> +		if (uacce->flags & UACCE_DEV_SVA) {
> +			flags = UACCE_QFRF_SELFMT;
> +			break;
> +		}
> +		break;
> +
> +	default:
> +		WARN_ON(&uacce->dev);
> +		break;
> +	}
> +
> +	qfr = uacce_create_region(q, vma, type, flags);
> +	if (IS_ERR(qfr)) {
> +		ret = PTR_ERR(qfr);
> +		goto out_with_lock;
> +	}
> +	q->qfrs[type] = qfr;
> +

Could put
out_with_lock:
here and return ret instead of 0.
You'll need to set ret to default to 0 in that
case though.

> +	mutex_unlock(&uacce_mutex);
> +
> +	return 0;
> +
> +out_with_lock:
> +	mutex_unlock(&uacce_mutex);
> +	return ret;
> +}
> +
> +static __poll_t uacce_fops_poll(struct file *file, poll_table *wait)
> +{
> +	struct uacce_queue *q = file->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +
> +	poll_wait(file, &q->wait, wait);
> +	if (uacce->ops->is_q_updated && uacce->ops->is_q_updated(q))
> +		return EPOLLIN | EPOLLRDNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations uacce_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= uacce_fops_open,
> +	.release	= uacce_fops_release,
> +	.unlocked_ioctl	= uacce_fops_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= uacce_fops_compat_ioctl,
> +#endif
> +	.mmap		= uacce_fops_mmap,
> +	.poll		= uacce_fops_poll,
> +};
> +
> +#define to_uacce_device(dev) container_of(dev, struct uacce_device, dev)
> +
> +static ssize_t id_show(struct device *dev,
> +		       struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%d\n", uacce->dev_id);
> +}
> +
> +static ssize_t api_show(struct device *dev,
> +			struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%s\n", uacce->api_ver);
> +}
> +
> +static ssize_t numa_distance_show(struct device *dev,
> +				  struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	int distance;
> +
> +	distance = node_distance(smp_processor_id(), uacce->pdev->numa_node);
> +
> +	return sprintf(buf, "%d\n", abs(distance));
> +}
> +
> +static ssize_t node_id_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	int node_id;
> +
> +	node_id = dev_to_node(uacce->pdev);
> +
> +	return sprintf(buf, "%d\n", node_id);
> +}
> +
> +static ssize_t flags_show(struct device *dev,
> +			  struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%u\n", uacce->flags);
> +}
> +
> +static ssize_t available_instances_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +	int val = 0;
> +
> +	if (uacce->ops->get_available_instances)
> +		val = uacce->ops->get_available_instances(uacce);
> +
> +	return sprintf(buf, "%d\n", val);
> +}
> +
> +static ssize_t algorithms_show(struct device *dev,
> +			       struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%s", uacce->algs);
Any risk algs won't have the \n?
I'd kind of expect it to be a null-terminated array, to allow the core
to format it however it wants to.

> +}
> +
> +static ssize_t qfrt_mmio_size_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%lu\n",
> +		       uacce->qf_pg_size[UACCE_QFRT_MMIO] << PAGE_SHIFT);
> +}
> +
> +static ssize_t qfrt_dus_size_show(struct device *dev,
> +				  struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%lu\n",
> +		       uacce->qf_pg_size[UACCE_QFRT_DUS] << PAGE_SHIFT);
> +}
> +
> +static DEVICE_ATTR_RO(id);
> +static DEVICE_ATTR_RO(api);
> +static DEVICE_ATTR_RO(numa_distance);
> +static DEVICE_ATTR_RO(node_id);
> +static DEVICE_ATTR_RO(flags);
> +static DEVICE_ATTR_RO(available_instances);
> +static DEVICE_ATTR_RO(algorithms);
> +static DEVICE_ATTR_RO(qfrt_mmio_size);
> +static DEVICE_ATTR_RO(qfrt_dus_size);
> +
> +static struct attribute *uacce_dev_attrs[] = {
> +	&dev_attr_id.attr,
> +	&dev_attr_api.attr,
> +	&dev_attr_node_id.attr,
> +	&dev_attr_numa_distance.attr,
> +	&dev_attr_flags.attr,
> +	&dev_attr_available_instances.attr,
> +	&dev_attr_algorithms.attr,
> +	&dev_attr_qfrt_mmio_size.attr,
> +	&dev_attr_qfrt_dus_size.attr,
> +	NULL,
> +};
> +ATTRIBUTE_GROUPS(uacce_dev);
> +
> +static void uacce_release(struct device *dev)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	kfree(uacce);
> +}
> +
> +/**
> + * uacce_register - register an accelerator
This isn't quite correct kernel-doc.  Please run the
generation script over it and fix any warnings.

	uacce_register() - register an accelerator

> + * @parent: pointer of uacce parent device
> + * @interface: pointer of uacce_interface for register
> + */
> +struct uacce_device *uacce_register(struct device *parent,
> +				    struct uacce_interface *interface)
> +{
> +	unsigned int flags = interface->flags;
> +	struct uacce_device *uacce;
> +	int ret;
> +
> +	uacce = kzalloc(sizeof(struct uacce_device), GFP_KERNEL);
> +	if (!uacce)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (flags & UACCE_DEV_SVA) {
> +		ret = iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA);
> +		if (ret)
> +			flags &= ~UACCE_DEV_SVA;
> +	}
> +
> +	uacce->pdev = parent;
> +	uacce->flags = flags;
> +	uacce->ops = interface->ops;
> +
> +	ret = xa_alloc(&uacce_xa, &uacce->dev_id, uacce, xa_limit_32b,
> +		       GFP_KERNEL);
> +	if (ret < 0)
> +		goto err_with_uacce;
> +
> +	uacce->cdev = cdev_alloc();

If we can embed this (see below) then use cdev_init instead.

> +	if (!uacce->cdev) {
> +		ret = -ENOMEM;
> +		goto err_with_xa;
> +	}
> +
> +	INIT_LIST_HEAD(&uacce->qs);
> +	mutex_init(&uacce->q_lock);
> +	uacce->cdev->ops = &uacce_fops;
> +	uacce->cdev->owner = THIS_MODULE;
> +	device_initialize(&uacce->dev);
> +	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
> +	uacce->dev.class = uacce_class;
> +	uacce->dev.groups = uacce_dev_groups;
> +	uacce->dev.parent = uacce->pdev;
> +	uacce->dev.release = uacce_release;
> +	dev_set_name(&uacce->dev, "%s-%d", interface->name, uacce->dev_id);
> +	ret = cdev_device_add(uacce->cdev, &uacce->dev);
> +	if (ret)
> +		goto err_with_xa;
> +
> +	return uacce;
> +
> +err_with_xa:
> +	if (uacce->cdev)
> +		cdev_del(uacce->cdev);
Why not use a separate label to handle the above rather than checking if
it's set?

> +	xa_erase(&uacce_xa, uacce->dev_id);
> +err_with_uacce:
> +	if (flags & UACCE_DEV_SVA)
> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
> +	kfree(uacce);
> +	return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL_GPL(uacce_register);
> +
> +/**
> + * uacce_unregister - unregisters an accelerator
> + * @uacce: the accelerator to unregister
> + */
> +void uacce_unregister(struct uacce_device *uacce)
> +{
> +	if (!uacce)
> +		return;
> +

I'd like to see a comment here on why we are doing things that are not
unwinding actions from uacce_register.

> +	mutex_lock(&uacce->q_lock);
> +	if (!list_empty(&uacce->qs)) {
> +		struct uacce_queue *q;
> +
> +		list_for_each_entry(q, &uacce->qs, list) {
> +			uacce_put_queue(q);
> +			if (uacce->flags & UACCE_DEV_SVA)
> +				iommu_sva_unbind_device(q->handle);
> +		}
> +	}
> +	mutex_unlock(&uacce->q_lock);
> +

For these next parts, which are the unwind of uacce_register, why are they not
in the reverse order of what is happening in there (where possible given
device lifespan)?  That is, why do we not disable the iommu feature much later?

> +	if (uacce->flags & UACCE_DEV_SVA)
> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
> +
> +	cdev_device_del(uacce->cdev, &uacce->dev);
> +	xa_erase(&uacce_xa, uacce->dev_id);
> +	put_device(&uacce->dev);
> +}
> +EXPORT_SYMBOL_GPL(uacce_unregister);
> +
> +static int __init uacce_init(void)
> +{
> +	int ret;
> +
> +	uacce_class = class_create(THIS_MODULE, UACCE_NAME);
> +	if (IS_ERR(uacce_class))
> +		return PTR_ERR(uacce_class);
> +
> +	ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
> +	if (ret) {
> +		class_destroy(uacce_class);
> +		return ret;
drop the return ret out of these brackets. i.e.

if (ret)
	class_destroy(uacce_class);

return ret;

> +	}
> +
> +	return 0;
> +}
> +
> +static __exit void uacce_exit(void)
> +{
> +	unregister_chrdev_region(uacce_devt, MINORMASK);
> +	class_destroy(uacce_class);
> +}
> +
> +subsys_initcall(uacce_init);
> +module_exit(uacce_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Hisilicon Tech. Co., Ltd.");
> +MODULE_DESCRIPTION("Accelerator interface for Userland applications");
> diff --git a/include/linux/uacce.h b/include/linux/uacce.h
> new file mode 100644
> index 0000000..04c8643
> --- /dev/null
> +++ b/include/linux/uacce.h
> @@ -0,0 +1,163 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef _LINUX_UACCE_H
> +#define _LINUX_UACCE_H
> +
> +#include <linux/cdev.h>
> +#include <uapi/misc/uacce/uacce.h>
> +
> +#define UACCE_NAME		"uacce"
> +#define UACCE_QFRT_MAX		16
What does QFRT stand for?
> +#define UACCE_MAX_NAME_SIZE	64
> +
> +struct uacce_queue;
> +struct uacce_device;
> +
> +/**
> + * enum uacce_qfr_flag: queue file flag:
> + * @UACCE_QFRF_SELFMT: self maintained qfr
> + */
> +enum uacce_qfr_flag {
> +	UACCE_QFRF_SELFMT = BIT(0),
> +};

Same issue with enums for flags.  Doesn't make much sense to me.
Only one value can be taken which doesn't make it a flag.

> +
> +/**
> + * struct uacce_qfile_region - structure of queue file region
> + * @type: type of the qfr
> + * @flags: flags of qfr
> + * @prot: qfr protection flag
> + */
> +struct uacce_qfile_region {
> +	enum uacce_qfrt type;
> +	enum uacce_qfr_flag flags;
> +	u32 prot;
> +};
> +
> +/**
> + * struct uacce_ops - uacce device operations
> + * @get_available_instances:  get available instances left of the device
> + * @get_queue: get a queue from the device
> + * @put_queue: free a queue to the device
> + * @start_queue: make the queue start work after get_queue
> + * @stop_queue: make the queue stop work before put_queue
> + * @is_q_updated: check whether the task is finished
> + * @mask_notify: mask the task irq of queue
> + * @mmap: mmap addresses of queue to user space
> + * @reset: reset the uacce device
> + * @reset_queue: reset the queue
> + * @ioctl: ioctl for user space users of the queue
> + */
> +struct uacce_ops {
> +	int (*get_available_instances)(struct uacce_device *uacce);
> +	int (*get_queue)(struct uacce_device *uacce, unsigned long arg,
> +			 struct uacce_queue *q);
> +	void (*put_queue)(struct uacce_queue *q);
> +	int (*start_queue)(struct uacce_queue *q);
> +	void (*stop_queue)(struct uacce_queue *q);
> +	int (*is_q_updated)(struct uacce_queue *q);
> +	void (*mask_notify)(struct uacce_queue *q, int event_mask);
> +	int (*mmap)(struct uacce_queue *q, struct vm_area_struct *vma,
> +		    struct uacce_qfile_region *qfr);
> +	int (*reset)(struct uacce_device *uacce);
> +	int (*reset_queue)(struct uacce_queue *q);

Some of these aren't used by the only existing driver.  Introduce them only
in the series that uses them.

> +	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
> +		      unsigned long arg);
> +};
> +
> +/**
> + * struct uacce_interface
I think this needs a description for kernel doc (even if it's obvious!)
Could be wrong though.

> + * @name: the uacce device name.  Will show up in sysfs
> + * @flags: uacce device attributes
> + * @ops: pointer to the struct uacce_ops
> + *
> + * This structure is used for the uacce_register()
> + */
> +struct uacce_interface {
> +	char name[UACCE_MAX_NAME_SIZE];
> +	enum uacce_dev_flag flags;
> +	struct uacce_ops *ops;
> +};
> +
> +enum uacce_q_state {
> +	UACCE_Q_INIT,
> +	UACCE_Q_STARTED,
> +	UACCE_Q_ZOMBIE,
> +};
> +
> +/**
> + * struct uacce_queue
> + * @uacce: pointer to uacce
> + * @priv: private pointer
> + * @wait: wait queue head
> + * @pasid: pasid of the queue
> + * @pid: pid of the process using the queue
> + * @handle: iommu_sva handle return from iommu_sva_bind_device
> + * @list: queue list
> + * @qfrs: pointer of qfr regions
> + * @state: queue state machine
> + */
> +struct uacce_queue {
> +	struct uacce_device *uacce;
> +	void *priv;
> +	wait_queue_head_t wait;
> +	int pasid;
> +	pid_t pid;
> +	struct iommu_sva *handle;
> +	struct list_head list;
> +	struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
> +	enum uacce_q_state state;
> +};
> +
> +/**
> + * struct uacce_device
> + * @algs: supported algorithms
> + * @api_ver: api version
> + * @qf_pg_size: page size of the queue file regions
> + * @ops: pointer to the struct uacce_ops
> + * @pdev: pointer to the parent device
> + * @is_vf: whether virtual function
> + * @flags: uacce attributes
> + * @dev_id: id of the uacce device
> + * @prot: uacce protection flag
> + * @cdev: cdev of the uacce
> + * @dev: dev of the uacce
> + * @priv: private pointer of the uacce
> + * @qs: list head of queue->list
> + * @q_lock: lock for qs
> + */
> +struct uacce_device {
> +	const char *algs;
> +	const char *api_ver;
> +	unsigned long qf_pg_size[UACCE_QFRT_MAX];
> +	struct uacce_ops *ops;

Can we make this ops structure a pointer to a constant struct?
I'm guessing it'll be fixed for a given driver.

> +	struct device *pdev;

Perhaps just call it parent. pdev is easily confused with
PCI devices.

> +	bool is_vf;
> +	u32 flags;
> +	u32 dev_id;
> +	u32 prot;
> +	struct cdev *cdev;

Can we embed the cdev structure rather than use a pointer
and separate allocation?

> +	struct device dev;
> +	void *priv;
> +	struct list_head qs;
> +	struct mutex q_lock;
> +};
> +
> +#if IS_ENABLED(CONFIG_UACCE)
> +
> +struct uacce_device *uacce_register(struct device *parent,
> +				    struct uacce_interface *interface);
> +void uacce_unregister(struct uacce_device *uacce);
> +
> +#else /* CONFIG_UACCE */
> +
> +static inline
> +struct uacce_device *uacce_register(struct device *parent,
> +				    struct uacce_interface *interface)
> +{
> +	return ERR_PTR(-ENODEV);
> +}
> +
> +static inline void uacce_unregister(struct uacce_device *uacce) {}
> +
> +#endif /* CONFIG_UACCE */
> +
> +#endif /* _LINUX_UACCE_H */
> diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
> new file mode 100644
> index 0000000..a4f9378
> --- /dev/null
> +++ b/include/uapi/misc/uacce/uacce.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPIUUACCE_H
> +#define _UAPIUUACCE_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +/* UACCE_CMD_START_Q: Start the queue */
> +#define UACCE_CMD_START_Q	_IO('W', 0)
> +
> +/**
> + * UACCE_CMD_PUT_Q:
> + * User actively stop queue and free queue resource immediately
> + * Optimization method since close fd may delay
> + */
> +#define UACCE_CMD_PUT_Q		_IO('W', 1)
> +
> +/**
> + * enum uacce_dev_flag: Device flags:
> + * @UACCE_DEV_SVA: Shared Virtual Addresses
> + *		   Support PASID
> + *		   Support device page faults (PCI PRI or SMMU Stall)
> + */
> +enum uacce_dev_flag {
> +	UACCE_DEV_SVA = BIT(0),

As mentioned in docs review, this doesn't look like an enum to me.
Just use #define for the bit and a suitably sized integer for any
calls using it.

> +};
> +
> +/**
> + * enum uacce_qfrt: qfrt type
> + * @UACCE_QFRT_MMIO: device mmio region
> + * @UACCE_QFRT_DUS: device user share region
> + */
> +enum uacce_qfrt {
> +	UACCE_QFRT_MMIO = 0,
> +	UACCE_QFRT_DUS = 1,
> +};
> +
> +#endif
Zhangfei Gao Nov. 5, 2019, 7:43 a.m. UTC | #2
Hi, Jonathan

Thanks for the suggestions

On 2019/11/1 上午1:13, Jonathan Cameron wrote:
> On Tue, 29 Oct 2019 14:40:15 +0800
> Zhangfei Gao <zhangfei.gao@linaro.org> wrote:
>
>> From: Kenneth Lee <liguozhu@hisilicon.com>
>>
>> Uacce (Unified/User-space-access-intended Accelerator Framework) targets to
>> provide Shared Virtual Addressing (SVA) between accelerators and processes.
>> So accelerator can access any data structure of the main cpu.
>> This differs from the data sharing between cpu and io device, which share
>> data content rather than address.
>> Since unified address, hardware and user space of process can share the
>> same virtual address in the communication.
>>
>> Uacce create a chrdev for every registration, the queue is allocated to
>> the process when the chrdev is opened. Then the process can access the
>> hardware resource by interact with the queue file. By mmap the queue
>> file space to user space, the process can directly put requests to the
>> hardware without syscall to the kernel space.
>>
>> Signed-off-by: Kenneth Lee <liguozhu@hisilicon.com>
>> Signed-off-by: Zaibo Xu <xuzaibo@huawei.com>
>> Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
>> Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>
> Great, much more compact.
>
> I've not gone through this in detail yet but a few initial comments inline.
>
> Thanks,
>
> Jonathan
>
>> ---
>>   Documentation/ABI/testing/sysfs-driver-uacce |  53 +++
>>   drivers/misc/Kconfig                         |   1 +
>>   drivers/misc/Makefile                        |   1 +
>>   drivers/misc/uacce/Kconfig                   |  13 +
>>   drivers/misc/uacce/Makefile                  |   2 +
>>   drivers/misc/uacce/uacce.c                   | 574 +++++++++++++++++++++++++++
>>   include/linux/uacce.h                        | 163 ++++++++
>>   include/uapi/misc/uacce/uacce.h              |  38 ++
>>   8 files changed, 845 insertions(+)
>>   create mode 100644 Documentation/ABI/testing/sysfs-driver-uacce
>>   create mode 100644 drivers/misc/uacce/Kconfig
>>   create mode 100644 drivers/misc/uacce/Makefile
>>   create mode 100644 drivers/misc/uacce/uacce.c
>>   create mode 100644 include/linux/uacce.h
>>   create mode 100644 include/uapi/misc/uacce/uacce.h
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
>> new file mode 100644
>> index 0000000..35699dc
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
>> @@ -0,0 +1,53 @@
>> +What:           /sys/class/uacce/<dev_name>/id
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Id of the device.
>> +
>> +What:           /sys/class/uacce/<dev_name>/api
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Api of the device, used by application to match the correct driver
>> +
>> +What:           /sys/class/uacce/<dev_name>/flags
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
>> +
>> +What:           /sys/class/uacce/<dev_name>/available_instances
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Available instances left of the device
>> +
>> +What:           /sys/class/uacce/<dev_name>/algorithms
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Algorithms supported by this accelerator
> How are they separated?  Userspace code needs to know that.
> (comma, tab, newline?)
Yes, will add "separated by new line"
>
>> +
>> +What:           /sys/class/uacce/<dev_name>/qfrt_mmio_size
> qfrt is not the most obvious naming ever.  Do we care beyond its
> a region for this interface?  region_mmio_size maybe?
OK,
>
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Page size of mmio region queue file
> Size of page in this region, or number of pages in the region?
Change to "Page numbers of mmio region queue file"
>
>> +
>> +What:           /sys/class/uacce/<dev_name>/qfrt_dus_size
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Page size of dus region queue file
>> +
>> +What:           /sys/class/uacce/<dev_name>/numa_distance
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Distance of device node to cpu node
> I wonder if we should be doing this in here. There are other standard
> ways of obtaining this for the device.  Follow parent and check node_id
> there then use the /sys/bus/node path to find out the distances.
Could you clarify this method a bit more?
The purpose here is to let the CPU find the nearest device (e.g. zip) to do the work.
Does the user application know which node it is running on, so that it can compare distances?
>> +
>> +What:           /sys/class/uacce/<dev_name>/node_id
>> +Date:           Oct 2019
>> +KernelVersion:  5.5
>> +Contact:        linux-accelerators@lists.ozlabs.org
>> +Description:    Id of the numa node
>> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
>> index c55b637..929feb0 100644
>> --- a/drivers/misc/Kconfig
>> +++ b/drivers/misc/Kconfig
>> @@ -481,4 +481,5 @@ source "drivers/misc/cxl/Kconfig"
>>   source "drivers/misc/ocxl/Kconfig"
>>   source "drivers/misc/cardreader/Kconfig"
>>   source "drivers/misc/habanalabs/Kconfig"
>> +source "drivers/misc/uacce/Kconfig"
>>   endmenu
>> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
>> index c1860d3..9abf292 100644
>> --- a/drivers/misc/Makefile
>> +++ b/drivers/misc/Makefile
>> @@ -56,4 +56,5 @@ obj-$(CONFIG_OCXL)		+= ocxl/
>>   obj-y				+= cardreader/
>>   obj-$(CONFIG_PVPANIC)   	+= pvpanic.o
>>   obj-$(CONFIG_HABANA_AI)		+= habanalabs/
>> +obj-$(CONFIG_UACCE)		+= uacce/
>>   obj-$(CONFIG_XILINX_SDFEC)	+= xilinx_sdfec.o
>> diff --git a/drivers/misc/uacce/Kconfig b/drivers/misc/uacce/Kconfig
>> new file mode 100644
>> index 0000000..5e39b60
>> --- /dev/null
>> +++ b/drivers/misc/uacce/Kconfig
>> @@ -0,0 +1,13 @@
>> +config UACCE
>> +	tristate "Accelerator Framework for User Land"
>> +	depends on IOMMU_API
>> +	help
>> +	  UACCE provides interface for the user process to access the hardware
>> +	  without interaction with the kernel space in data path.
>> +
>> +	  The user-space interface is described in
>> +	  include/uapi/misc/uacce/uacce.h
>> +
>> +	  See Documentation/misc-devices/uacce.rst for more details.
>> +
>> +	  If you don't know what to do here, say N.
> Pessimist :) Everyone should want uacce so don't put them off.  Having said
> that perhaps for now it should be hidden and enabled on a driver by driver
> basis?
>
>> diff --git a/drivers/misc/uacce/Makefile b/drivers/misc/uacce/Makefile
>> new file mode 100644
>> index 0000000..5b4374e
>> --- /dev/null
>> +++ b/drivers/misc/uacce/Makefile
>> @@ -0,0 +1,2 @@
>> +# SPDX-License-Identifier: GPL-2.0-or-later
>> +obj-$(CONFIG_UACCE) += uacce.o
>> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
>> new file mode 100644
>> index 0000000..2b6b038
>> --- /dev/null
>> +++ b/drivers/misc/uacce/uacce.c
>> @@ -0,0 +1,574 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +#include <linux/compat.h>
>> +#include <linux/dma-iommu.h>
>> +#include <linux/module.h>
>> +#include <linux/poll.h>
>> +#include <linux/uacce.h>
>> +
>> +static struct class *uacce_class;
>> +static dev_t uacce_devt;
>> +static DEFINE_MUTEX(uacce_mutex);
>> +static DEFINE_XARRAY_ALLOC(uacce_xa);
>> +
>> +static int uacce_start_queue(struct uacce_queue *q)
>> +{
>> +	int ret = -EINVAL;
>> +
>> +	mutex_lock(&uacce_mutex);
>> +
>> +	if (q->state != UACCE_Q_INIT)
>> +		goto out_with_lock;
>> +
>> +	if (q->uacce->ops->start_queue) {
>> +		ret = q->uacce->ops->start_queue(q);
>> +		if (ret < 0)
>> +			goto out_with_lock;
>> +	}
>> +
>> +	q->state = UACCE_Q_STARTED;
> out_with_lock:
>> +	mutex_unlock(&uacce_mutex);
>> +
> return ret;
> Though need to handle ret a bit differently above...
OK
>
> +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> +{
> +	struct uacce_queue *q = filep->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +	struct uacce_qfile_region *qfr;
> +	enum uacce_qfrt type = 0;
> +	unsigned int flags = 0;
> +	int ret;
> +
> +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
> +		type = vma->vm_pgoff;
> +
> +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
> +	vma->vm_ops = &uacce_vm_ops;
> +	vma->vm_private_data = q;
> +
> +	mutex_lock(&uacce_mutex);
> +
> +	if (q->qfrs[type]) {
> +		ret = -EEXIST;
> +		goto out_with_lock;
> +	}
> +
> +	switch (type) {
> +	case UACCE_QFRT_MMIO:
> +		flags = UACCE_QFRF_SELFMT;
> +		break;
> +
> +	case UACCE_QFRT_DUS:
> +		if (uacce->flags & UACCE_DEV_SVA) {
> +			flags = UACCE_QFRF_SELFMT;
> +			break;
> +		}
> +		break;
> +
> +	default:
> +		WARN_ON(&uacce->dev);
> +		break;
> +	}
> +
> +	qfr = uacce_create_region(q, vma, type, flags);
> +	if (IS_ERR(qfr)) {
> +		ret = PTR_ERR(qfr);
> +		goto out_with_lock;
> +	}
> +	q->qfrs[type] = qfr;
> +
> Could put
> out_with_lock:
> here and return ret instead of 0.
> You'll need to set ret to default to 0 in that
> case though.
OK
>
> +static ssize_t algorithms_show(struct device *dev,
> +			       struct device_attribute *attr, char *buf)
> +{
> +	struct uacce_device *uacce = to_uacce_device(dev);
> +
> +	return sprintf(buf, "%s", uacce->algs);
> Any risk algs won't have the \n?
> I'd kind of expect it to be a null-terminated array to allow the core
> to format it however it wants to.
Yes, adding \n is better.
>
>> +}
>> +
>> +static ssize_t qfrt_mmio_size_show(struct device *dev,
>> +				   struct device_attribute *attr, char *buf)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +
>> +	return sprintf(buf, "%lu\n",
>> +		       uacce->qf_pg_size[UACCE_QFRT_MMIO] << PAGE_SHIFT);
>> +}
>> +
>> +static ssize_t qfrt_dus_size_show(struct device *dev,
>> +				  struct device_attribute *attr, char *buf)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +
>> +	return sprintf(buf, "%lu\n",
>> +		       uacce->qf_pg_size[UACCE_QFRT_DUS] << PAGE_SHIFT);
>> +}
>> +
>> +static DEVICE_ATTR_RO(id);
>> +static DEVICE_ATTR_RO(api);
>> +static DEVICE_ATTR_RO(numa_distance);
>> +static DEVICE_ATTR_RO(node_id);
>> +static DEVICE_ATTR_RO(flags);
>> +static DEVICE_ATTR_RO(available_instances);
>> +static DEVICE_ATTR_RO(algorithms);
>> +static DEVICE_ATTR_RO(qfrt_mmio_size);
>> +static DEVICE_ATTR_RO(qfrt_dus_size);
>> +
>> +static struct attribute *uacce_dev_attrs[] = {
>> +	&dev_attr_id.attr,
>> +	&dev_attr_api.attr,
>> +	&dev_attr_node_id.attr,
>> +	&dev_attr_numa_distance.attr,
>> +	&dev_attr_flags.attr,
>> +	&dev_attr_available_instances.attr,
>> +	&dev_attr_algorithms.attr,
>> +	&dev_attr_qfrt_mmio_size.attr,
>> +	&dev_attr_qfrt_dus_size.attr,
>> +	NULL,
>> +};
>> +ATTRIBUTE_GROUPS(uacce_dev);
>> +
>> +static void uacce_release(struct device *dev)
>> +{
>> +	struct uacce_device *uacce = to_uacce_device(dev);
>> +
>> +	kfree(uacce);
>> +}
>> +
>> +/**
>> + * uacce_register - register an accelerator
> This isn't quite correct kernel-doc.  Please run the
> generation script over it and fix any warnings.
>
> 	uacce_register() - register an accelerator
Sure, will add (), though no warning reported from ./scripts/kernel-doc
>
>> + * @parent: pointer of uacce parent device
>> + * @interface: pointer of uacce_interface for register
>> + */
>> +struct uacce_device *uacce_register(struct device *parent,
>> +				    struct uacce_interface *interface)
>> +{
>> +	unsigned int flags = interface->flags;
>> +	struct uacce_device *uacce;
>> +	int ret;
>> +
>> +	uacce = kzalloc(sizeof(struct uacce_device), GFP_KERNEL);
>> +	if (!uacce)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	if (flags & UACCE_DEV_SVA) {
>> +		ret = iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA);
>> +		if (ret)
>> +			flags &= ~UACCE_DEV_SVA;
>> +	}
>> +
>> +	uacce->pdev = parent;
>> +	uacce->flags = flags;
>> +	uacce->ops = interface->ops;
>> +
>> +	ret = xa_alloc(&uacce_xa, &uacce->dev_id, uacce, xa_limit_32b,
>> +		       GFP_KERNEL);
>> +	if (ret < 0)
>> +		goto err_with_uacce;
>> +
>> +	uacce->cdev = cdev_alloc();
> If we can embed this (see below) then use cdev_init instead.
>
>> +	if (!uacce->cdev) {
>> +		ret = -ENOMEM;
>> +		goto err_with_xa;
>> +	}
>> +
>> +	INIT_LIST_HEAD(&uacce->qs);
>> +	mutex_init(&uacce->q_lock);
>> +	uacce->cdev->ops = &uacce_fops;
>> +	uacce->cdev->owner = THIS_MODULE;
>> +	device_initialize(&uacce->dev);
>> +	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
>> +	uacce->dev.class = uacce_class;
>> +	uacce->dev.groups = uacce_dev_groups;
>> +	uacce->dev.parent = uacce->pdev;
>> +	uacce->dev.release = uacce_release;
>> +	dev_set_name(&uacce->dev, "%s-%d", interface->name, uacce->dev_id);
>> +	ret = cdev_device_add(uacce->cdev, &uacce->dev);
>> +	if (ret)
>> +		goto err_with_xa;
>> +
>> +	return uacce;
>> +
>> +err_with_xa:
>> +	if (uacce->cdev)
>> +		cdev_del(uacce->cdev);
> Why not use a separate label to handle the above rather than checking if
> it's set?
ok,
>
>> +	xa_erase(&uacce_xa, uacce->dev_id);
>> +err_with_uacce:
>> +	if (flags & UACCE_DEV_SVA)
>> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
>> +	kfree(uacce);
>> +	return ERR_PTR(ret);
>> +}
>> +EXPORT_SYMBOL_GPL(uacce_register);
>> +
>> +/**
>> + * uacce_unregister - unregisters an accelerator
>> + * @uacce: the accelerator to unregister
>> + */
>> +void uacce_unregister(struct uacce_device *uacce)
>> +{
>> +	if (!uacce)
>> +		return;
>> +
> I'd like to see a comment here on why we are doing things not unwinding
> actions from uacce_register.
OK will add comments.
Here is "ensure no open queue remains"
>> +	mutex_lock(&uacce->q_lock);
>> +	if (!list_empty(&uacce->qs)) {
>> +		struct uacce_queue *q;
>> +
>> +		list_for_each_entry(q, &uacce->qs, list) {
>> +			uacce_put_queue(q);
>> +			if (uacce->flags & UACCE_DEV_SVA)
>> +				iommu_sva_unbind_device(q->handle);
>> +		}
>> +	}
>> +	mutex_unlock(&uacce->q_lock);
>> +
> For these next parts which are the unwind of uacce_register, why are they not
> in the reverse order of what is happening in there (where possible given
> device lifespan). That is why do we not disable the iommu feature much later?
First close all queues, then disable sva feature.
>
>> +	if (uacce->flags & UACCE_DEV_SVA)
>> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
>> +
>> +	cdev_device_del(uacce->cdev, &uacce->dev);
>> +	xa_erase(&uacce_xa, uacce->dev_id);
>> +	put_device(&uacce->dev);
>> +}
>> +EXPORT_SYMBOL_GPL(uacce_unregister);
>> +
>> +static int __init uacce_init(void)
>> +{
>> +	int ret;
>> +
>> +	uacce_class = class_create(THIS_MODULE, UACCE_NAME);
>> +	if (IS_ERR(uacce_class))
>> +		return PTR_ERR(uacce_class);
>> +
>> +	ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
>> +	if (ret) {
>> +		class_destroy(uacce_class);
>> +		return ret;
> drop the return ret out of these brackets. i.e.
>
> if (ret)
> 	class_destroy(uacce_class)
>
> return ret;
sure, thanks
>
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static __exit void uacce_exit(void)
>> +{
>> +	unregister_chrdev_region(uacce_devt, MINORMASK);
>> +	class_destroy(uacce_class);
>> +}
>> +
>> +subsys_initcall(uacce_init);
>> +module_exit(uacce_exit);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Hisilicon Tech. Co., Ltd.");
>> +MODULE_DESCRIPTION("Accelerator interface for Userland applications");
>> diff --git a/include/linux/uacce.h b/include/linux/uacce.h
>> new file mode 100644
>> index 0000000..04c8643
>> --- /dev/null
>> +++ b/include/linux/uacce.h
>> @@ -0,0 +1,163 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef _LINUX_UACCE_H
>> +#define _LINUX_UACCE_H
>> +
>> +#include <linux/cdev.h>
>> +#include <uapi/misc/uacce/uacce.h>
>> +
>> +#define UACCE_NAME		"uacce"
>> +#define UACCE_QFRT_MAX		16
> What does QFRT stand for?
change to UACCE_MAX_REGION
>> +#define UACCE_MAX_NAME_SIZE	64
>> +
>> +struct uacce_queue;
>> +struct uacce_device;
>> +
>> +/**
>> + * enum uacce_qfr_flag: queue file flag:
>> + * @UACCE_QFRF_SELFMT: self maintained qfr
>> + */
>> +enum uacce_qfr_flag {
>> +	UACCE_QFRF_SELFMT = BIT(0),
>> +};
> Same issue with enums for flags.  Doesn't make much sense to me.
> Only one value can be taken which doesn't make it a flag.
>
>> +
>> +/**
>> + * struct uacce_qfile_region - structure of queue file region
>> + * @type: type of the qfr
>> + * @flags: flags of qfr
>> + * @prot: qfr protection flag
>> + */
>> +struct uacce_qfile_region {
>> +	enum uacce_qfrt type;
>> +	enum uacce_qfr_flag flags;
>> +	u32 prot;
>> +};
>> +
>> +/**
>> + * struct uacce_ops - uacce device operations
>> + * @get_available_instances:  get available instances left of the device
>> + * @get_queue: get a queue from the device
>> + * @put_queue: free a queue to the device
>> + * @start_queue: make the queue start work after get_queue
>> + * @stop_queue: make the queue stop work before put_queue
>> + * @is_q_updated: check whether the task is finished
>> + * @mask_notify: mask the task irq of queue
>> + * @mmap: mmap addresses of queue to user space
>> + * @reset: reset the uacce device
>> + * @reset_queue: reset the queue
>> + * @ioctl: ioctl for user space users of the queue
>> + */
>> +struct uacce_ops {
>> +	int (*get_available_instances)(struct uacce_device *uacce);
>> +	int (*get_queue)(struct uacce_device *uacce, unsigned long arg,
>> +			 struct uacce_queue *q);
>> +	void (*put_queue)(struct uacce_queue *q);
>> +	int (*start_queue)(struct uacce_queue *q);
>> +	void (*stop_queue)(struct uacce_queue *q);
>> +	int (*is_q_updated)(struct uacce_queue *q);
>> +	void (*mask_notify)(struct uacce_queue *q, int event_mask);
>> +	int (*mmap)(struct uacce_queue *q, struct vm_area_struct *vma,
>> +		    struct uacce_qfile_region *qfr);
>> +	int (*reset)(struct uacce_device *uacce);
>> +	int (*reset_queue)(struct uacce_queue *q);
> Some of these aren't used by the only existing driver.  Introduce them only
> in the series that uses them.
OK
>
>> +	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
>> +		      unsigned long arg);
>> +};
>> +
>> +/**
>> + * struct uacce_interface
> I think this needs a description for kernel doc (even if it's obvious!)
> Could be wrong though.
OK
>
>> + * @name: the uacce device name.  Will show up in sysfs
>> + * @flags: uacce device attributes
>> + * @ops: pointer to the struct uacce_ops
>> + *
>> + * This structure is used for the uacce_register()
>> + */
>> +struct uacce_interface {
>> +	char name[UACCE_MAX_NAME_SIZE];
>> +	enum uacce_dev_flag flags;
>> +	struct uacce_ops *ops;
>> +};
>> +
>> +enum uacce_q_state {
>> +	UACCE_Q_INIT,
>> +	UACCE_Q_STARTED,
>> +	UACCE_Q_ZOMBIE,
>> +};
>> +
>> +/**
>> + * struct uacce_queue
>> + * @uacce: pointer to uacce
>> + * @priv: private pointer
>> + * @wait: wait queue head
>> + * @pasid: pasid of the queue
>> + * @pid: pid of the process using the queue
>> + * @handle: iommu_sva handle return from iommu_sva_bind_device
>> + * @list: queue list
>> + * @qfrs: pointer of qfr regions
>> + * @state: queue state machine
>> + */
>> +struct uacce_queue {
>> +	struct uacce_device *uacce;
>> +	void *priv;
>> +	wait_queue_head_t wait;
>> +	int pasid;
>> +	pid_t pid;
>> +	struct iommu_sva *handle;
>> +	struct list_head list;
>> +	struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
>> +	enum uacce_q_state state;
>> +};
>> +
>> +/**
>> + * struct uacce_device
>> + * @algs: supported algorithms
>> + * @api_ver: api version
>> + * @qf_pg_size: page size of the queue file regions
>> + * @ops: pointer to the struct uacce_ops
>> + * @pdev: pointer to the parent device
>> + * @is_vf: whether virtual function
>> + * @flags: uacce attributes
>> + * @dev_id: id of the uacce device
>> + * @prot: uacce protection flag
>> + * @cdev: cdev of the uacce
>> + * @dev: dev of the uacce
>> + * @priv: private pointer of the uacce
>> + * @qs: list head of queue->list
>> + * @q_lock: lock for qs
>> + */
>> +struct uacce_device {
>> +	const char *algs;
>> +	const char *api_ver;
>> +	unsigned long qf_pg_size[UACCE_QFRT_MAX];
>> +	struct uacce_ops *ops;
> Can we make this ops structure a point to a constant struct?
> I'm guessing it'll be fixed for a given driver.
OK
>
>> +	struct device *pdev;
> Perhaps just call it parent. pdev will be confusing with
> pci devices.
OK
>
>> +	bool is_vf;
>> +	u32 flags;
>> +	u32 dev_id;
>> +	u32 prot;
>> +	struct cdev *cdev;
> Can we embed the cdev structure rather than use a pointer
> and separate allocation?
NO, we can't.
We originally embedded the cdev structure, but Greg reminded us that these two
structures have different lifetimes.
https://lkml.org/lkml/2019/8/28/771
>> +	struct device dev;
>> +	void *priv;
>> +	struct list_head qs;
>> +	struct mutex q_lock;
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_UACCE)
>> +
>> +struct uacce_device *uacce_register(struct device *parent,
>> +				    struct uacce_interface *interface);
>> +void uacce_unregister(struct uacce_device *uacce);
>> +
>> +#else /* CONFIG_UACCE */
>> +
>> +static inline
>> +struct uacce_device *uacce_register(struct device *parent,
>> +				    struct uacce_interface *interface)
>> +{
>> +	return ERR_PTR(-ENODEV);
>> +}
>> +
>> +static inline void uacce_unregister(struct uacce_device *uacce) {}
>> +
>> +#endif /* CONFIG_UACCE */
>> +
>> +#endif /* _LINUX_UACCE_H */
>> diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
>> new file mode 100644
>> index 0000000..a4f9378
>> --- /dev/null
>> +++ b/include/uapi/misc/uacce/uacce.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
>> +#ifndef _UAPIUUACCE_H
>> +#define _UAPIUUACCE_H
>> +
>> +#include <linux/types.h>
>> +#include <linux/ioctl.h>
>> +
>> +/* UACCE_CMD_START_Q: Start the queue */
>> +#define UACCE_CMD_START_Q	_IO('W', 0)
>> +
>> +/**
>> + * UACCE_CMD_PUT_Q:
>> + * User actively stop queue and free queue resource immediately
>> + * Optimization method since close fd may delay
>> + */
>> +#define UACCE_CMD_PUT_Q		_IO('W', 1)
>> +
>> +/**
>> + * enum uacce_dev_flag: Device flags:
>> + * @UACCE_DEV_SVA: Shared Virtual Addresses
>> + *		   Support PASID
>> + *		   Support device page faults (PCI PRI or SMMU Stall)
>> + */
>> +enum uacce_dev_flag {
>> +	UACCE_DEV_SVA = BIT(0),
> As mentioned in docs review, this doesn't look like an enum to me.
> Just use #define for the bit and a suitable sized integer for any
> calls using it.
OK, but more feature flags will be added in future patches.

Thanks
Jean-Philippe Brucker Nov. 5, 2019, 11:48 a.m. UTC | #3
Hi Zhangfei,

Thanks for simplifying this, it's a lot easier to review. I have some
additional comments.

On Tue, Oct 29, 2019 at 02:40:15PM +0800, Zhangfei Gao wrote:
> +static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle,
> +			  void *data)
> +{
> +	struct uacce_device *uacce = data;
> +	struct uacce_queue *q;
> +
> +	mutex_lock(&uacce->q_lock);
> +	list_for_each_entry(q, &uacce->qs, list) {
> +		if (q->pid == task_pid_nr(current))
> +			uacce_put_queue(q);

This won't work in some cases, because any thread can call __mmput() and
end up here. For example a sibling thread that inherited the queue, or a
workqueue that's executing mmput_async_fn(). In addition I think comparing
PID values is unsafe (see comment in pid.h), we'd need to use the struct
pid if we wanted to do it this way.

But I still believe it would be better to create an uacce_mm structure
that tracks all queues bound to this mm, and pass that to uacce_sva_exit
instead of the uacce_device.

The queue isn't bound to a task, but its address space. With clone() the
address space can be shared between tasks. In addition, whoever has a
queue fd also gets access to this address space. So after a fork() the
child may be able to program the queue to DMA into the parent's address
space, even without CLONE_VM. Users must be aware of this and I think it's
important to explain it very clearly in the UAPI.

[...]
> +static struct uacce_qfile_region *
> +uacce_create_region(struct uacce_queue *q, struct vm_area_struct *vma,
> +		    enum uacce_qfrt type, unsigned int flags)
> +{
> +	struct uacce_device *uacce = q->uacce;
> +	struct uacce_qfile_region *qfr;
> +	int ret = -ENOMEM;
> +
> +	qfr = kzalloc(sizeof(*qfr), GFP_KERNEL);
> +	if (!qfr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	qfr->type = type;
> +	qfr->flags = flags;
> +
> +	if (vma->vm_flags & VM_READ)
> +		qfr->prot |= IOMMU_READ;

qfr->prot and qfr->flags aren't used at the moment, you could remove them.

> +
> +	if (vma->vm_flags & VM_WRITE)
> +		qfr->prot |= IOMMU_WRITE;
> +
> +	if (flags & UACCE_QFRF_SELFMT) {
> +		if (!uacce->ops->mmap) {
> +			ret = -EINVAL;
> +			goto err_with_qfr;
> +		}
> +
> +		ret = uacce->ops->mmap(q, vma, qfr);
> +		if (ret)
> +			goto err_with_qfr;
> +		return qfr;
> +	}
> +
> +	return qfr;
> +
> +err_with_qfr:
> +	kfree(qfr);
> +	return ERR_PTR(ret);
> +}
> +
> +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> +{
> +	struct uacce_queue *q = filep->private_data;
> +	struct uacce_device *uacce = q->uacce;
> +	struct uacce_qfile_region *qfr;
> +	enum uacce_qfrt type = 0;
> +	unsigned int flags = 0;
> +	int ret;
> +
> +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
> +		type = vma->vm_pgoff;

Otherwise return -EINVAL?  type probably shouldn't default to MMIO if it
wasn't explicitly requested by the user.

> +
> +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
> +	vma->vm_ops = &uacce_vm_ops;
> +	vma->vm_private_data = q;
> +
> +	mutex_lock(&uacce_mutex);
> +
> +	if (q->qfrs[type]) {
> +		ret = -EEXIST;
> +		goto out_with_lock;
> +	}
> +
> +	switch (type) {
> +	case UACCE_QFRT_MMIO:
> +		flags = UACCE_QFRF_SELFMT;
> +		break;
> +
> +	case UACCE_QFRT_DUS:
> +		if (uacce->flags & UACCE_DEV_SVA) {
> +			flags = UACCE_QFRF_SELFMT;

I'd simplify this even further by getting rid of the SELFMT flag. It's the
only possibility at the moment.

> +			break;
> +		}
> +		break;
> +
> +	default:
> +		WARN_ON(&uacce->dev);

WARN_ON(uacce->dev). But shouldn't we instead return -EINVAL here?
UACCE_QFRT_MAX is currently 16, so users can easily trigger this WARN by
passing an invalid value.

[...]
> +void uacce_unregister(struct uacce_device *uacce)
> +{
> +	if (!uacce)
> +		return;
> +
> +	mutex_lock(&uacce->q_lock);
> +	if (!list_empty(&uacce->qs)) {
> +		struct uacce_queue *q;
> +
> +		list_for_each_entry(q, &uacce->qs, list) {
> +			uacce_put_queue(q);

The open file descriptor will still exist after this function returns.
Can all fops be called with a stale queue?

Thanks,
Jean
Zhangfei Gao Nov. 6, 2019, 8:17 a.m. UTC | #4
Hi, Jean

Thanks for the review.

On 2019/11/5 下午7:48, Jean-Philippe Brucker wrote:
> Hi Zhangfei,
>
> Thanks for simplifying this, it's a lot easier to review. I have some
> additional comments.
>
> On Tue, Oct 29, 2019 at 02:40:15PM +0800, Zhangfei Gao wrote:
>> +static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle,
>> +			  void *data)
>> +{
>> +	struct uacce_device *uacce = data;
>> +	struct uacce_queue *q;
>> +
>> +	mutex_lock(&uacce->q_lock);
>> +	list_for_each_entry(q, &uacce->qs, list) {
>> +		if (q->pid == task_pid_nr(current))
>> +			uacce_put_queue(q);
> This won't work in some cases, because any thread can call __mmput() and
> end up here. For example a sibling thread that inherited the queue, or a
> workqueue that's executing mmput_async_fn(). In addition I think comparing
> PID values is unsafe (see comment in pid.h), we'd need to use the struct
> pid if we wanted to do it this way.
OK, still in check.
>
> But I still believe it would be better to create an uacce_mm structure
> that tracks all queues bound to this mm, and pass that to uacce_sva_exit
> instead of the uacce_device.
I am afraid this method may not work.
Currently iommu_sva_bind_device() only accepts the same drvdata for
the same dev, which is why we cannot directly use "queue" as drvdata.
Creating a new uacce_mm structure each time would hit the same problem as
queue, and fail for the same dev.
So we pass uacce and pick the right queue inside.

>
> The queue isn't bound to a task, but its address space. With clone() the
> address space can be shared between tasks. In addition, whoever has a
> queue fd also gets access to this address space. So after a fork() the
> child may be able to program the queue to DMA into the parent's address
> space, even without CLONE_VM. Users must be aware of this and I think it's
> important to explain it very clearly in the UAPI.
>
> [...]
>> +static struct uacce_qfile_region *
>> +uacce_create_region(struct uacce_queue *q, struct vm_area_struct *vma,
>> +		    enum uacce_qfrt type, unsigned int flags)
>> +{
>> +	struct uacce_device *uacce = q->uacce;
>> +	struct uacce_qfile_region *qfr;
>> +	int ret = -ENOMEM;
>> +
>> +	qfr = kzalloc(sizeof(*qfr), GFP_KERNEL);
>> +	if (!qfr)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	qfr->type = type;
>> +	qfr->flags = flags;
>> +
>> +	if (vma->vm_flags & VM_READ)
>> +		qfr->prot |= IOMMU_READ;
> qfr->prot and qfr->flags aren't used at the moment, you could remove them.
Yes,
>
>> +
>> +	if (vma->vm_flags & VM_WRITE)
>> +		qfr->prot |= IOMMU_WRITE;
>> +
>> +	if (flags & UACCE_QFRF_SELFMT) {
>> +		if (!uacce->ops->mmap) {
>> +			ret = -EINVAL;
>> +			goto err_with_qfr;
>> +		}
>> +
>> +		ret = uacce->ops->mmap(q, vma, qfr);
>> +		if (ret)
>> +			goto err_with_qfr;
>> +		return qfr;
>> +	}
>> +
>> +	return qfr;
>> +
>> +err_with_qfr:
>> +	kfree(qfr);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
>> +{
>> +	struct uacce_queue *q = filep->private_data;
>> +	struct uacce_device *uacce = q->uacce;
>> +	struct uacce_qfile_region *qfr;
>> +	enum uacce_qfrt type = 0;
>> +	unsigned int flags = 0;
>> +	int ret;
>> +
>> +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
>> +		type = vma->vm_pgoff;
> Otherwise return -EINVAL?  type probably shouldn't default to MMIO if it
> wasn't explicitly requested by the user.
OK
>
>> +
>> +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
>> +	vma->vm_ops = &uacce_vm_ops;
>> +	vma->vm_private_data = q;
>> +
>> +	mutex_lock(&uacce_mutex);
>> +
>> +	if (q->qfrs[type]) {
>> +		ret = -EEXIST;
>> +		goto out_with_lock;
>> +	}
>> +
>> +	switch (type) {
>> +	case UACCE_QFRT_MMIO:
>> +		flags = UACCE_QFRF_SELFMT;
>> +		break;
>> +
>> +	case UACCE_QFRT_DUS:
>> +		if (uacce->flags & UACCE_DEV_SVA) {
>> +			flags = UACCE_QFRF_SELFMT;
> I'd simplify this even further by getting rid of the SELFMT flag. It's the
> only possibility at the moment.
OK, we can remove this flag for simplicity, may add it back if required 
in future patch.
>
>> +			break;
>> +		}
>> +		break;
>> +
>> +	default:
>> +		WARN_ON(&uacce->dev);
> WARN_ON(uacce->dev). But shouldn't we instead return -EINVAL here?
> UACCE_QFRT_MAX is currently 16, so users can easily trigger this WARN by
> passing an invalid value.
Yes, good idea.
>
> [...]
>> +void uacce_unregister(struct uacce_device *uacce)
>> +{
>> +	if (!uacce)
>> +		return;
>> +
>> +	mutex_lock(&uacce->q_lock);
>> +	if (!list_empty(&uacce->qs)) {
>> +		struct uacce_queue *q;
>> +
>> +		list_for_each_entry(q, &uacce->qs, list) {
>> +			uacce_put_queue(q);
> The open file descriptor will still exist after this function returns.
> Can all fops can be called with a stale queue?
To be clear:
Do you mean rmmod without fops_release?

Thanks
Jean-Philippe Brucker Nov. 6, 2019, 3:32 p.m. UTC | #5
On Wed, Nov 06, 2019 at 04:17:40PM +0800, zhangfei wrote:
> > But I still believe it would be better to create an uacce_mm structure
> > that tracks all queues bound to this mm, and pass that to uacce_sva_exit
> > instead of the uacce_device.
> I am afraid this method may not work.
> Since currently iommu_sva_bind_device only accept the same drvdata for the
> same dev,
> that's the reason we can not directly use "queue" as drvdata.
> Each time create an uacce_mm structure should be same problem as queue, and
> fail for same dev.
> So we use uacce and pick up the right queue inside.

What I had in mind is to keep one uacce_mm per mm and per device, and we can
pass that to iommu_sva_bind_device(). It requires some structure changes,
see the attached patch.

> > The queue isn't bound to a task, but its address space. With clone() the
> > address space can be shared between tasks. In addition, whoever has a
> > queue fd also gets access to this address space. So after a fork() the
> > child may be able to program the queue to DMA into the parent's address
> > space, even without CLONE_VM. Users must be aware of this and I think it's
> > important to explain it very clearly in the UAPI.
> > [...]
> > > +void uacce_unregister(struct uacce_device *uacce)
> > > +{
> > > +	if (!uacce)
> > > +		return;
> > > +
> > > +	mutex_lock(&uacce->q_lock);
> > > +	if (!list_empty(&uacce->qs)) {
> > > +		struct uacce_queue *q;
> > > +
> > > +		list_for_each_entry(q, &uacce->qs, list) {
> > > +			uacce_put_queue(q);
> > The open file descriptor will still exist after this function returns.
> > Can all fops can be called with a stale queue?
> To more clear:.
> Do you mean rmmod without fops_release.

Yes, I think so. What happens when userspace starts some queues, and
the device driver suddenly calls uacce_unregister()? We call
cdev_device_del() later in this function, but quoting the documentation:
"any cdevs already open will remain and their fops will still be callable
even after this function returns." So we need to make sure that any of the
fops is safe to run after the uacce device disappears.

I noticed a lock dependency inversion on uacce->q_lock: uacce_unregister()
calls iommu_sva_unbind_device() while holding the uacce->q_lock, but
uacce_sva_exit() takes the uacce->q_lock with the SVA lock held. In theory
we could simply avoid calling iommu_sva_unbind_device() here since it will
be done by fops_release(), but then disabling the SVA feature in
uacce_unregister() won't work (because there still are bonds). The
attached patch should fix it, but I haven't tried running uacce_register()
yet.

Thanks,
Jean
From 49559efc5cb26aadbcf580de03afd6e4ff67cedc Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Wed, 6 Nov 2019 10:10:07 +0000
Subject: [PATCH] uacce: Track mm<->queue bonds

The IOMMU core only tracks mm<->device bonds at the moment, because it
only needs to handle IOTLB invalidation and PASID table entries. However
uacce needs a finer granularity since multiple queues from the same
device can be bound to an mm. When the mm exits, all bound queues must
be stopped so that the IOMMU can safely clear the PASID table entry and
reallocate the PASID.

Introduce an intermediate struct uacce_mm that links uacce devices and
queues. Note that an mm may be bound to multiple devices but an uacce_mm
structure only ever belongs to a single device, because we don't need
anything more complex (if multiple devices are bound to one mm, then
we'll create one uacce_mm for each bond).

        uacce_device --+-- uacce_mm --+-- uacce_queue
                       |              '-- uacce_queue
                       |
                       '-- uacce_mm --+-- uacce_queue
                                      +-- uacce_queue
                                      '-- uacce_queue

If multiple device drivers need this model, it should be possible to
move it to iommu-sva in the future, with some changes to the API, and
have mm_exit() be called for multiple contexts per iommu_bond.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 drivers/misc/uacce/uacce.c | 174 +++++++++++++++++++++++++++----------
 include/linux/uacce.h      |  34 ++++++--
 2 files changed, 152 insertions(+), 56 deletions(-)

diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
index 2b6b03855ac6..d8a7fbfe7399 100644
--- a/drivers/misc/uacce/uacce.c
+++ b/drivers/misc/uacce/uacce.c
@@ -92,15 +92,19 @@ static long uacce_fops_compat_ioctl(struct file *filep,
 static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle,
 			  void *data)
 {
-	struct uacce_device *uacce = data;
+	struct uacce_mm *uacce_mm = data;
 	struct uacce_queue *q;
 
-	mutex_lock(&uacce->q_lock);
-	list_for_each_entry(q, &uacce->qs, list) {
-		if (q->pid == task_pid_nr(current))
-			uacce_put_queue(q);
-	}
-	mutex_unlock(&uacce->q_lock);
+	/*
+	 * No new queue can be added concurrently because no caller can have a
+	 * reference to this mm. But there may be concurrent calls to
+	 * uacce_mm_put(), so we need the lock.
+	 */
+	mutex_lock(&uacce_mm->lock);
+	list_for_each_entry(q, &uacce_mm->queues, list)
+		uacce_put_queue(q);
+	uacce_mm->mm = NULL;
+	mutex_unlock(&uacce_mm->lock);
 
 	return 0;
 }
@@ -109,13 +113,88 @@ static struct iommu_sva_ops uacce_sva_ops = {
 	.mm_exit = uacce_sva_exit,
 };
 
-static int uacce_fops_open(struct inode *inode, struct file *filep)
+static struct uacce_mm *uacce_mm_get(struct uacce_device *uacce,
+				     struct uacce_queue *q,
+				     struct mm_struct *mm)
 {
+	struct uacce_mm *uacce_mm = NULL;
 	struct iommu_sva *handle = NULL;
+	int ret;
+
+	lockdep_assert_held(&uacce->mm_lock);
+
+	list_for_each_entry(uacce_mm, &uacce->mm_list, list) {
+		if (uacce_mm->mm == mm) {
+			mutex_lock(&uacce_mm->lock);
+			list_add(&q->list, &uacce_mm->queues);
+			mutex_unlock(&uacce_mm->lock);
+			return uacce_mm;
+		}
+	}
+
+	uacce_mm = kzalloc(sizeof(*uacce_mm), GFP_KERNEL);
+	if (!uacce_mm)
+		return NULL;
+
+	if (uacce->flags & UACCE_DEV_SVA) {
+		/*
+		 * Safe to pass an incomplete uacce_mm, since mm_exit cannot
+		 * fire while we hold a reference to the mm.
+		 */
+		handle = iommu_sva_bind_device(uacce->pdev, mm, uacce_mm);
+		if (IS_ERR(handle))
+			goto err_free;
+
+		ret = iommu_sva_set_ops(handle, &uacce_sva_ops);
+		if (ret)
+			goto err_unbind;
+
+		uacce_mm->pasid = iommu_sva_get_pasid(handle);
+		if (uacce_mm->pasid == IOMMU_PASID_INVALID)
+			goto err_unbind;
+	}
+
+	uacce_mm->mm = mm;
+	uacce_mm->handle = handle;
+	INIT_LIST_HEAD(&uacce_mm->queues);
+	mutex_init(&uacce_mm->lock);
+	list_add(&q->list, &uacce_mm->queues);
+	list_add(&uacce_mm->list, &uacce->mm_list);
+
+	return uacce_mm;
+
+err_unbind:
+	if (handle)
+		iommu_sva_unbind_device(handle);
+err_free:
+	kfree(uacce_mm);
+	return NULL;
+}
+
+static void uacce_mm_put(struct uacce_queue *q)
+{
+	struct uacce_mm *uacce_mm = q->uacce_mm;
+
+	lockdep_assert_held(&q->uacce->mm_lock);
+
+	mutex_lock(&uacce_mm->lock);
+	list_del(&q->list);
+	mutex_unlock(&uacce_mm->lock);
+
+	if (list_empty(&uacce_mm->queues)) {
+		if (uacce_mm->handle)
+			iommu_sva_unbind_device(uacce_mm->handle);
+		list_del(&uacce_mm->list);
+		kfree(uacce_mm);
+	}
+}
+
+static int uacce_fops_open(struct inode *inode, struct file *filep)
+{
+	struct uacce_mm *uacce_mm = NULL;
 	struct uacce_device *uacce;
 	struct uacce_queue *q;
 	int ret = 0;
-	int pasid = 0;
 
 	uacce = xa_load(&uacce_xa, iminor(inode));
 	if (!uacce)
@@ -130,44 +209,37 @@ static int uacce_fops_open(struct inode *inode, struct file *filep)
 		goto out_with_module;
 	}
 
-	if (uacce->flags & UACCE_DEV_SVA) {
-		handle = iommu_sva_bind_device(uacce->pdev, current->mm, uacce);
-		if (IS_ERR(handle))
-			goto out_with_mem;
-
-		ret = iommu_sva_set_ops(handle, &uacce_sva_ops);
-		if (ret)
-			goto out_unbind;
+	q->state = UACCE_Q_ZOMBIE;
 
-		pasid = iommu_sva_get_pasid(handle);
-		if (pasid == IOMMU_PASID_INVALID)
-			goto out_unbind;
+	mutex_lock(&uacce->mm_lock);
+	uacce_mm = uacce_mm_get(uacce, q, current->mm);
+	mutex_unlock(&uacce->mm_lock);
+	if (!uacce_mm) {
+		ret = -ENOMEM;
+		goto out_with_mem;
 	}
 
+	q->uacce = uacce;
+	q->uacce_mm = uacce_mm;
+
 	if (uacce->ops->get_queue) {
-		ret = uacce->ops->get_queue(uacce, pasid, q);
+		ret = uacce->ops->get_queue(uacce, uacce_mm->pasid, q);
 		if (ret < 0)
-			goto out_unbind;
+			goto out_with_mm;
 	}
 
 	q->pid = task_pid_nr(current);
-	q->pasid = pasid;
-	q->handle = handle;
-	q->uacce = uacce;
 	memset(q->qfrs, 0, sizeof(q->qfrs));
 	init_waitqueue_head(&q->wait);
 	filep->private_data = q;
 	q->state = UACCE_Q_INIT;
 
-	mutex_lock(&uacce->q_lock);
-	list_add(&q->list, &uacce->qs);
-	mutex_unlock(&uacce->q_lock);
-
 	return 0;
 
-out_unbind:
-	if (uacce->flags & UACCE_DEV_SVA)
-		iommu_sva_unbind_device(handle);
+out_with_mm:
+	mutex_lock(&uacce->mm_lock);
+	uacce_mm_put(q);
+	mutex_unlock(&uacce->mm_lock);
 out_with_mem:
 	kfree(q);
 out_with_module:
@@ -182,12 +254,10 @@ static int uacce_fops_release(struct inode *inode, struct file *filep)
 
 	uacce_put_queue(q);
 
-	if (uacce->flags & UACCE_DEV_SVA)
-		iommu_sva_unbind_device(q->handle);
+	mutex_lock(&uacce->mm_lock);
+	uacce_mm_put(q);
+	mutex_unlock(&uacce->mm_lock);
 
-	mutex_lock(&uacce->q_lock);
-	list_del(&q->list);
-	mutex_unlock(&uacce->q_lock);
 	kfree(q);
 	module_put(uacce->pdev->driver->owner);
 
@@ -484,8 +554,8 @@ struct uacce_device *uacce_register(struct device *parent,
 		goto err_with_xa;
 	}
 
-	INIT_LIST_HEAD(&uacce->qs);
-	mutex_init(&uacce->q_lock);
+	INIT_LIST_HEAD(&uacce->mm_list);
+	mutex_init(&uacce->mm_lock);
 	uacce->cdev->ops = &uacce_fops;
 	uacce->cdev->owner = THIS_MODULE;
 	device_initialize(&uacce->dev);
@@ -519,20 +589,30 @@ EXPORT_SYMBOL_GPL(uacce_register);
  */
 void uacce_unregister(struct uacce_device *uacce)
 {
+	struct uacce_mm *uacce_mm;
+	struct uacce_queue *q;
+
 	if (!uacce)
 		return;
 
-	mutex_lock(&uacce->q_lock);
-	if (!list_empty(&uacce->qs)) {
-		struct uacce_queue *q;
-
-		list_for_each_entry(q, &uacce->qs, list) {
+	mutex_lock(&uacce->mm_lock);
+	list_for_each_entry(uacce_mm, &uacce->mm_list, list) {
+		/*
+		 * We don't take the uacce_mm->lock here. Since we hold the
+		 * device's mm_lock, no queue can be added to or removed from
+		 * this uacce_mm. We may run concurrently with mm_exit, but
+		 * uacce_put_queue() is serialized and iommu_sva_unbind_device()
+		 * waits for the lock that mm_exit is holding.
+		 */
+		list_for_each_entry(q, &uacce_mm->queues, list)
 			uacce_put_queue(q);
-			if (uacce->flags & UACCE_DEV_SVA)
-				iommu_sva_unbind_device(q->handle);
+
+		if (uacce->flags & UACCE_DEV_SVA) {
+			iommu_sva_unbind_device(uacce_mm->handle);
+			uacce_mm->handle = NULL;
 		}
 	}
-	mutex_unlock(&uacce->q_lock);
+	mutex_unlock(&uacce->mm_lock);
 
 	if (uacce->flags & UACCE_DEV_SVA)
 		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
index 04c8643c130b..8564e078287a 100644
--- a/include/linux/uacce.h
+++ b/include/linux/uacce.h
@@ -88,10 +88,9 @@ enum uacce_q_state {
  * @uacce: pointer to uacce
  * @priv: private pointer
  * @wait: wait queue head
- * @pasid: pasid of the queue
  * @pid: pid of the process using the queue
- * @handle: iommu_sva handle return from iommu_sva_bind_device
- * @list: queue list
+ * @list: index into uacce_mm
+ * @uacce_mm: the corresponding mm
  * @qfrs: pointer of qfr regions
  * @state: queue state machine
  */
@@ -99,10 +98,9 @@ struct uacce_queue {
 	struct uacce_device *uacce;
 	void *priv;
 	wait_queue_head_t wait;
-	int pasid;
 	pid_t pid;
-	struct iommu_sva *handle;
 	struct list_head list;
+	struct uacce_mm *uacce_mm;
 	struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
 	enum uacce_q_state state;
 };
@@ -121,8 +119,8 @@ struct uacce_queue {
  * @cdev: cdev of the uacce
  * @dev: dev of the uacce
  * @priv: private pointer of the uacce
- * @qs: list head of queue->list
- * @q_lock: lock for qs
+ * @mm_list: list head of uacce_mm->list
+ * @mm_lock: lock for mm_list
  */
 struct uacce_device {
 	const char *algs;
@@ -137,8 +135,26 @@ struct uacce_device {
 	struct cdev *cdev;
 	struct device dev;
 	void *priv;
-	struct list_head qs;
-	struct mutex q_lock;
+	struct list_head mm_list;
+	struct mutex mm_lock;
+};
+
+/*
+ * struct uacce_mm - keep track of queues bound to a process
+ * @list: index into uacce_device
+ * @queues: list of queues
+ * @mm: the mm struct
+ * @lock: protects the list of queues
+ * @pasid: pasid of the queue
+ * @handle: iommu_sva handle return from iommu_sva_bind_device
+ */
+struct uacce_mm {
+	struct list_head list;
+	struct list_head queues;
+	struct mm_struct *mm;
+	struct mutex lock;
+	int pasid;
+	struct iommu_sva *handle;
 };
 
 #if IS_ENABLED(CONFIG_UACCE)
Zhangfei Gao Nov. 7, 2019, 1:23 p.m. UTC | #6
On 2019/11/6 下午11:32, Jean-Philippe Brucker wrote:
> On Wed, Nov 06, 2019 at 04:17:40PM +0800, zhangfei wrote:
>>> But I still believe it would be better to create an uacce_mm structure
>>> that tracks all queues bound to this mm, and pass that to uacce_sva_exit
>>> instead of the uacce_device.
>> I am afraid this method may not work.
>> Since iommu_sva_bind_device currently only accepts the same drvdata for the
>> same dev,
>> that's the reason we cannot directly use "queue" as drvdata.
>> Creating a uacce_mm structure each time would have the same problem as the
>> queue, and fail for the same dev.
>> So we use uacce and pick up the right queue inside.
> What I had in mind is keep one uacce_mm per mm and per device, and we can
> pass that to iommu_sva_bind_device(). It requires some structure changes,
> see the attached patch.
Cool, thanks Jean
How about merging them together?
>
>>> The queue isn't bound to a task, but its address space. With clone() the
>>> address space can be shared between tasks. In addition, whoever has a
>>> queue fd also gets access to this address space. So after a fork() the
>>> child may be able to program the queue to DMA into the parent's address
>>> space, even without CLONE_VM. Users must be aware of this and I think it's
>>> important to explain it very clearly in the UAPI.
>>> [...]
>>>> +void uacce_unregister(struct uacce_device *uacce)
>>>> +{
>>>> +	if (!uacce)
>>>> +		return;
>>>> +
>>>> +	mutex_lock(&uacce->q_lock);
>>>> +	if (!list_empty(&uacce->qs)) {
>>>> +		struct uacce_queue *q;
>>>> +
>>>> +		list_for_each_entry(q, &uacce->qs, list) {
>>>> +			uacce_put_queue(q);
>>> The open file descriptor will still exist after this function returns.
>>> Can all fops be called with a stale queue?
>> To be more clear:
>> Do you mean rmmod without fops_release?
> Yes I think so. What happens when userspace starts some queues, and
> the device driver suddenly calls uacce_unregister(). We call
> cdev_device_del() later in this function, but quoting the documentation:
> "any cdevs already open will remain and their fops will still be callable
> even after this function returns." So we need to make sure that any of the
> fops is safe to run after the uacce device disappears.
We can protect against a stale queue via q->state, since q is only released
later in fops_release.
And in uacce_unregister, put_queue will set q->state = UACCE_Q_ZOMBIE.
Will add a state check in mmap too.
>
> I noticed a lock dependency inversion on uacce->q_lock: uacce_unregister()
> calls iommu_sva_unbind_device() while holding the uacce->q_lock, but
> uacce_sva_exit() takes the uacce->q_lock with the SVA lock held. In theory
> we could simply avoid calling iommu_sva_unbind_device() here since it will
> be done by fops_release(), but then disabling the SVA feature in
> uacce_unregister() won't work (because there still are bonds). The
> attached patch should fix it, but I haven't tried running uacce_register()
> yet.
Have tested, it is OK.

Thanks
Jean-Philippe Brucker Nov. 8, 2019, 7:48 a.m. UTC | #7
On Thu, Nov 07, 2019 at 09:23:50PM +0800, zhangfei wrote:
> > What I had in mind is keep one uacce_mm per mm and per device, and we can
> > pass that to iommu_sva_bind_device(). It requires some structure changes,
> > see the attached patch.
> Cool, thanks Jean
> How about merge them together.

No problem, you can squash it into this patch

Thanks,
Jean
Jonathan Cameron Nov. 11, 2019, 11:19 a.m. UTC | #8
On Tue, 5 Nov 2019 15:43:31 +0800
zhangfei <zhangfei.gao@linaro.org> wrote:

> Hi, Jonathan
> 
> Thanks for the suggestions
> 
> On 2019/11/1 上午1:13, Jonathan Cameron wrote:
> > On Tue, 29 Oct 2019 14:40:15 +0800
> > Zhangfei Gao <zhangfei.gao@linaro.org> wrote:
> >  
> >> From: Kenneth Lee <liguozhu@hisilicon.com>
> >>
> >> Uacce (Unified/User-space-access-intended Accelerator Framework) targets to
> >> provide Shared Virtual Addressing (SVA) between accelerators and processes.
> >> So accelerator can access any data structure of the main cpu.
> >> This differs from the data sharing between cpu and io device, which share
> >> data content rather than address.
> >> Since unified address, hardware and user space of process can share the
> >> same virtual address in the communication.
> >>
> >> Uacce create a chrdev for every registration, the queue is allocated to
> >> the process when the chrdev is opened. Then the process can access the
> >> hardware resource by interact with the queue file. By mmap the queue
> >> file space to user space, the process can directly put requests to the
> >> hardware without syscall to the kernel space.
> >>
> >> Signed-off-by: Kenneth Lee <liguozhu@hisilicon.com>
> >> Signed-off-by: Zaibo Xu <xuzaibo@huawei.com>
> >> Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
> >> Signed-off-by: Zhangfei Gao <zhangfei.gao@linaro.org>  
> > Great, much more compact.
> >
> > I've not gone through this in detail yet but a few initial comments inline.
> >
> > Thanks,
> >
> > Jonathan
> >  
> >> ---
> >>   Documentation/ABI/testing/sysfs-driver-uacce |  53 +++
> >>   drivers/misc/Kconfig                         |   1 +
> >>   drivers/misc/Makefile                        |   1 +
> >>   drivers/misc/uacce/Kconfig                   |  13 +
> >>   drivers/misc/uacce/Makefile                  |   2 +
> >>   drivers/misc/uacce/uacce.c                   | 574 +++++++++++++++++++++++++++
> >>   include/linux/uacce.h                        | 163 ++++++++
> >>   include/uapi/misc/uacce/uacce.h              |  38 ++
> >>   8 files changed, 845 insertions(+)
> >>   create mode 100644 Documentation/ABI/testing/sysfs-driver-uacce
> >>   create mode 100644 drivers/misc/uacce/Kconfig
> >>   create mode 100644 drivers/misc/uacce/Makefile
> >>   create mode 100644 drivers/misc/uacce/uacce.c
> >>   create mode 100644 include/linux/uacce.h
> >>   create mode 100644 include/uapi/misc/uacce/uacce.h
> >>
> >> diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
> >> new file mode 100644
> >> index 0000000..35699dc
> >> --- /dev/null
> >> +++ b/Documentation/ABI/testing/sysfs-driver-uacce
> >> @@ -0,0 +1,53 @@
> >> +What:           /sys/class/uacce/<dev_name>/id
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Id of the device.
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/api
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Api of the device, used by application to match the correct driver
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/flags
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/available_instances
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Available instances left of the device
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/algorithms
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Algorithms supported by this accelerator  
> > How are they separated?  Userspace code needs to know that.
> > (comma, tab, newline?)  
> Yes, will add "separated by new line"
> >  
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/qfrt_mmio_size  
> > qfrt is not the most obvious naming ever.  Do we care beyond it being
> > a region for this interface?  region_mmio_size maybe?  
> OK,
> >  
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Page size of mmio region queue file  
> > Size of page in this region, or number of pages in the region?  
> Change to "Page numbers of mmio region queue file"

Number of pages used by queue in mmio region?

> >  
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/qfrt_dus_size
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Page size of dus region queue file
> >> +
> >> +What:           /sys/class/uacce/<dev_name>/numa_distance
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Distance of device node to cpu node  
> > I wonder if we should be doing this in here. There are other standard
> > ways of obtaining this for the device.  Follow parent and check node_id
> > there then use the /sys/bus/node path to find out the distances.  
> Could you clarify more about this method.
> The purpose here is cpu searching the nearest device(zip) doing work.
> Does user application know which node it is running and compare distance?

Exactly.  The parent device will typically be a pci device. The parent
link will point somewhere like

/sys/bus/pci/devices/0000:00:10.0/

Under that directory is a numa_node file which will give you which node
the device is assigned to.  

Using that number (N) read

/sys/bus/node/devices/nodeN/distance

Which should be the same as what you have from this interface.
It also provides access to info on latency and bandwidth etc
if HMAT is provided - so more info to make a decision than your
new interface here provides.


> >> +
> >> +What:           /sys/class/uacce/<dev_name>/node_id
> >> +Date:           Oct 2019
> >> +KernelVersion:  5.5
> >> +Contact:        linux-accelerators@lists.ozlabs.org
> >> +Description:    Id of the numa node
> >> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> >> index c55b637..929feb0 100644
> >> --- a/drivers/misc/Kconfig
> >> +++ b/drivers/misc/Kconfig
> >> @@ -481,4 +481,5 @@ source "drivers/misc/cxl/Kconfig"
> >>   source "drivers/misc/ocxl/Kconfig"
> >>   source "drivers/misc/cardreader/Kconfig"
> >>   source "drivers/misc/habanalabs/Kconfig"
> >> +source "drivers/misc/uacce/Kconfig"
> >>   endmenu
> >> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> >> index c1860d3..9abf292 100644
> >> --- a/drivers/misc/Makefile
> >> +++ b/drivers/misc/Makefile
> >> @@ -56,4 +56,5 @@ obj-$(CONFIG_OCXL)		+= ocxl/
> >>   obj-y				+= cardreader/
> >>   obj-$(CONFIG_PVPANIC)   	+= pvpanic.o
> >>   obj-$(CONFIG_HABANA_AI)		+= habanalabs/
> >> +obj-$(CONFIG_UACCE)		+= uacce/
> >>   obj-$(CONFIG_XILINX_SDFEC)	+= xilinx_sdfec.o
> >> diff --git a/drivers/misc/uacce/Kconfig b/drivers/misc/uacce/Kconfig
> >> new file mode 100644
> >> index 0000000..5e39b60
> >> --- /dev/null
> >> +++ b/drivers/misc/uacce/Kconfig
> >> @@ -0,0 +1,13 @@
> >> +config UACCE
> >> +	tristate "Accelerator Framework for User Land"
> >> +	depends on IOMMU_API
> >> +	help
> >> +	  UACCE provides interface for the user process to access the hardware
> >> +	  without interaction with the kernel space in data path.
> >> +
> >> +	  The user-space interface is described in
> >> +	  include/uapi/misc/uacce/uacce.h
> >> +
> >> +	  See Documentation/misc-devices/uacce.rst for more details.
> >> +
> >> +	  If you don't know what to do here, say N.  
> > Pessimist :) Everyone should want uacce so don't put them off.  Having said
> > that perhaps for now it should be hidden and enabled on a driver by driver
> > basis?
> >  
> >> diff --git a/drivers/misc/uacce/Makefile b/drivers/misc/uacce/Makefile
> >> new file mode 100644
> >> index 0000000..5b4374e
> >> --- /dev/null
> >> +++ b/drivers/misc/uacce/Makefile
> >> @@ -0,0 +1,2 @@
> >> +# SPDX-License-Identifier: GPL-2.0-or-later
> >> +obj-$(CONFIG_UACCE) += uacce.o
> >> diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
> >> new file mode 100644
> >> index 0000000..2b6b038
> >> --- /dev/null
> >> +++ b/drivers/misc/uacce/uacce.c
> >> @@ -0,0 +1,574 @@
> >> +// SPDX-License-Identifier: GPL-2.0-or-later
> >> +#include <linux/compat.h>
> >> +#include <linux/dma-iommu.h>
> >> +#include <linux/module.h>
> >> +#include <linux/poll.h>
> >> +#include <linux/uacce.h>
> >> +
> >> +static struct class *uacce_class;
> >> +static dev_t uacce_devt;
> >> +static DEFINE_MUTEX(uacce_mutex);
> >> +static DEFINE_XARRAY_ALLOC(uacce_xa);
> >> +
> >> +static int uacce_start_queue(struct uacce_queue *q)
> >> +{
> >> +	int ret = -EINVAL;
> >> +
> >> +	mutex_lock(&uacce_mutex);
> >> +
> >> +	if (q->state != UACCE_Q_INIT)
> >> +		goto out_with_lock;
> >> +
> >> +	if (q->uacce->ops->start_queue) {
> >> +		ret = q->uacce->ops->start_queue(q);
> >> +		if (ret < 0)
> >> +			goto out_with_lock;
> >> +	}
> >> +
> >> +	q->state = UACCE_Q_STARTED;  
> > out_with_lock:  
> >> +	mutex_unlock(&uacce_mutex);
> >> +  
> > return ret;
> > Though need to handle ret a bit differently above...  
> OK
> >
> > +static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> > +{
> > +	struct uacce_queue *q = filep->private_data;
> > +	struct uacce_device *uacce = q->uacce;
> > +	struct uacce_qfile_region *qfr;
> > +	enum uacce_qfrt type = 0;
> > +	unsigned int flags = 0;
> > +	int ret;
> > +
> > +	if (vma->vm_pgoff < UACCE_QFRT_MAX)
> > +		type = vma->vm_pgoff;
> > +
> > +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
> > +	vma->vm_ops = &uacce_vm_ops;
> > +	vma->vm_private_data = q;
> > +
> > +	mutex_lock(&uacce_mutex);
> > +
> > +	if (q->qfrs[type]) {
> > +		ret = -EEXIST;
> > +		goto out_with_lock;
> > +	}
> > +
> > +	switch (type) {
> > +	case UACCE_QFRT_MMIO:
> > +		flags = UACCE_QFRF_SELFMT;
> > +		break;
> > +
> > +	case UACCE_QFRT_DUS:
> > +		if (uacce->flags & UACCE_DEV_SVA) {
> > +			flags = UACCE_QFRF_SELFMT;
> > +			break;
> > +		}
> > +		break;
> > +
> > +	default:
> > +		WARN_ON(&uacce->dev);
> > +		break;
> > +	}
> > +
> > +	qfr = uacce_create_region(q, vma, type, flags);
> > +	if (IS_ERR(qfr)) {
> > +		ret = PTR_ERR(qfr);
> > +		goto out_with_lock;
> > +	}
> > +	q->qfrs[type] = qfr;
> > +
> > Could put
> > out_with_lock:
> > here and return ret instead of 0.
> > You'll need to set ret to default to 0 in that
> > case though.  
> OK
> >
> > +static ssize_t algorithms_show(struct device *dev,
> > +			       struct device_attribute *attr, char *buf)
> > +{
> > +	struct uacce_device *uacce = to_uacce_device(dev);
> > +
> > +	return sprintf(buf, "%s", uacce->algs);
> > Any risk algs won't have the \n?
> > I'd kind of expect it to be a null terminated array to allow the core
> > to format it however it wants to.  
> Yes, adding \n is better.

This may then add a bonus new line if you have multiple lines already in
the string.  Probably doesn't do much harm, but it's not ideal.

> >  
> >> +}
> >> +
> >> +static ssize_t qfrt_mmio_size_show(struct device *dev,
> >> +				   struct device_attribute *attr, char *buf)
> >> +{
> >> +	struct uacce_device *uacce = to_uacce_device(dev);
> >> +
> >> +	return sprintf(buf, "%lu\n",
> >> +		       uacce->qf_pg_size[UACCE_QFRT_MMIO] << PAGE_SHIFT);
> >> +}
> >> +
> >> +static ssize_t qfrt_dus_size_show(struct device *dev,
> >> +				  struct device_attribute *attr, char *buf)
> >> +{
> >> +	struct uacce_device *uacce = to_uacce_device(dev);
> >> +
> >> +	return sprintf(buf, "%lu\n",
> >> +		       uacce->qf_pg_size[UACCE_QFRT_DUS] << PAGE_SHIFT);
> >> +}
> >> +
> >> +static DEVICE_ATTR_RO(id);
> >> +static DEVICE_ATTR_RO(api);
> >> +static DEVICE_ATTR_RO(numa_distance);
> >> +static DEVICE_ATTR_RO(node_id);
> >> +static DEVICE_ATTR_RO(flags);
> >> +static DEVICE_ATTR_RO(available_instances);
> >> +static DEVICE_ATTR_RO(algorithms);
> >> +static DEVICE_ATTR_RO(qfrt_mmio_size);
> >> +static DEVICE_ATTR_RO(qfrt_dus_size);
> >> +
> >> +static struct attribute *uacce_dev_attrs[] = {
> >> +	&dev_attr_id.attr,
> >> +	&dev_attr_api.attr,
> >> +	&dev_attr_node_id.attr,
> >> +	&dev_attr_numa_distance.attr,
> >> +	&dev_attr_flags.attr,
> >> +	&dev_attr_available_instances.attr,
> >> +	&dev_attr_algorithms.attr,
> >> +	&dev_attr_qfrt_mmio_size.attr,
> >> +	&dev_attr_qfrt_dus_size.attr,
> >> +	NULL,
> >> +};
> >> +ATTRIBUTE_GROUPS(uacce_dev);
> >> +
> >> +static void uacce_release(struct device *dev)
> >> +{
> >> +	struct uacce_device *uacce = to_uacce_device(dev);
> >> +
> >> +	kfree(uacce);
> >> +}
> >> +
> >> +/**
> >> + * uacce_register - register an accelerator  
> > This isn't quite correct kernel-doc.  Please run the
> > generation script over it and fix any warnings.
> >
> > 	uacce_register() - register an accelerator  
> Sure, will add (), though no warning reported from ./scripts/kernel-doc

I checked that one for another review yesterday.  Seems the kernel
suggested kernel-doc style isn't actually enforced and the brackets
are optional for functions.   It assumes anything it hasn't identified
as something else must be a function hence this is the one case where
careful matching doesn't apply (unlike struct, enum etc).


> >  
> >> + * @parent: pointer of uacce parent device
> >> + * @interface: pointer of uacce_interface for register
> >> + */
> >> +struct uacce_device *uacce_register(struct device *parent,
> >> +				    struct uacce_interface *interface)
> >> +{
> >> +	unsigned int flags = interface->flags;
> >> +	struct uacce_device *uacce;
> >> +	int ret;
> >> +
> >> +	uacce = kzalloc(sizeof(struct uacce_device), GFP_KERNEL);
> >> +	if (!uacce)
> >> +		return ERR_PTR(-ENOMEM);
> >> +
> >> +	if (flags & UACCE_DEV_SVA) {
> >> +		ret = iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA);
> >> +		if (ret)
> >> +			flags &= ~UACCE_DEV_SVA;
> >> +	}
> >> +
> >> +	uacce->pdev = parent;
> >> +	uacce->flags = flags;
> >> +	uacce->ops = interface->ops;
> >> +
> >> +	ret = xa_alloc(&uacce_xa, &uacce->dev_id, uacce, xa_limit_32b,
> >> +		       GFP_KERNEL);
> >> +	if (ret < 0)
> >> +		goto err_with_uacce;
> >> +
> >> +	uacce->cdev = cdev_alloc();  
> > If we can embed this (see below) then use cdev_init instead.
> >  
> >> +	if (!uacce->cdev) {
> >> +		ret = -ENOMEM;
> >> +		goto err_with_xa;
> >> +	}
> >> +
> >> +	INIT_LIST_HEAD(&uacce->qs);
> >> +	mutex_init(&uacce->q_lock);
> >> +	uacce->cdev->ops = &uacce_fops;
> >> +	uacce->cdev->owner = THIS_MODULE;
> >> +	device_initialize(&uacce->dev);
> >> +	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
> >> +	uacce->dev.class = uacce_class;
> >> +	uacce->dev.groups = uacce_dev_groups;
> >> +	uacce->dev.parent = uacce->pdev;
> >> +	uacce->dev.release = uacce_release;
> >> +	dev_set_name(&uacce->dev, "%s-%d", interface->name, uacce->dev_id);
> >> +	ret = cdev_device_add(uacce->cdev, &uacce->dev);
> >> +	if (ret)
> >> +		goto err_with_xa;
> >> +
> >> +	return uacce;
> >> +
> >> +err_with_xa:
> >> +	if (uacce->cdev)
> >> +		cdev_del(uacce->cdev);  
> > Why not use a separate label to handle the above rather than checking if
> > it's set?  
> ok,
> >  
> >> +	xa_erase(&uacce_xa, uacce->dev_id);
> >> +err_with_uacce:
> >> +	if (flags & UACCE_DEV_SVA)
> >> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
> >> +	kfree(uacce);
> >> +	return ERR_PTR(ret);
> >> +}
> >> +EXPORT_SYMBOL_GPL(uacce_register);
> >> +
> >> +/**
> >> + * uacce_unregister - unregisters an accelerator
> >> + * @uacce: the accelerator to unregister
> >> + */
> >> +void uacce_unregister(struct uacce_device *uacce)
> >> +{
> >> +	if (!uacce)
> >> +		return;
> >> +  
> > I'd like to see a comment here on why we are doing things not unwinding
> > actions from uacce_register.  
> OK will add comments.
> Here is "ensure no open queue remains"
> >> +	mutex_lock(&uacce->q_lock);
> >> +	if (!list_empty(&uacce->qs)) {
> >> +		struct uacce_queue *q;
> >> +
> >> +		list_for_each_entry(q, &uacce->qs, list) {
> >> +			uacce_put_queue(q);
> >> +			if (uacce->flags & UACCE_DEV_SVA)
> >> +				iommu_sva_unbind_device(q->handle);
> >> +		}
> >> +	}
> >> +	mutex_unlock(&uacce->q_lock);
> >> +  
> > For these next parts which are the unwind of uacce_register, why are they not
> > in the reverse order of what is happening in there (where possible given
> > device lifespan). That is why do we not disable the iommu feature much later?  
> First close all queues, then disable sva feature.
> >  
> >> +	if (uacce->flags & UACCE_DEV_SVA)
> >> +		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
> >> +
> >> +	cdev_device_del(uacce->cdev, &uacce->dev);
> >> +	xa_erase(&uacce_xa, uacce->dev_id);
> >> +	put_device(&uacce->dev);
> >> +}
> >> +EXPORT_SYMBOL_GPL(uacce_unregister);
> >> +
> >> +static int __init uacce_init(void)
> >> +{
> >> +	int ret;
> >> +
> >> +	uacce_class = class_create(THIS_MODULE, UACCE_NAME);
> >> +	if (IS_ERR(uacce_class))
> >> +		return PTR_ERR(uacce_class);
> >> +
> >> +	ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
> >> +	if (ret) {
> >> +		class_destroy(uacce_class);
> >> +		return ret;  
> > drop the return ret out of these brackets. i.e.
> >
> > if (ret)
> > 	class_destroy(uacce_class)
> >
> > return ret;  
> sure, thanks
> >  
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static __exit void uacce_exit(void)
> >> +{
> >> +	unregister_chrdev_region(uacce_devt, MINORMASK);
> >> +	class_destroy(uacce_class);
> >> +}
> >> +
> >> +subsys_initcall(uacce_init);
> >> +module_exit(uacce_exit);
> >> +
> >> +MODULE_LICENSE("GPL");
> >> +MODULE_AUTHOR("Hisilicon Tech. Co., Ltd.");
> >> +MODULE_DESCRIPTION("Accelerator interface for Userland applications");
> >> diff --git a/include/linux/uacce.h b/include/linux/uacce.h
> >> new file mode 100644
> >> index 0000000..04c8643
> >> --- /dev/null
> >> +++ b/include/linux/uacce.h
> >> @@ -0,0 +1,163 @@
> >> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> >> +#ifndef _LINUX_UACCE_H
> >> +#define _LINUX_UACCE_H
> >> +
> >> +#include <linux/cdev.h>
> >> +#include <uapi/misc/uacce/uacce.h>
> >> +
> >> +#define UACCE_NAME		"uacce"
> >> +#define UACCE_QFRT_MAX		16  
> > What does QFRT stand for?  
> change to UACCE_MAX_REGION

Much better

> >> +#define UACCE_MAX_NAME_SIZE	64
> >> +
> >> +struct uacce_queue;
> >> +struct uacce_device;
> >> +
> >> +/**
> >> + * enum uacce_qfr_flag: queue file flag:
> >> + * @UACCE_QFRF_SELFMT: self maintained qfr
> >> + */
> >> +enum uacce_qfr_flag {
> >> +	UACCE_QFRF_SELFMT = BIT(0),
> >> +};  
> > Same issue with enums for flags.  Doesn't make much sense to me.
> > Only one value can be taken which doesn't make it a flag.
> >  
> >> +
> >> +/**
> >> + * struct uacce_qfile_region - structure of queue file region
> >> + * @type: type of the qfr
> >> + * @flags: flags of qfr
> >> + * @prot: qfr protection flag
> >> + */
> >> +struct uacce_qfile_region {
> >> +	enum uacce_qfrt type;
> >> +	enum uacce_qfr_flag flags;
> >> +	u32 prot;
> >> +};
> >> +
> >> +/**
> >> + * struct uacce_ops - uacce device operations
> >> + * @get_available_instances:  get available instances left of the device
> >> + * @get_queue: get a queue from the device
> >> + * @put_queue: free a queue to the device
> >> + * @start_queue: make the queue start work after get_queue
> >> + * @stop_queue: make the queue stop work before put_queue
> >> + * @is_q_updated: check whether the task is finished
> >> + * @mask_notify: mask the task irq of queue
> >> + * @mmap: mmap addresses of queue to user space
> >> + * @reset: reset the uacce device
> >> + * @reset_queue: reset the queue
> >> + * @ioctl: ioctl for user space users of the queue
> >> + */
> >> +struct uacce_ops {
> >> +	int (*get_available_instances)(struct uacce_device *uacce);
> >> +	int (*get_queue)(struct uacce_device *uacce, unsigned long arg,
> >> +			 struct uacce_queue *q);
> >> +	void (*put_queue)(struct uacce_queue *q);
> >> +	int (*start_queue)(struct uacce_queue *q);
> >> +	void (*stop_queue)(struct uacce_queue *q);
> >> +	int (*is_q_updated)(struct uacce_queue *q);
> >> +	void (*mask_notify)(struct uacce_queue *q, int event_mask);
> >> +	int (*mmap)(struct uacce_queue *q, struct vm_area_struct *vma,
> >> +		    struct uacce_qfile_region *qfr);
> >> +	int (*reset)(struct uacce_device *uacce);
> >> +	int (*reset_queue)(struct uacce_queue *q);  
> > Some of these aren't used by the only existing driver.  Introduce them only
> > in the series that uses them.  
> OK
> >  
> >> +	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
> >> +		      unsigned long arg);
> >> +};
> >> +
> >> +/**
> >> + * struct uacce_interface  
> > I think this needs a description for kernel doc (even if it's obvious!)
> > Could be wrong though.  
> OK
> >  
> >> + * @name: the uacce device name.  Will show up in sysfs
> >> + * @flags: uacce device attributes
> >> + * @ops: pointer to the struct uacce_ops
> >> + *
> >> + * This structure is used for the uacce_register()
> >> + */
> >> +struct uacce_interface {
> >> +	char name[UACCE_MAX_NAME_SIZE];
> >> +	enum uacce_dev_flag flags;
> >> +	struct uacce_ops *ops;
> >> +};
> >> +
> >> +enum uacce_q_state {
> >> +	UACCE_Q_INIT,
> >> +	UACCE_Q_STARTED,
> >> +	UACCE_Q_ZOMBIE,
> >> +};
> >> +
> >> +/**
> >> + * struct uacce_queue
> >> + * @uacce: pointer to uacce
> >> + * @priv: private pointer
> >> + * @wait: wait queue head
> >> + * @pasid: pasid of the queue
> >> + * @pid: pid of the process using the queue
> >> + * @handle: iommu_sva handle return from iommu_sva_bind_device
> >> + * @list: queue list
> >> + * @qfrs: pointer of qfr regions
> >> + * @state: queue state machine
> >> + */
> >> +struct uacce_queue {
> >> +	struct uacce_device *uacce;
> >> +	void *priv;
> >> +	wait_queue_head_t wait;
> >> +	int pasid;
> >> +	pid_t pid;
> >> +	struct iommu_sva *handle;
> >> +	struct list_head list;
> >> +	struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
> >> +	enum uacce_q_state state;
> >> +};
> >> +
> >> +/**
> >> + * struct uacce_device
> >> + * @algs: supported algorithms
> >> + * @api_ver: api version
> >> + * @qf_pg_size: page size of the queue file regions
> >> + * @ops: pointer to the struct uacce_ops
> >> + * @pdev: pointer to the parent device
> >> + * @is_vf: whether virtual function
> >> + * @flags: uacce attributes
> >> + * @dev_id: id of the uacce device
> >> + * @prot: uacce protection flag
> >> + * @cdev: cdev of the uacce
> >> + * @dev: dev of the uacce
> >> + * @priv: private pointer of the uacce
> >> + * @qs: list head of queue->list
> >> + * @q_lock: lock for qs
> >> + */
> >> +struct uacce_device {
> >> +	const char *algs;
> >> +	const char *api_ver;
> >> +	unsigned long qf_pg_size[UACCE_QFRT_MAX];
> >> +	struct uacce_ops *ops;  
> > Can we make this ops structure a pointer to a constant struct?
> > I'm guessing it'll be fixed for a given driver.  
> OK
> >  
> >> +	struct device *pdev;  
> > Perhaps just call it parent; pdev is easily confused with
> > PCI devices.  
> OK
> >  
> >> +	bool is_vf;
> >> +	u32 flags;
> >> +	u32 dev_id;
> >> +	u32 prot;
> >> +	struct cdev *cdev;  
> > Can we embed the cdev structure rather than use a pointer
> > and separate allocation?  
> NO, we can't.
> We originally embedded the cdev structure, but Greg reminded us that these two
> structures have different lifetimes.
> https://lkml.org/lkml/2019/8/28/771

Ok. Fair enough.


> >> +	struct device dev;
> >> +	void *priv;
> >> +	struct list_head qs;
> >> +	struct mutex q_lock;
> >> +};
> >> +
> >> +#if IS_ENABLED(CONFIG_UACCE)
> >> +
> >> +struct uacce_device *uacce_register(struct device *parent,
> >> +				    struct uacce_interface *interface);
> >> +void uacce_unregister(struct uacce_device *uacce);
> >> +
> >> +#else /* CONFIG_UACCE */
> >> +
> >> +static inline
> >> +struct uacce_device *uacce_register(struct device *parent,
> >> +				    struct uacce_interface *interface)
> >> +{
> >> +	return ERR_PTR(-ENODEV);
> >> +}
> >> +
> >> +static inline void uacce_unregister(struct uacce_device *uacce) {}
> >> +
> >> +#endif /* CONFIG_UACCE */
> >> +
> >> +#endif /* _LINUX_UACCE_H */
> >> diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
> >> new file mode 100644
> >> index 0000000..a4f9378
> >> --- /dev/null
> >> +++ b/include/uapi/misc/uacce/uacce.h
> >> @@ -0,0 +1,38 @@
> >> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> >> +#ifndef _UAPIUUACCE_H
> >> +#define _UAPIUUACCE_H
> >> +
> >> +#include <linux/types.h>
> >> +#include <linux/ioctl.h>
> >> +
> >> +/* UACCE_CMD_START_Q: Start the queue */
> >> +#define UACCE_CMD_START_Q	_IO('W', 0)
> >> +
> >> +/**
> >> + * UACCE_CMD_PUT_Q:
> >> + * User actively stop queue and free queue resource immediately
> >> + * Optimization method since close fd may delay
> >> + */
> >> +#define UACCE_CMD_PUT_Q		_IO('W', 1)
> >> +
> >> +/**
> >> + * enum uacce_dev_flag: Device flags:
> >> + * @UACCE_DEV_SVA: Shared Virtual Addresses
> >> + *		   Support PASID
> >> + *		   Support device page faults (PCI PRI or SMMU Stall)
> >> + */
> >> +enum uacce_dev_flag {
> >> +	UACCE_DEV_SVA = BIT(0),  
> > As mentioned in docs review, this doesn't look like an enum to me.
> > Just use #define for the bit and a suitable sized integer for any
> > calls using it.  
> OK, but there are still more features in the future patch.
That's not the issue.  An enum should (more or less) use consecutive values:

A = 0,
B = 1,
C = 2, etc
and an instance of it should only take one of them.

Once you are using the values as bits in a bitmap, the enum type becomes
irrelevant, as you can't really use it to enforce anything, so you should
just use #defines:

#define UACCE_DEV_SVA BIT(0)
#define UACCE_DEV_SOMETHING BIT(1) etc

> 
> Thanks
>
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
new file mode 100644
index 0000000..35699dc
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-uacce
@@ -0,0 +1,53 @@ 
+What:           /sys/class/uacce/<dev_name>/id
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Id of the device.
+
+What:           /sys/class/uacce/<dev_name>/api
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Api of the device, used by application to match the correct driver
+
+What:           /sys/class/uacce/<dev_name>/flags
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
+
+What:           /sys/class/uacce/<dev_name>/available_instances
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Available instances left of the device
+
+What:           /sys/class/uacce/<dev_name>/algorithms
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Algorithms supported by this accelerator
+
+What:           /sys/class/uacce/<dev_name>/qfrt_mmio_size
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Page size of mmio region queue file
+
+What:           /sys/class/uacce/<dev_name>/qfrt_dus_size
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Page size of dus region queue file
+
+What:           /sys/class/uacce/<dev_name>/numa_distance
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Distance of device node to cpu node
+
+What:           /sys/class/uacce/<dev_name>/node_id
+Date:           Oct 2019
+KernelVersion:  5.5
+Contact:        linux-accelerators@lists.ozlabs.org
+Description:    Id of the numa node
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index c55b637..929feb0 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -481,4 +481,5 @@  source "drivers/misc/cxl/Kconfig"
 source "drivers/misc/ocxl/Kconfig"
 source "drivers/misc/cardreader/Kconfig"
 source "drivers/misc/habanalabs/Kconfig"
+source "drivers/misc/uacce/Kconfig"
 endmenu
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index c1860d3..9abf292 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -56,4 +56,5 @@  obj-$(CONFIG_OCXL)		+= ocxl/
 obj-y				+= cardreader/
 obj-$(CONFIG_PVPANIC)   	+= pvpanic.o
 obj-$(CONFIG_HABANA_AI)		+= habanalabs/
+obj-$(CONFIG_UACCE)		+= uacce/
 obj-$(CONFIG_XILINX_SDFEC)	+= xilinx_sdfec.o
diff --git a/drivers/misc/uacce/Kconfig b/drivers/misc/uacce/Kconfig
new file mode 100644
index 0000000..5e39b60
--- /dev/null
+++ b/drivers/misc/uacce/Kconfig
@@ -0,0 +1,13 @@ 
+config UACCE
+	tristate "Accelerator Framework for User Land"
+	depends on IOMMU_API
+	help
+	  UACCE provides interface for the user process to access the hardware
+	  without interaction with the kernel space in data path.
+
+	  The user-space interface is described in
+	  include/uapi/misc/uacce/uacce.h
+
+	  See Documentation/misc-devices/uacce.rst for more details.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/misc/uacce/Makefile b/drivers/misc/uacce/Makefile
new file mode 100644
index 0000000..5b4374e
--- /dev/null
+++ b/drivers/misc/uacce/Makefile
@@ -0,0 +1,2 @@ 
+# SPDX-License-Identifier: GPL-2.0-or-later
+obj-$(CONFIG_UACCE) += uacce.o
diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c
new file mode 100644
index 0000000..2b6b038
--- /dev/null
+++ b/drivers/misc/uacce/uacce.c
@@ -0,0 +1,574 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/compat.h>
+#include <linux/dma-iommu.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uacce.h>
+
+static struct class *uacce_class;
+static dev_t uacce_devt;
+static DEFINE_MUTEX(uacce_mutex);
+static DEFINE_XARRAY_ALLOC(uacce_xa);
+
+static int uacce_start_queue(struct uacce_queue *q)
+{
+	int ret = -EINVAL;
+
+	mutex_lock(&uacce_mutex);
+
+	if (q->state != UACCE_Q_INIT)
+		goto out_with_lock;
+
+	if (q->uacce->ops->start_queue) {
+		ret = q->uacce->ops->start_queue(q);
+		if (ret < 0)
+			goto out_with_lock;
+	}
+
+	q->state = UACCE_Q_STARTED;
+	mutex_unlock(&uacce_mutex);
+
+	return 0;
+
+out_with_lock:
+	mutex_unlock(&uacce_mutex);
+	return ret;
+}
+
+static int uacce_put_queue(struct uacce_queue *q)
+{
+	struct uacce_device *uacce = q->uacce;
+
+	mutex_lock(&uacce_mutex);
+
+	if (q->state == UACCE_Q_ZOMBIE)
+		goto out;
+
+	if ((q->state == UACCE_Q_STARTED) && uacce->ops->stop_queue)
+		uacce->ops->stop_queue(q);
+
+	if ((q->state == UACCE_Q_INIT || q->state == UACCE_Q_STARTED) &&
+	     uacce->ops->put_queue)
+		uacce->ops->put_queue(q);
+
+	q->state = UACCE_Q_ZOMBIE;
+out:
+	mutex_unlock(&uacce_mutex);
+
+	return 0;
+}
+
+static long uacce_fops_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct uacce_queue *q = filep->private_data;
+	struct uacce_device *uacce = q->uacce;
+
+	switch (cmd) {
+	case UACCE_CMD_START_Q:
+		return uacce_start_queue(q);
+
+	case UACCE_CMD_PUT_Q:
+		return uacce_put_queue(q);
+
+	default:
+		if (!uacce->ops->ioctl)
+			return -EINVAL;
+
+		return uacce->ops->ioctl(q, cmd, arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long uacce_fops_compat_ioctl(struct file *filep,
+				   unsigned int cmd, unsigned long arg)
+{
+	arg = (unsigned long)compat_ptr(arg);
+
+	return uacce_fops_unl_ioctl(filep, cmd, arg);
+}
+#endif
+
+static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle,
+			  void *data)
+{
+	struct uacce_device *uacce = data;
+	struct uacce_queue *q;
+
+	mutex_lock(&uacce->q_lock);
+	list_for_each_entry(q, &uacce->qs, list) {
+		if (q->pid == task_pid_nr(current))
+			uacce_put_queue(q);
+	}
+	mutex_unlock(&uacce->q_lock);
+
+	return 0;
+}
+
+static struct iommu_sva_ops uacce_sva_ops = {
+	.mm_exit = uacce_sva_exit,
+};
+
+static int uacce_fops_open(struct inode *inode, struct file *filep)
+{
+	struct iommu_sva *handle = NULL;
+	struct uacce_device *uacce;
+	struct uacce_queue *q;
+	int ret = 0;
+	int pasid = 0;
+
+	uacce = xa_load(&uacce_xa, iminor(inode));
+	if (!uacce)
+		return -ENODEV;
+
+	if (!try_module_get(uacce->pdev->driver->owner))
+		return -ENODEV;
+
+	q = kzalloc(sizeof(struct uacce_queue), GFP_KERNEL);
+	if (!q) {
+		ret = -ENOMEM;
+		goto out_with_module;
+	}
+
+	if (uacce->flags & UACCE_DEV_SVA) {
+		handle = iommu_sva_bind_device(uacce->pdev, current->mm, uacce);
+		if (IS_ERR(handle))
+			goto out_with_mem;
+
+		ret = iommu_sva_set_ops(handle, &uacce_sva_ops);
+		if (ret)
+			goto out_unbind;
+
+		pasid = iommu_sva_get_pasid(handle);
+		if (pasid == IOMMU_PASID_INVALID)
+			goto out_unbind;
+	}
+
+	if (uacce->ops->get_queue) {
+		ret = uacce->ops->get_queue(uacce, pasid, q);
+		if (ret < 0)
+			goto out_unbind;
+	}
+
+	q->pid = task_pid_nr(current);
+	q->pasid = pasid;
+	q->handle = handle;
+	q->uacce = uacce;
+	memset(q->qfrs, 0, sizeof(q->qfrs));
+	init_waitqueue_head(&q->wait);
+	filep->private_data = q;
+	q->state = UACCE_Q_INIT;
+
+	mutex_lock(&uacce->q_lock);
+	list_add(&q->list, &uacce->qs);
+	mutex_unlock(&uacce->q_lock);
+
+	return 0;
+
+out_unbind:
+	if (uacce->flags & UACCE_DEV_SVA)
+		iommu_sva_unbind_device(handle);
+out_with_mem:
+	kfree(q);
+out_with_module:
+	module_put(uacce->pdev->driver->owner);
+	return ret;
+}
+
+static int uacce_fops_release(struct inode *inode, struct file *filep)
+{
+	struct uacce_queue *q = filep->private_data;
+	struct uacce_device *uacce = q->uacce;
+
+	uacce_put_queue(q);
+
+	if (uacce->flags & UACCE_DEV_SVA)
+		iommu_sva_unbind_device(q->handle);
+
+	mutex_lock(&uacce->q_lock);
+	list_del(&q->list);
+	mutex_unlock(&uacce->q_lock);
+	kfree(q);
+	module_put(uacce->pdev->driver->owner);
+
+	return 0;
+}
+
+static void uacce_vma_close(struct vm_area_struct *vma)
+{
+	struct uacce_queue *q = vma->vm_private_data;
+	enum uacce_qfrt type = 0;
+
+	if (vma->vm_pgoff < UACCE_QFRT_MAX)
+		type = vma->vm_pgoff;
+
+	kfree(q->qfrs[type]);
+}
+
+static const struct vm_operations_struct uacce_vm_ops = {
+	.close = uacce_vma_close,
+};
+
+static struct uacce_qfile_region *
+uacce_create_region(struct uacce_queue *q, struct vm_area_struct *vma,
+		    enum uacce_qfrt type, unsigned int flags)
+{
+	struct uacce_device *uacce = q->uacce;
+	struct uacce_qfile_region *qfr;
+	int ret = -ENOMEM;
+
+	qfr = kzalloc(sizeof(*qfr), GFP_KERNEL);
+	if (!qfr)
+		return ERR_PTR(-ENOMEM);
+
+	qfr->type = type;
+	qfr->flags = flags;
+
+	if (vma->vm_flags & VM_READ)
+		qfr->prot |= IOMMU_READ;
+
+	if (vma->vm_flags & VM_WRITE)
+		qfr->prot |= IOMMU_WRITE;
+
+	if (flags & UACCE_QFRF_SELFMT) {
+		if (!uacce->ops->mmap) {
+			ret = -EINVAL;
+			goto err_with_qfr;
+		}
+
+		ret = uacce->ops->mmap(q, vma, qfr);
+		if (ret)
+			goto err_with_qfr;
+		return qfr;
+	}
+
+	return qfr;
+
+err_with_qfr:
+	kfree(qfr);
+	return ERR_PTR(ret);
+}
+
+static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	struct uacce_queue *q = filep->private_data;
+	struct uacce_device *uacce = q->uacce;
+	struct uacce_qfile_region *qfr;
+	enum uacce_qfrt type = 0;
+	unsigned int flags = 0;
+	int ret;
+
+	if (vma->vm_pgoff < UACCE_QFRT_MAX)
+		type = vma->vm_pgoff;
+
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK;
+	vma->vm_ops = &uacce_vm_ops;
+	vma->vm_private_data = q;
+
+	mutex_lock(&uacce_mutex);
+
+	if (q->qfrs[type]) {
+		ret = -EEXIST;
+		goto out_with_lock;
+	}
+
+	switch (type) {
+	case UACCE_QFRT_MMIO:
+		flags = UACCE_QFRF_SELFMT;
+		break;
+
+	case UACCE_QFRT_DUS:
+		if (uacce->flags & UACCE_DEV_SVA) {
+			flags = UACCE_QFRF_SELFMT;
+			break;
+		}
+		break;
+
+	default:
+		WARN_ON(&uacce->dev);
+		break;
+	}
+
+	qfr = uacce_create_region(q, vma, type, flags);
+	if (IS_ERR(qfr)) {
+		ret = PTR_ERR(qfr);
+		goto out_with_lock;
+	}
+	q->qfrs[type] = qfr;
+
+	mutex_unlock(&uacce_mutex);
+
+	return 0;
+
+out_with_lock:
+	mutex_unlock(&uacce_mutex);
+	return ret;
+}
+
+static __poll_t uacce_fops_poll(struct file *file, poll_table *wait)
+{
+	struct uacce_queue *q = file->private_data;
+	struct uacce_device *uacce = q->uacce;
+
+	poll_wait(file, &q->wait, wait);
+	if (uacce->ops->is_q_updated && uacce->ops->is_q_updated(q))
+		return EPOLLIN | EPOLLRDNORM;
+
+	return 0;
+}
+
+static const struct file_operations uacce_fops = {
+	.owner		= THIS_MODULE,
+	.open		= uacce_fops_open,
+	.release	= uacce_fops_release,
+	.unlocked_ioctl	= uacce_fops_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= uacce_fops_compat_ioctl,
+#endif
+	.mmap		= uacce_fops_mmap,
+	.poll		= uacce_fops_poll,
+};
+
+#define to_uacce_device(dev) container_of(dev, struct uacce_device, dev)
+
+static ssize_t id_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%d\n", uacce->dev_id);
+}
+
+static ssize_t api_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%s\n", uacce->api_ver);
+}
+
+static ssize_t numa_distance_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	int distance;
+
+	distance = node_distance(smp_processor_id(), uacce->pdev->numa_node);
+
+	return sprintf(buf, "%d\n", abs(distance));
+}
+
+static ssize_t node_id_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	int node_id;
+
+	node_id = dev_to_node(uacce->pdev);
+
+	return sprintf(buf, "%d\n", node_id);
+}
+
+static ssize_t flags_show(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%u\n", uacce->flags);
+}
+
+static ssize_t available_instances_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+	int val = 0;
+
+	if (uacce->ops->get_available_instances)
+		val = uacce->ops->get_available_instances(uacce);
+
+	return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t algorithms_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%s", uacce->algs);
+}
+
+static ssize_t qfrt_mmio_size_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%lu\n",
+		       uacce->qf_pg_size[UACCE_QFRT_MMIO] << PAGE_SHIFT);
+}
+
+static ssize_t qfrt_dus_size_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	return sprintf(buf, "%lu\n",
+		       uacce->qf_pg_size[UACCE_QFRT_DUS] << PAGE_SHIFT);
+}
+
+static DEVICE_ATTR_RO(id);
+static DEVICE_ATTR_RO(api);
+static DEVICE_ATTR_RO(numa_distance);
+static DEVICE_ATTR_RO(node_id);
+static DEVICE_ATTR_RO(flags);
+static DEVICE_ATTR_RO(available_instances);
+static DEVICE_ATTR_RO(algorithms);
+static DEVICE_ATTR_RO(qfrt_mmio_size);
+static DEVICE_ATTR_RO(qfrt_dus_size);
+
+static struct attribute *uacce_dev_attrs[] = {
+	&dev_attr_id.attr,
+	&dev_attr_api.attr,
+	&dev_attr_node_id.attr,
+	&dev_attr_numa_distance.attr,
+	&dev_attr_flags.attr,
+	&dev_attr_available_instances.attr,
+	&dev_attr_algorithms.attr,
+	&dev_attr_qfrt_mmio_size.attr,
+	&dev_attr_qfrt_dus_size.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(uacce_dev);
+
+static void uacce_release(struct device *dev)
+{
+	struct uacce_device *uacce = to_uacce_device(dev);
+
+	kfree(uacce);
+}
+
+/**
+ * uacce_register - register an accelerator
+ * @parent: pointer of uacce parent device
+ * @interface: pointer of uacce_interface for register
+ */
+struct uacce_device *uacce_register(struct device *parent,
+				    struct uacce_interface *interface)
+{
+	unsigned int flags = interface->flags;
+	struct uacce_device *uacce;
+	int ret;
+
+	uacce = kzalloc(sizeof(struct uacce_device), GFP_KERNEL);
+	if (!uacce)
+		return ERR_PTR(-ENOMEM);
+
+	if (flags & UACCE_DEV_SVA) {
+		ret = iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA);
+		if (ret)
+			flags &= ~UACCE_DEV_SVA;
+	}
+
+	uacce->pdev = parent;
+	uacce->flags = flags;
+	uacce->ops = interface->ops;
+
+	ret = xa_alloc(&uacce_xa, &uacce->dev_id, uacce, xa_limit_32b,
+		       GFP_KERNEL);
+	if (ret < 0)
+		goto err_with_uacce;
+
+	uacce->cdev = cdev_alloc();
+	if (!uacce->cdev) {
+		ret = -ENOMEM;
+		goto err_with_xa;
+	}
+
+	INIT_LIST_HEAD(&uacce->qs);
+	mutex_init(&uacce->q_lock);
+	uacce->cdev->ops = &uacce_fops;
+	uacce->cdev->owner = THIS_MODULE;
+	device_initialize(&uacce->dev);
+	uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id);
+	uacce->dev.class = uacce_class;
+	uacce->dev.groups = uacce_dev_groups;
+	uacce->dev.parent = uacce->pdev;
+	uacce->dev.release = uacce_release;
+	dev_set_name(&uacce->dev, "%s-%d", interface->name, uacce->dev_id);
+	ret = cdev_device_add(uacce->cdev, &uacce->dev);
+	if (ret)
+		goto err_with_xa;
+
+	return uacce;
+
+err_with_xa:
+	if (uacce->cdev)
+		cdev_del(uacce->cdev);
+	xa_erase(&uacce_xa, uacce->dev_id);
+err_with_uacce:
+	if (flags & UACCE_DEV_SVA)
+		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
+	kfree(uacce);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(uacce_register);
+
+/**
+ * uacce_unregister - unregisters an accelerator
+ * @uacce: the accelerator to unregister
+ */
+void uacce_unregister(struct uacce_device *uacce)
+{
+	if (!uacce)
+		return;
+
+	mutex_lock(&uacce->q_lock);
+	if (!list_empty(&uacce->qs)) {
+		struct uacce_queue *q;
+
+		list_for_each_entry(q, &uacce->qs, list) {
+			uacce_put_queue(q);
+			if (uacce->flags & UACCE_DEV_SVA)
+				iommu_sva_unbind_device(q->handle);
+		}
+	}
+	mutex_unlock(&uacce->q_lock);
+
+	if (uacce->flags & UACCE_DEV_SVA)
+		iommu_dev_disable_feature(uacce->pdev, IOMMU_DEV_FEAT_SVA);
+
+	cdev_device_del(uacce->cdev, &uacce->dev);
+	xa_erase(&uacce_xa, uacce->dev_id);
+	put_device(&uacce->dev);
+}
+EXPORT_SYMBOL_GPL(uacce_unregister);
+
+static int __init uacce_init(void)
+{
+	int ret;
+
+	uacce_class = class_create(THIS_MODULE, UACCE_NAME);
+	if (IS_ERR(uacce_class))
+		return PTR_ERR(uacce_class);
+
+	ret = alloc_chrdev_region(&uacce_devt, 0, MINORMASK, UACCE_NAME);
+	if (ret) {
+		class_destroy(uacce_class);
+		return ret;
+	}
+
+	return 0;
+}
+
+static __exit void uacce_exit(void)
+{
+	unregister_chrdev_region(uacce_devt, MINORMASK);
+	class_destroy(uacce_class);
+}
+
+subsys_initcall(uacce_init);
+module_exit(uacce_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Hisilicon Tech. Co., Ltd.");
+MODULE_DESCRIPTION("Accelerator interface for Userland applications");
diff --git a/include/linux/uacce.h b/include/linux/uacce.h
new file mode 100644
index 0000000..04c8643
--- /dev/null
+++ b/include/linux/uacce.h
@@ -0,0 +1,163 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_UACCE_H
+#define _LINUX_UACCE_H
+
+#include <linux/cdev.h>
+#include <uapi/misc/uacce/uacce.h>
+
+#define UACCE_NAME		"uacce"
+#define UACCE_QFRT_MAX		16
+#define UACCE_MAX_NAME_SIZE	64
+
+struct uacce_queue;
+struct uacce_device;
+
+/**
+ * enum uacce_qfr_flag: queue file flag:
+ * @UACCE_QFRF_SELFMT: self maintained qfr
+ */
+enum uacce_qfr_flag {
+	UACCE_QFRF_SELFMT = BIT(0),
+};
+
+/**
+ * struct uacce_qfile_region - structure of queue file region
+ * @type: type of the qfr
+ * @flags: flags of qfr
+ * @prot: qfr protection flag
+ */
+struct uacce_qfile_region {
+	enum uacce_qfrt type;
+	enum uacce_qfr_flag flags;
+	u32 prot;
+};
+
+/**
+ * struct uacce_ops - uacce device operations
+ * @get_available_instances:  get available instances left of the device
+ * @get_queue: get a queue from the device
+ * @put_queue: free a queue to the device
+ * @start_queue: make the queue start work after get_queue
+ * @stop_queue: make the queue stop work before put_queue
+ * @is_q_updated: check whether the task is finished
+ * @mask_notify: mask the task irq of queue
+ * @mmap: mmap addresses of queue to user space
+ * @reset: reset the uacce device
+ * @reset_queue: reset the queue
+ * @ioctl: ioctl for user space users of the queue
+ */
+struct uacce_ops {
+	int (*get_available_instances)(struct uacce_device *uacce);
+	int (*get_queue)(struct uacce_device *uacce, unsigned long arg,
+			 struct uacce_queue *q);
+	void (*put_queue)(struct uacce_queue *q);
+	int (*start_queue)(struct uacce_queue *q);
+	void (*stop_queue)(struct uacce_queue *q);
+	int (*is_q_updated)(struct uacce_queue *q);
+	void (*mask_notify)(struct uacce_queue *q, int event_mask);
+	int (*mmap)(struct uacce_queue *q, struct vm_area_struct *vma,
+		    struct uacce_qfile_region *qfr);
+	int (*reset)(struct uacce_device *uacce);
+	int (*reset_queue)(struct uacce_queue *q);
+	long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
+		      unsigned long arg);
+};
+
+/**
+ * struct uacce_interface
+ * @name: the uacce device name.  Will show up in sysfs
+ * @flags: uacce device attributes
+ * @ops: pointer to the struct uacce_ops
+ *
+ * This structure is used for the uacce_register()
+ */
+struct uacce_interface {
+	char name[UACCE_MAX_NAME_SIZE];
+	enum uacce_dev_flag flags;
+	struct uacce_ops *ops;
+};
+
+enum uacce_q_state {
+	UACCE_Q_INIT,
+	UACCE_Q_STARTED,
+	UACCE_Q_ZOMBIE,
+};
+
+/**
+ * struct uacce_queue
+ * @uacce: pointer to uacce
+ * @priv: private pointer
+ * @wait: wait queue head
+ * @pasid: pasid of the queue
+ * @pid: pid of the process using the queue
+ * @handle: iommu_sva handle return from iommu_sva_bind_device
+ * @list: queue list
+ * @qfrs: pointer of qfr regions
+ * @state: queue state machine
+ */
+struct uacce_queue {
+	struct uacce_device *uacce;
+	void *priv;
+	wait_queue_head_t wait;
+	int pasid;
+	pid_t pid;
+	struct iommu_sva *handle;
+	struct list_head list;
+	struct uacce_qfile_region *qfrs[UACCE_QFRT_MAX];
+	enum uacce_q_state state;
+};
+
+/**
+ * struct uacce_device
+ * @algs: supported algorithms
+ * @api_ver: api version
+ * @qf_pg_size: page size of the queue file regions
+ * @ops: pointer to the struct uacce_ops
+ * @pdev: pointer to the parent device
+ * @is_vf: whether virtual function
+ * @flags: uacce attributes
+ * @dev_id: id of the uacce device
+ * @prot: uacce protection flag
+ * @cdev: cdev of the uacce
+ * @dev: dev of the uacce
+ * @priv: private pointer of the uacce
+ * @qs: list head of queue->list
+ * @q_lock: lock for qs
+ */
+struct uacce_device {
+	const char *algs;
+	const char *api_ver;
+	unsigned long qf_pg_size[UACCE_QFRT_MAX];
+	struct uacce_ops *ops;
+	struct device *pdev;
+	bool is_vf;
+	u32 flags;
+	u32 dev_id;
+	u32 prot;
+	struct cdev *cdev;
+	struct device dev;
+	void *priv;
+	struct list_head qs;
+	struct mutex q_lock;
+};
+
+#if IS_ENABLED(CONFIG_UACCE)
+
+struct uacce_device *uacce_register(struct device *parent,
+				    struct uacce_interface *interface);
+void uacce_unregister(struct uacce_device *uacce);
+
+#else /* CONFIG_UACCE */
+
+static inline
+struct uacce_device *uacce_register(struct device *parent,
+				    struct uacce_interface *interface)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline void uacce_unregister(struct uacce_device *uacce) {}
+
+#endif /* CONFIG_UACCE */
+
+#endif /* _LINUX_UACCE_H */
diff --git a/include/uapi/misc/uacce/uacce.h b/include/uapi/misc/uacce/uacce.h
new file mode 100644
index 0000000..a4f9378
--- /dev/null
+++ b/include/uapi/misc/uacce/uacce.h
@@ -0,0 +1,38 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPIUUACCE_H
+#define _UAPIUUACCE_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/* UACCE_CMD_START_Q: Start the queue */
+#define UACCE_CMD_START_Q	_IO('W', 0)
+
+/**
+ * UACCE_CMD_PUT_Q:
+ * User actively stop queue and free queue resource immediately
+ * Optimization method since close fd may delay
+ */
+#define UACCE_CMD_PUT_Q		_IO('W', 1)
+
+/**
+ * enum uacce_dev_flag: Device flags:
+ * @UACCE_DEV_SVA: Shared Virtual Addresses
+ *		   Support PASID
+ *		   Support device page faults (PCI PRI or SMMU Stall)
+ */
+enum uacce_dev_flag {
+	UACCE_DEV_SVA = BIT(0),
+};
+
+/**
+ * enum uacce_qfrt: qfrt type
+ * @UACCE_QFRT_MMIO: device mmio region
+ * @UACCE_QFRT_DUS: device user share region
+ */
+enum uacce_qfrt {
+	UACCE_QFRT_MMIO = 0,
+	UACCE_QFRT_DUS = 1,
+};
+
+#endif