[2/2] xen: privcmd: Add support for ioeventfd

Message ID 52b07a3861a34fb0deb6251bae8bd885815c6944.1693311370.git.viresh.kumar@linaro.org (mailing list archive)
State Superseded
Series: xen: privcmd: Add ioeventfd support

Commit Message

Viresh Kumar Aug. 29, 2023, 12:29 p.m. UTC
Virtio guests send a VIRTIO_MMIO_QUEUE_NOTIFY notification when they need
to notify the backend of an update to the status of the virtqueue. The
backend, or another entity, polls the MMIO address to know when such a
notification has been sent.

This works well if the backend does the polling by itself. But as we move
towards generic backend implementations, we end up implementing this in a
separate user-space program.

Virtio backends are generally implemented to work with an eventfd based
mechanism. In order to make such backends work with Xen, another software
layer needs to do the polling and send an event via eventfd to the
backend once a notification is received from the guest. This results in
an extra context switch.

This is not a new problem in Linux though; it exists with other
hypervisors like KVM as well. The generic solution implemented in the
kernel for them is an ioctl that passes the address to poll along with an
eventfd, which lets the kernel take care of the polling and raise an
event on the eventfd, instead of handling this in user space (which
involves an extra context switch).

This patch adds similar support for Xen.

Inspired by existing implementations for KVM and other hypervisors.

This also copies the ioreq.h header file (only struct ioreq and related
macros) from the Xen source tree (top commit 5d84f07fe6bf ("xen/pci: drop
remaining uses of bool_t")).

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/xen/Kconfig               |   8 +-
 drivers/xen/privcmd.c             | 385 +++++++++++++++++++++++++++++-
 include/uapi/xen/privcmd.h        |  18 ++
 include/xen/interface/hvm/ioreq.h |  51 ++++
 4 files changed, 456 insertions(+), 6 deletions(-)
 create mode 100644 include/xen/interface/hvm/ioreq.h
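
As a rough usage sketch (not part of the patch itself), a user-space backend
could register an eventfd for a virtqueue's notification along these lines.
The names privcmd_fd, ioreq_page, evtchn_ports, mmio_base, nr_vcpus,
queue_index and guest_domid are placeholders; the ioreq page and event-channel
ports are assumed to have been set up beforehand via the existing privcmd
interfaces:

    /* needs <sys/ioctl.h>, <sys/eventfd.h>, <err.h> and xen/privcmd.h */
    int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);

    struct privcmd_ioeventfd ioeventfd = {
        .ioreq    = ioreq_page,     /* ioreq server page mapped earlier */
        .ports    = evtchn_ports,   /* one event channel port per vCPU */
        .addr     = mmio_base,      /* base of the virtio-mmio region */
        .addr_len = 4,              /* QUEUE_NOTIFY is a 32-bit register */
        .event_fd = efd,
        .vcpus    = nr_vcpus,
        .vq       = queue_index,    /* virtqueue index to match */
        .dom      = guest_domid,
    };

    if (ioctl(privcmd_fd, IOCTL_PRIVCMD_IOEVENTFD, &ioeventfd) < 0)
        err(1, "IOCTL_PRIVCMD_IOEVENTFD");

Passing PRIVCMD_IOEVENTFD_FLAG_DEASSIGN in flags with the same arguments
removes the registration again.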

Comments

Jürgen Groß Sept. 29, 2023, 5:46 a.m. UTC | #1
On 29.08.23 14:29, Viresh Kumar wrote:
> Virtio guests send VIRTIO_MMIO_QUEUE_NOTIFY notification when they need
> to notify the backend of an update to the status of the virtqueue. The
> backend or another entity, polls the MMIO address for updates to know
> when the notification is sent.
> 
> It works well if the backend does this polling by itself. But as we move
> towards generic backend implementations, we end up implementing this in
> a separate user-space program.
> 
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to do the polling and send an event via eventfd to
> the backend once the notification from guest is received. This results
> in an extra context switch.
> 
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the address to
> poll and eventfd, which lets the kernel take care of polling and raise
> an event on the eventfd, instead of handling this in user space (which
> involves an extra context switch).
> 
> This patch adds similar support for xen.
> 
> Inspired by existing implementations for KVM, etc..
> 
> This also copies ioreq.h header file (only struct ioreq and related
> macros) from Xen's source tree (Top commit 5d84f07fe6bf ("xen/pci: drop
> remaining uses of bool_t")).
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>   drivers/xen/Kconfig               |   8 +-
>   drivers/xen/privcmd.c             | 385 +++++++++++++++++++++++++++++-
>   include/uapi/xen/privcmd.h        |  18 ++
>   include/xen/interface/hvm/ioreq.h |  51 ++++
>   4 files changed, 456 insertions(+), 6 deletions(-)
>   create mode 100644 include/xen/interface/hvm/ioreq.h
> 
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index d43153fec18e..d5989871dd5d 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -269,12 +269,12 @@ config XEN_PRIVCMD
>   	  disaggregated Xen setups this driver might be needed for other
>   	  domains, too.
>   
> -config XEN_PRIVCMD_IRQFD
> -	bool "Xen irqfd support"
> +config XEN_PRIVCMD_EVENTFD
> +	bool "Xen Ioeventfd and irqfd support"
>   	depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
>   	help
> -	  Using the irqfd mechanism a virtio backend running in a daemon can
> -	  speed up interrupt injection into a guest.
> +	  Using the ioeventfd / irqfd mechanism a virtio backend running in a
> +	  daemon can speed up interrupt delivery from / to a guest.
>   
>   config XEN_ACPI_PROCESSOR
>   	tristate "Xen ACPI processor"
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 120af57999fc..4b18c51b6b2e 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -29,15 +29,18 @@
>   #include <linux/seq_file.h>
>   #include <linux/miscdevice.h>
>   #include <linux/moduleparam.h>
> +#include <linux/virtio_mmio.h>
>   
>   #include <asm/xen/hypervisor.h>
>   #include <asm/xen/hypercall.h>
>   
>   #include <xen/xen.h>
> +#include <xen/events.h>
>   #include <xen/privcmd.h>
>   #include <xen/interface/xen.h>
>   #include <xen/interface/memory.h>
>   #include <xen/interface/hvm/dm_op.h>
> +#include <xen/interface/hvm/ioreq.h>
>   #include <xen/features.h>
>   #include <xen/page.h>
>   #include <xen/xen-ops.h>
> @@ -782,6 +785,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
>   			goto out;
>   
>   		pages = vma->vm_private_data;
> +
>   		for (i = 0; i < kdata.num; i++) {
>   			xen_pfn_t pfn =
>   				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
> @@ -838,7 +842,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
>   	return rc;
>   }
>   
> -#ifdef CONFIG_XEN_PRIVCMD_IRQFD
> +#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
>   /* Irqfd support */
>   static struct workqueue_struct *irqfd_cleanup_wq;
>   static DEFINE_MUTEX(irqfds_lock);
> @@ -1079,6 +1083,369 @@ static void privcmd_irqfd_exit(void)
>   
>   	destroy_workqueue(irqfd_cleanup_wq);
>   }
> +
> +/* Ioeventfd Support */
> +#define QUEUE_NOTIFY_VQ_MASK 0xFFFF
> +
> +static DEFINE_MUTEX(ioreq_lock);
> +static LIST_HEAD(ioreq_list);
> +
> +/* per-eventfd structure */
> +struct privcmd_kernel_ioeventfd {
> +	struct eventfd_ctx *eventfd;
> +	struct list_head list;
> +	unsigned long long addr;

This is populated from a __u64 field. Maybe make it uint64_t?

> +	unsigned int addr_len;
> +	unsigned int vq;
> +};
> +
> +/* per-guest CPU / port structure */
> +struct ioreq_port {
> +	int vcpu;
> +	unsigned int port;
> +	struct privcmd_kernel_ioreq *kioreq;
> +};
> +
> +/* per-guest structure */
> +struct privcmd_kernel_ioreq {
> +	domid_t dom;
> +	unsigned int vcpus;
> +	void __user *uioreq;
> +	struct ioreq *ioreq;
> +	struct mutex lock; /* Protects ioeventfds list */
> +	struct list_head ioeventfds;
> +	struct list_head list;
> +	struct ioreq_port *ports;
> +};
> +
> +static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
> +{
> +	struct ioreq_port *port = dev_id;
> +	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
> +	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
> +	struct privcmd_kernel_ioeventfd *kioeventfd;
> +	unsigned int state = STATE_IOREQ_READY;
> +
> +	if (ioreq->state != STATE_IOREQ_READY ||
> +	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
> +		return IRQ_NONE;
> +
> +	smp_mb();

Against what is this barrier protecting? Please add a comment.

> +	ioreq->state = STATE_IOREQ_INPROCESS;
> +
> +	mutex_lock(&kioreq->lock);
> +	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
> +		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
> +		    ioreq->size == kioeventfd->addr_len &&
> +		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
> +			eventfd_signal(kioeventfd->eventfd, 1);
> +			state = STATE_IORESP_READY;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&kioreq->lock);
> +
> +	smp_mb();

Is this really needed after calling mutex_unlock()? I think you are trying to
prevent any accesses from moving past the ioreq->state modification. If so,
add a comment (either explaining why you need the barrier, or noting that you
don't need it because of the unlock).

In general, shouldn't the state be checked and modified in the locked area?

And a mutex_lock() inside an IRQ handler seems dubious: this is allowed only
if the handler has been set up to run as a threaded IRQ. Otherwise you'd end
up trying to schedule() in an interrupts-off section. I'd suggest using a
spinlock instead.
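
For reference, the spinlock variant in the handler could look roughly like
this (an untested sketch, assuming kioreq->lock is changed to a spinlock_t,
initialized with spin_lock_init() in alloc_ioreq(), and the ioctl-side users
take it with spin_lock_irqsave() since the handler runs in hard-IRQ context):

	spin_lock(&kioreq->lock);
	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
		    ioreq->size == kioeventfd->addr_len &&
		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
			/* eventfd_signal() is safe to call in atomic context */
			eventfd_signal(kioeventfd->eventfd, 1);
			state = STATE_IORESP_READY;
			break;
		}
	}
	spin_unlock(&kioreq->lock);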

> +	ioreq->state = state;
> +
> +	if (state == STATE_IORESP_READY) {
> +		notify_remote_via_evtchn(port->port);
> +		return IRQ_HANDLED;
> +	}
> +
> +	return IRQ_NONE;
> +}
> +
> +static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
> +{
> +	struct ioreq_port *ports = kioreq->ports;
> +	int i;
> +
> +	lockdep_assert_held(&ioreq_lock);
> +
> +	if (!list_empty(&kioreq->ioeventfds))
> +		return;

Is this a normal situation? If not, maybe issue a warning (probably rate
limited)?
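
If a still-populated list here is not an expected case, the rate-limited
warning being suggested could be as simple as (illustrative only):

	if (!list_empty(&kioreq->ioeventfds)) {
		pr_warn_ratelimited("kioreq still has assigned ioeventfds, not freeing it\n");
		return;
	}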

> +
> +	list_del(&kioreq->list);
> +
> +	for (i = kioreq->vcpus - 1; i >= 0; i--)
> +		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);
> +
> +	kfree(kioreq);
> +}
> +
> +static
> +struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
> +{
> +	struct privcmd_kernel_ioreq *kioreq;
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +	struct page **pages;
> +	unsigned int *ports;
> +	int ret, size, i;
> +
> +	lockdep_assert_held(&ioreq_lock);
> +
> +	size = sizeof(*kioreq) + sizeof(*kioreq->ports) * ioeventfd->vcpus;

Use the struct_size() macro instead? You need to define kioreq->ports as an
empty array for that to work.
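
One way the suggested change could look (untested sketch): "ports" becomes a
flexible array member at the end of the struct, and the allocation size is
computed with struct_size() from <linux/overflow.h>, which also guards against
multiplication overflow:

	struct privcmd_kernel_ioreq {
		domid_t dom;
		unsigned int vcpus;
		void __user *uioreq;
		struct ioreq *ioreq;
		struct mutex lock;		/* Protects ioeventfds list */
		struct list_head ioeventfds;
		struct list_head list;
		struct ioreq_port ports[];	/* was: struct ioreq_port *ports */
	};

	kioreq = kzalloc(struct_size(kioreq, ports, ioeventfd->vcpus), GFP_KERNEL);
	if (!kioreq)
		return ERR_PTR(-ENOMEM);
	/* No manual kioreq->ports = (struct ioreq_port *)(kioreq + 1) fixup needed. */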

> +	kioreq = kzalloc(size, GFP_KERNEL);
> +	if (!kioreq)
> +		return ERR_PTR(-ENOMEM);
> +	kioreq->ports = (struct ioreq_port *)(kioreq + 1);
> +
> +	kioreq->dom = ioeventfd->dom;
> +	kioreq->vcpus = ioeventfd->vcpus;
> +	kioreq->uioreq = ioeventfd->ioreq;
> +	mutex_init(&kioreq->lock);
> +	INIT_LIST_HEAD(&kioreq->ioeventfds);
> +
> +	/* The memory for ioreq server must have been mapped earlier */
> +	mmap_write_lock(mm);
> +	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
> +	if (!vma) {
> +		pr_err("Failed to find vma for ioreq page!\n");
> +		mmap_write_unlock(mm);
> +		ret = -EFAULT;
> +		goto error_kfree;
> +	}
> +
> +	pages = vma->vm_private_data;
> +	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
> +	mmap_write_unlock(mm);
> +
> +	size = sizeof(*ports) * kioreq->vcpus;
> +	ports = kzalloc(size, GFP_KERNEL);
> +	if (!ports) {
> +		ret = -ENOMEM;
> +		goto error_kfree;
> +	}
> +
> +	if (copy_from_user(ports, ioeventfd->ports, size)) {
> +		ret = -EFAULT;
> +		goto error_kfree_ports;
> +	}
> +
> +	for (i = 0; i < kioreq->vcpus; i++) {
> +		kioreq->ports[i].vcpu = i;
> +		kioreq->ports[i].port = ports[i];
> +		kioreq->ports[i].kioreq = kioreq;
> +
> +		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
> +				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
> +				&kioreq->ports[i]);
> +		if (ret < 0)
> +			goto error_unbind;
> +	}
> +
> +	kfree(ports);
> +
> +	list_add_tail(&kioreq->list, &ioreq_list);
> +
> +	return kioreq;
> +
> +error_unbind:
> +	while (--i >= 0)
> +		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);
> +error_kfree_ports:
> +	kfree(ports);
> +error_kfree:
> +	kfree(kioreq);
> +	return ERR_PTR(ret);
> +}
> +
> +static struct privcmd_kernel_ioreq *
> +get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
> +{
> +	struct privcmd_kernel_ioreq *kioreq;
> +
> +	list_for_each_entry(kioreq, &ioreq_list, list) {
> +		struct privcmd_kernel_ioeventfd *kioeventfd;
> +
> +		/*
> +		 * kioreq fields can be accessed here without a lock as they are
> +		 * never updated after being added to the ioreq_list.
> +		 */
> +		if (kioreq->uioreq != ioeventfd->ioreq) {
> +			continue;
> +		} else if (kioreq->dom != ioeventfd->dom ||
> +			   kioreq->vcpus != ioeventfd->vcpus) {
> +			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
> +			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
> +			       ioeventfd->vcpus);
> +			return ERR_PTR(-EINVAL);
> +		}
> +
> +		/* Look for a duplicate eventfd for the same guest */
> +		mutex_lock(&kioreq->lock);
> +		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
> +			if (eventfd == kioeventfd->eventfd) {
> +				mutex_unlock(&kioreq->lock);
> +				return ERR_PTR(-EBUSY);
> +			}
> +		}
> +		mutex_unlock(&kioreq->lock);
> +
> +		return kioreq;
> +	}
> +
> +	/* Matching kioreq isn't found, allocate a new one */
> +	return alloc_ioreq(ioeventfd);
> +}
> +
> +static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
> +{
> +	list_del(&kioeventfd->list);
> +	eventfd_ctx_put(kioeventfd->eventfd);
> +	kfree(kioeventfd);
> +}
> +
> +static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
> +{
> +	struct privcmd_kernel_ioeventfd *kioeventfd;
> +	struct privcmd_kernel_ioreq *kioreq;
> +	struct fd f;
> +	int ret;
> +
> +	/* Check for range overflow */
> +	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
> +		return -EINVAL;
> +
> +	/* Vhost requires us to support length 1, 2, 4, and 8 */
> +	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
> +	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
> +		return -EINVAL;

I think you should check that the number of vcpus is in a sane range. Neither
0 nor 4 billion is appropriate.
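
A minimal sketch of such a check (the upper bound here is arbitrary and purely
illustrative; the real limit would presumably come from what Xen supports):

	if (!ioeventfd->vcpus || ioeventfd->vcpus > 1024 /* arbitrary cap */)
		return -EINVAL;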

> +
> +	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
> +	if (!kioeventfd)
> +		return -ENOMEM;
> +
> +	f = fdget(ioeventfd->event_fd);
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto error_kfree;
> +	}
> +
> +	kioeventfd->eventfd = eventfd_ctx_fileget(f.file);
> +	fdput(f);
> +
> +	if (IS_ERR(kioeventfd->eventfd)) {
> +		ret = PTR_ERR(kioeventfd->eventfd);
> +		goto error_kfree;
> +	}
> +
> +	kioeventfd->addr = ioeventfd->addr;
> +	kioeventfd->addr_len = ioeventfd->addr_len;
> +	kioeventfd->vq = ioeventfd->vq;
> +
> +	mutex_lock(&ioreq_lock);
> +	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
> +	if (IS_ERR(kioreq)) {
> +		mutex_unlock(&ioreq_lock);
> +		ret = PTR_ERR(kioreq);
> +		goto error_eventfd;
> +	}
> +
> +	mutex_lock(&kioreq->lock);
> +	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
> +	mutex_unlock(&kioreq->lock);
> +
> +	mutex_unlock(&ioreq_lock);
> +
> +	return 0;
> +
> +error_eventfd:
> +	eventfd_ctx_put(kioeventfd->eventfd);
> +
> +error_kfree:
> +	kfree(kioeventfd);
> +	return ret;
> +}
> +
> +static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
> +{
> +	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
> +	struct eventfd_ctx *eventfd;
> +	int ret = 0;
> +
> +	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
> +	if (IS_ERR(eventfd))
> +		return PTR_ERR(eventfd);
> +
> +	mutex_lock(&ioreq_lock);
> +	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
> +		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
> +		/*
> +		 * kioreq fields can be accessed here without a lock as they are
> +		 * never updated after being added to the ioreq_list.
> +		 */
> +		if (kioreq->dom != ioeventfd->dom ||
> +		    kioreq->uioreq != ioeventfd->ioreq ||
> +		    kioreq->vcpus != ioeventfd->vcpus)
> +			continue;
> +
> +		mutex_lock(&kioreq->lock);
> +		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
> +			if (eventfd == kioeventfd->eventfd) {
> +				ioeventfd_free(kioeventfd);
> +				mutex_unlock(&kioreq->lock);
> +
> +				ioreq_free(kioreq);
> +				goto unlock;
> +			}
> +		}
> +		mutex_unlock(&kioreq->lock);
> +		break;
> +	}
> +
> +	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
> +			ioeventfd->dom, ioeventfd->addr);
> +	ret = -ENODEV;
> +
> +unlock:
> +	mutex_unlock(&ioreq_lock);
> +	eventfd_ctx_put(eventfd);
> +
> +	return 0;
> +}
> +
> +static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
> +{
> +	struct privcmd_data *data = file->private_data;
> +	struct privcmd_ioeventfd ioeventfd;
> +
> +	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
> +		return -EFAULT;
> +
> +	/* No other flags should be set */
> +	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
> +		return -EINVAL;
> +
> +	/* If restriction is in place, check the domid matches */
> +	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
> +		return -EPERM;
> +
> +	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
> +		return privcmd_ioeventfd_deassign(&ioeventfd);
> +
> +	return privcmd_ioeventfd_assign(&ioeventfd);
> +}
> +
> +static void privcmd_ioeventfd_exit(void)
> +{
> +	struct privcmd_kernel_ioreq *kioreq, *tmp;
> +
> +	mutex_lock(&ioreq_lock);
> +	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
> +		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
> +
> +		mutex_lock(&kioreq->lock);
> +		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
> +			ioeventfd_free(kioeventfd);
> +		mutex_unlock(&kioreq->lock);
> +
> +		ioreq_free(kioreq);
> +	}
> +	mutex_unlock(&ioreq_lock);
> +}
>   #else
>   static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
>   {
> @@ -1093,7 +1460,16 @@ static inline int privcmd_irqfd_init(void)
>   static inline void privcmd_irqfd_exit(void)
>   {
>   }
> -#endif /* CONFIG_XEN_PRIVCMD_IRQFD */
> +
> +static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void privcmd_ioeventfd_exit(void)
> +{
> +}
> +#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */
>   
>   static long privcmd_ioctl(struct file *file,
>   			  unsigned int cmd, unsigned long data)
> @@ -1134,6 +1510,10 @@ static long privcmd_ioctl(struct file *file,
>   		ret = privcmd_ioctl_irqfd(file, udata);
>   		break;
>   
> +	case IOCTL_PRIVCMD_IOEVENTFD:
> +		ret = privcmd_ioctl_ioeventfd(file, udata);
> +		break;
> +
>   	default:
>   		break;
>   	}
> @@ -1278,6 +1658,7 @@ static int __init privcmd_init(void)
>   
>   static void __exit privcmd_exit(void)
>   {
> +	privcmd_ioeventfd_exit();
>   	privcmd_irqfd_exit();
>   	misc_deregister(&privcmd_dev);
>   	misc_deregister(&xen_privcmdbuf_dev);
> diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
> index 375718ba4ab6..ce71390f0024 100644
> --- a/include/uapi/xen/privcmd.h
> +++ b/include/uapi/xen/privcmd.h
> @@ -110,6 +110,22 @@ struct privcmd_irqfd {
>   	__u8 pad[2];
>   };
>   
> +/* For privcmd_ioeventfd::flags */
> +#define PRIVCMD_IOEVENTFD_FLAG_DEASSIGN (1 << 0)
> +
> +struct privcmd_ioeventfd {
> +	void __user *ioreq;
> +	unsigned int __user *ports;
> +	__u64 addr;
> +	__u32 addr_len;
> +	__u32 event_fd;
> +	__u32 vcpus;
> +	__u32 vq;
> +	__u32 flags;
> +	domid_t dom;
> +	__u8 pad[2];
> +};
> +
>   /*
>    * @cmd: IOCTL_PRIVCMD_HYPERCALL
>    * @arg: &privcmd_hypercall_t
> @@ -139,5 +155,7 @@ struct privcmd_irqfd {
>   	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
>   #define IOCTL_PRIVCMD_IRQFD					\
>   	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
> +#define IOCTL_PRIVCMD_IOEVENTFD					\
> +	_IOC(_IOC_NONE, 'P', 9, sizeof(struct privcmd_ioeventfd))
>   
>   #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
> diff --git a/include/xen/interface/hvm/ioreq.h b/include/xen/interface/hvm/ioreq.h
> new file mode 100644
> index 000000000000..b02cfeae7eb5
> --- /dev/null
> +++ b/include/xen/interface/hvm/ioreq.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * ioreq.h: I/O request definitions for device models
> + * Copyright (c) 2004, Intel Corporation.
> + */
> +
> +#ifndef __XEN_PUBLIC_HVM_IOREQ_H__
> +#define __XEN_PUBLIC_HVM_IOREQ_H__
> +
> +#define IOREQ_READ      1
> +#define IOREQ_WRITE     0
> +
> +#define STATE_IOREQ_NONE        0
> +#define STATE_IOREQ_READY       1
> +#define STATE_IOREQ_INPROCESS   2
> +#define STATE_IORESP_READY      3
> +
> +#define IOREQ_TYPE_PIO          0 /* pio */
> +#define IOREQ_TYPE_COPY         1 /* mmio ops */
> +#define IOREQ_TYPE_PCI_CONFIG   2
> +#define IOREQ_TYPE_TIMEOFFSET   7
> +#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
> +
> +/*
> + * VMExit dispatcher should cooperate with instruction decoder to
> + * prepare this structure and notify service OS and DM by sending
> + * virq.
> + *
> + * For I/O type IOREQ_TYPE_PCI_CONFIG, the physical address is formatted
> + * as follows:
> + *
> + * 63....48|47..40|39..35|34..32|31........0
> + * SEGMENT |BUS   |DEV   |FN    |OFFSET
> + */
> +struct ioreq {
> +	uint64_t addr;          /* physical address */
> +	uint64_t data;          /* data (or paddr of data) */
> +	uint32_t count;         /* for rep prefixes */
> +	uint32_t size;          /* size in bytes */
> +	uint32_t vp_eport;      /* evtchn for notifications to/from device model */
> +	uint16_t _pad0;
> +	uint8_t state:4;
> +	uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr
> +				 * of the real data to use. */
> +	uint8_t dir:1;          /* 1=read, 0=write */
> +	uint8_t df:1;
> +	uint8_t _pad1:1;
> +	uint8_t type;           /* I/O type */
> +};
> +
> +#endif /* __XEN_PUBLIC_HVM_IOREQ_H__ */

Juergen
Viresh Kumar Oct. 5, 2023, 6:32 a.m. UTC | #2
On 29-09-23, 07:46, Juergen Gross wrote:
> On 29.08.23 14:29, Viresh Kumar wrote:
> > +static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
> > +{
> > +	struct ioreq_port *port = dev_id;
> > +	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
> > +	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
> > +	struct privcmd_kernel_ioeventfd *kioeventfd;
> > +	unsigned int state = STATE_IOREQ_READY;
> > +
> > +	if (ioreq->state != STATE_IOREQ_READY ||
> > +	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
> > +		return IRQ_NONE;
> > +
> > +	smp_mb();
> > +	ioreq->state = STATE_IOREQ_INPROCESS;
> > +
> > +	mutex_lock(&kioreq->lock);
> > +	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
> > +		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
> > +		    ioreq->size == kioeventfd->addr_len &&
> > +		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
> > +			eventfd_signal(kioeventfd->eventfd, 1);
> > +			state = STATE_IORESP_READY;
> > +			break;
> > +		}
> > +	}
> > +	mutex_unlock(&kioreq->lock);
> > +
> > +	smp_mb();
> 
> Is this really needed after calling mutex_unlock()? I think you are trying to
> avoid any accesses to go past ioreq->state modification. If so, add a comment
> (either why you need the barrier, or that you don't need it due to the unlock).

Right, I want all writes to finish before updating the state.

> In general, shouldn't the state be checked and modified in the locked area?

The handler runs separately for each vcpu and shouldn't run in parallel for the
same vcpu, so only one thread should ever be accessing a given ioreq port
structure.

The lock is there to protect the ioeventfds list (as mentioned in the struct
declaration) against parallel access, as threads for different vcpus may end up
accessing it simultaneously.
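
That rationale could be captured directly in the handler as a comment, for
instance (a suggested wording, not from the patch):

	/*
	 * ioreq->state for a given vCPU is only ever updated from this
	 * handler, which does not run in parallel for the same vCPU, so no
	 * lock is taken for it. kioreq->lock only protects the ioeventfds
	 * list, which handlers for other vCPUs and the ioctl paths may
	 * touch concurrently.
	 */
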
Viresh Kumar Oct. 5, 2023, 10:07 a.m. UTC | #3
On 29-09-23, 07:46, Juergen Gross wrote:
> This is populated from a __u64 field. Maybe make it uint64_t?

Checkpatch warns about this; I will use u64 instead.

CHECK: Prefer kernel type 'u64' over 'uint64_t'
#124: FILE: drivers/xen/privcmd.c:1097:
+	uint64_t addr;
Alex Bennée Oct. 9, 2023, 9:40 a.m. UTC | #4
Viresh Kumar <viresh.kumar@linaro.org> writes:

> On 29-09-23, 07:46, Juergen Gross wrote:
>> On 29.08.23 14:29, Viresh Kumar wrote:
>> > +static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
>> > +{
>> > +	struct ioreq_port *port = dev_id;
>> > +	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
>> > +	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
>> > +	struct privcmd_kernel_ioeventfd *kioeventfd;
>> > +	unsigned int state = STATE_IOREQ_READY;
>> > +
>> > +	if (ioreq->state != STATE_IOREQ_READY ||
>> > +	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
>> > +		return IRQ_NONE;
>> > +
>> > +	smp_mb();
>> > +	ioreq->state = STATE_IOREQ_INPROCESS;
>> > +
>> > +	mutex_lock(&kioreq->lock);
>> > +	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
>> > +		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
>> > +		    ioreq->size == kioeventfd->addr_len &&
>> > +		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
>> > +			eventfd_signal(kioeventfd->eventfd, 1);
>> > +			state = STATE_IORESP_READY;
>> > +			break;
>> > +		}
>> > +	}
>> > +	mutex_unlock(&kioreq->lock);
>> > +
>> > +	smp_mb();
>> 
>> Is this really needed after calling mutex_unlock()? I think you are trying to
>> avoid any accesses to go past ioreq->state modification. If so, add a comment
>> (either why you need the barrier, or that you don't need it due to the unlock).
>
> Right, want all writes to finish before updating state.

I thought sync points generally act as full barriers. Doing a bunch of
grepping, I think it ends up at:

  static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
  {
          unsigned long curr = (unsigned long)current;

          return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
  }

so you should already have completed your writes by that point.

>
>> In general, shouldn't the state be checked and modified in the locked area?
>
> The handler runs separately for each vcpu and shouldn't run in parallel for the
> same vcpu. And so only one thread should ever be accessing ioreq port structure.
>
> The lock is there to protect the ioeventfds list (as mentioned in struct
> declaration) against parallel access, as threads for different vcpus may end up
> accessing it simultaneously.
Viresh Kumar Oct. 9, 2023, 10:53 a.m. UTC | #5
On 09-10-23, 10:40, Alex Bennée wrote:
> I thought generally sync points act as full barriers. Doing a bunch of
> grepping I think ends at:
> 
>   static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
>   {
>           unsigned long curr = (unsigned long)current;
> 
>           return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
>   }
> 
> so you should already have completed your writes by that point.

I am not sure depending on such indirect mechanisms to provide the barriers
is a good idea.

The situation here probably requires explicit barriers to make sure it
doesn't break in the future?
Jürgen Groß Oct. 9, 2023, 10:58 a.m. UTC | #6
On 09.10.23 12:53, Viresh Kumar wrote:
> On 09-10-23, 10:40, Alex Bennée wrote:
>> I thought generally sync points act as full barriers. Doing a bunch of
>> grepping I think ends at:
>>
>>    static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
>>    {
>>            unsigned long curr = (unsigned long)current;
>>
>>            return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
>>    }
>>
>> so you should already have completed your writes by that point.
> 
> I am not sure if depending on such indirect mechanisms to implement
> barriers for you is a good idea.
> 
> The situation here probably requires explicit barriers to make sure it
> doesn't break in future ?
> 

Depending on lock implementations to include the needed barriers is fine IMO.

That is one central guarantee locks must provide: any updates in a locked
region operate on consistent data and are observable by others after the
locked region has been left.


Juergen
Arnd Bergmann Oct. 14, 2023, 12:19 p.m. UTC | #7
On Tue, Aug 29, 2023, at 14:29, Viresh Kumar via Stratos-dev wrote:

> +/* For privcmd_ioeventfd::flags */
> +#define PRIVCMD_IOEVENTFD_FLAG_DEASSIGN (1 << 0)
> +
> +struct privcmd_ioeventfd {
> +	void __user *ioreq;
> +	unsigned int __user *ports;
> +	__u64 addr;
> +	__u32 addr_len;
> +	__u32 event_fd;
> +	__u32 vcpus;
> +	__u32 vq;
> +	__u32 flags;
> +	domid_t dom;
> +	__u8 pad[2];
> +};

Using indirect pointers in an ioctl command argument means that the layout is
architecture specific; in particular, you can't use the same layout from
32-bit compat tasks. The general recommendation is to have __u64 members and
use u64_to_user_ptr() to access them from the kernel if you are unable to
avoid the pointers altogether.
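
A sketch of the compat-safe layout being suggested here, with the pointers
carried as __u64 and converted in the kernel via u64_to_user_ptr() (untested):

	struct privcmd_ioeventfd {
		__u64 ioreq;		/* user pointer, passed as a u64 */
		__u64 ports;		/* user pointer, passed as a u64 */
		__u64 addr;
		__u32 addr_len;
		__u32 event_fd;
		__u32 vcpus;
		__u32 vq;
		__u32 flags;
		domid_t dom;
		__u8 pad[2];
	};

	/* kernel side */
	void __user *uioreq = u64_to_user_ptr(ioeventfd->ioreq);
	unsigned int __user *uports = u64_to_user_ptr(ioeventfd->ports);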

>  /*
>   * @cmd: IOCTL_PRIVCMD_HYPERCALL
>   * @arg: &privcmd_hypercall_t
> @@ -139,5 +155,7 @@ struct privcmd_irqfd {
>  	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
>  #define IOCTL_PRIVCMD_IRQFD					\
>  	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
> +#define IOCTL_PRIVCMD_IOEVENTFD					\
> +	_IOC(_IOC_NONE, 'P', 9, sizeof(struct privcmd_ioeventfd))

_IOC() is an internal helper that you should not use in driver code. In
particular, you got the data direction wrong here, which breaks a number of
tools: "_IOC_NONE" should never be paired with a nonzero size.

Instead, you should use the existing _IOWR() or _IOR() macros without the
sizeof(), like:

#define IOCTL_PRIVCMD_IOEVENTFD _IOWR('P', 9, struct privcmd_ioeventfd)

You clearly copied this from the other ioctl command definitions in the same
file, which we can't easily fix without breaking the user ABI, but I still
think you should try to do it correctly for new commands.

     Arnd

Patch

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d43153fec18e..d5989871dd5d 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -269,12 +269,12 @@  config XEN_PRIVCMD
 	  disaggregated Xen setups this driver might be needed for other
 	  domains, too.
 
-config XEN_PRIVCMD_IRQFD
-	bool "Xen irqfd support"
+config XEN_PRIVCMD_EVENTFD
+	bool "Xen Ioeventfd and irqfd support"
 	depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
 	help
-	  Using the irqfd mechanism a virtio backend running in a daemon can
-	  speed up interrupt injection into a guest.
+	  Using the ioeventfd / irqfd mechanism a virtio backend running in a
+	  daemon can speed up interrupt delivery from / to a guest.
 
 config XEN_ACPI_PROCESSOR
 	tristate "Xen ACPI processor"
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 120af57999fc..4b18c51b6b2e 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -29,15 +29,18 @@ 
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
 #include <linux/moduleparam.h>
+#include <linux/virtio_mmio.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
 #include <xen/xen.h>
+#include <xen/events.h>
 #include <xen/privcmd.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 #include <xen/interface/hvm/dm_op.h>
+#include <xen/interface/hvm/ioreq.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/xen-ops.h>
@@ -782,6 +785,7 @@  static long privcmd_ioctl_mmap_resource(struct file *file,
 			goto out;
 
 		pages = vma->vm_private_data;
+
 		for (i = 0; i < kdata.num; i++) {
 			xen_pfn_t pfn =
 				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
@@ -838,7 +842,7 @@  static long privcmd_ioctl_mmap_resource(struct file *file,
 	return rc;
 }
 
-#ifdef CONFIG_XEN_PRIVCMD_IRQFD
+#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
 /* Irqfd support */
 static struct workqueue_struct *irqfd_cleanup_wq;
 static DEFINE_MUTEX(irqfds_lock);
@@ -1079,6 +1083,369 @@  static void privcmd_irqfd_exit(void)
 
 	destroy_workqueue(irqfd_cleanup_wq);
 }
+
+/* Ioeventfd Support */
+#define QUEUE_NOTIFY_VQ_MASK 0xFFFF
+
+static DEFINE_MUTEX(ioreq_lock);
+static LIST_HEAD(ioreq_list);
+
+/* per-eventfd structure */
+struct privcmd_kernel_ioeventfd {
+	struct eventfd_ctx *eventfd;
+	struct list_head list;
+	unsigned long long addr;
+	unsigned int addr_len;
+	unsigned int vq;
+};
+
+/* per-guest CPU / port structure */
+struct ioreq_port {
+	int vcpu;
+	unsigned int port;
+	struct privcmd_kernel_ioreq *kioreq;
+};
+
+/* per-guest structure */
+struct privcmd_kernel_ioreq {
+	domid_t dom;
+	unsigned int vcpus;
+	void __user *uioreq;
+	struct ioreq *ioreq;
+	struct mutex lock; /* Protects ioeventfds list */
+	struct list_head ioeventfds;
+	struct list_head list;
+	struct ioreq_port *ports;
+};
+
+static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
+{
+	struct ioreq_port *port = dev_id;
+	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
+	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
+	struct privcmd_kernel_ioeventfd *kioeventfd;
+	unsigned int state = STATE_IOREQ_READY;
+
+	if (ioreq->state != STATE_IOREQ_READY ||
+	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
+		return IRQ_NONE;
+
+	smp_mb();
+	ioreq->state = STATE_IOREQ_INPROCESS;
+
+	mutex_lock(&kioreq->lock);
+	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
+		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
+		    ioreq->size == kioeventfd->addr_len &&
+		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
+			eventfd_signal(kioeventfd->eventfd, 1);
+			state = STATE_IORESP_READY;
+			break;
+		}
+	}
+	mutex_unlock(&kioreq->lock);
+
+	smp_mb();
+	ioreq->state = state;
+
+	if (state == STATE_IORESP_READY) {
+		notify_remote_via_evtchn(port->port);
+		return IRQ_HANDLED;
+	}
+
+	return IRQ_NONE;
+}
+
+static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
+{
+	struct ioreq_port *ports = kioreq->ports;
+	int i;
+
+	lockdep_assert_held(&ioreq_lock);
+
+	if (!list_empty(&kioreq->ioeventfds))
+		return;
+
+	list_del(&kioreq->list);
+
+	for (i = kioreq->vcpus - 1; i >= 0; i--)
+		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);
+
+	kfree(kioreq);
+}
+
+static
+struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
+{
+	struct privcmd_kernel_ioreq *kioreq;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct page **pages;
+	unsigned int *ports;
+	int ret, size, i;
+
+	lockdep_assert_held(&ioreq_lock);
+
+	size = sizeof(*kioreq) + sizeof(*kioreq->ports) * ioeventfd->vcpus;
+	kioreq = kzalloc(size, GFP_KERNEL);
+	if (!kioreq)
+		return ERR_PTR(-ENOMEM);
+	kioreq->ports = (struct ioreq_port *)(kioreq + 1);
+
+	kioreq->dom = ioeventfd->dom;
+	kioreq->vcpus = ioeventfd->vcpus;
+	kioreq->uioreq = ioeventfd->ioreq;
+	mutex_init(&kioreq->lock);
+	INIT_LIST_HEAD(&kioreq->ioeventfds);
+
+	/* The memory for ioreq server must have been mapped earlier */
+	mmap_write_lock(mm);
+	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
+	if (!vma) {
+		pr_err("Failed to find vma for ioreq page!\n");
+		mmap_write_unlock(mm);
+		ret = -EFAULT;
+		goto error_kfree;
+	}
+
+	pages = vma->vm_private_data;
+	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
+	mmap_write_unlock(mm);
+
+	size = sizeof(*ports) * kioreq->vcpus;
+	ports = kzalloc(size, GFP_KERNEL);
+	if (!ports) {
+		ret = -ENOMEM;
+		goto error_kfree;
+	}
+
+	if (copy_from_user(ports, ioeventfd->ports, size)) {
+		ret = -EFAULT;
+		goto error_kfree_ports;
+	}
+
+	for (i = 0; i < kioreq->vcpus; i++) {
+		kioreq->ports[i].vcpu = i;
+		kioreq->ports[i].port = ports[i];
+		kioreq->ports[i].kioreq = kioreq;
+
+		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
+				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
+				&kioreq->ports[i]);
+		if (ret < 0)
+			goto error_unbind;
+	}
+
+	kfree(ports);
+
+	list_add_tail(&kioreq->list, &ioreq_list);
+
+	return kioreq;
+
+error_unbind:
+	while (--i >= 0)
+		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);
+error_kfree_ports:
+	kfree(ports);
+error_kfree:
+	kfree(kioreq);
+	return ERR_PTR(ret);
+}
+
+static struct privcmd_kernel_ioreq *
+get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
+{
+	struct privcmd_kernel_ioreq *kioreq;
+
+	list_for_each_entry(kioreq, &ioreq_list, list) {
+		struct privcmd_kernel_ioeventfd *kioeventfd;
+
+		/*
+		 * kioreq fields can be accessed here without a lock as they are
+		 * never updated after being added to the ioreq_list.
+		 */
+		if (kioreq->uioreq != ioeventfd->ioreq) {
+			continue;
+		} else if (kioreq->dom != ioeventfd->dom ||
+			   kioreq->vcpus != ioeventfd->vcpus) {
+			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
+			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
+			       ioeventfd->vcpus);
+			return ERR_PTR(-EINVAL);
+		}
+
+		/* Look for a duplicate eventfd for the same guest */
+		mutex_lock(&kioreq->lock);
+		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
+			if (eventfd == kioeventfd->eventfd) {
+				mutex_unlock(&kioreq->lock);
+				return ERR_PTR(-EBUSY);
+			}
+		}
+		mutex_unlock(&kioreq->lock);
+
+		return kioreq;
+	}
+
+	/* Matching kioreq isn't found, allocate a new one */
+	return alloc_ioreq(ioeventfd);
+}
+
+static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
+{
+	list_del(&kioeventfd->list);
+	eventfd_ctx_put(kioeventfd->eventfd);
+	kfree(kioeventfd);
+}
+
+static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
+{
+	struct privcmd_kernel_ioeventfd *kioeventfd;
+	struct privcmd_kernel_ioreq *kioreq;
+	struct fd f;
+	int ret;
+
+	/* Check for range overflow */
+	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
+		return -EINVAL;
+
+	/* Vhost requires us to support length 1, 2, 4, and 8 */
+	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
+	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
+		return -EINVAL;
+
+	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
+	if (!kioeventfd)
+		return -ENOMEM;
+
+	f = fdget(ioeventfd->event_fd);
+	if (!f.file) {
+		ret = -EBADF;
+		goto error_kfree;
+	}
+
+	kioeventfd->eventfd = eventfd_ctx_fileget(f.file);
+	fdput(f);
+
+	if (IS_ERR(kioeventfd->eventfd)) {
+		ret = PTR_ERR(kioeventfd->eventfd);
+		goto error_kfree;
+	}
+
+	kioeventfd->addr = ioeventfd->addr;
+	kioeventfd->addr_len = ioeventfd->addr_len;
+	kioeventfd->vq = ioeventfd->vq;
+
+	mutex_lock(&ioreq_lock);
+	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
+	if (IS_ERR(kioreq)) {
+		mutex_unlock(&ioreq_lock);
+		ret = PTR_ERR(kioreq);
+		goto error_eventfd;
+	}
+
+	mutex_lock(&kioreq->lock);
+	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
+	mutex_unlock(&kioreq->lock);
+
+	mutex_unlock(&ioreq_lock);
+
+	return 0;
+
+error_eventfd:
+	eventfd_ctx_put(kioeventfd->eventfd);
+
+error_kfree:
+	kfree(kioeventfd);
+	return ret;
+}
+
+static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
+{
+	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
+	struct eventfd_ctx *eventfd;
+	int ret = 0;
+
+	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&ioreq_lock);
+	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
+		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
+		/*
+		 * kioreq fields can be accessed here without a lock as they are
+		 * never updated after being added to the ioreq_list.
+		 */
+		if (kioreq->dom != ioeventfd->dom ||
+		    kioreq->uioreq != ioeventfd->ioreq ||
+		    kioreq->vcpus != ioeventfd->vcpus)
+			continue;
+
+		mutex_lock(&kioreq->lock);
+		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
+			if (eventfd == kioeventfd->eventfd) {
+				ioeventfd_free(kioeventfd);
+				mutex_unlock(&kioreq->lock);
+
+				ioreq_free(kioreq);
+				goto unlock;
+			}
+		}
+		mutex_unlock(&kioreq->lock);
+		break;
+	}
+
+	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
+			ioeventfd->dom, ioeventfd->addr);
+	ret = -ENODEV;
+
+unlock:
+	mutex_unlock(&ioreq_lock);
+	eventfd_ctx_put(eventfd);
+
+	return 0;
+}
+
+static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_ioeventfd ioeventfd;
+
+	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
+		return -EFAULT;
+
+	/* No other flags should be set */
+	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
+		return -EINVAL;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
+		return -EPERM;
+
+	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
+		return privcmd_ioeventfd_deassign(&ioeventfd);
+
+	return privcmd_ioeventfd_assign(&ioeventfd);
+}
+
+static void privcmd_ioeventfd_exit(void)
+{
+	struct privcmd_kernel_ioreq *kioreq, *tmp;
+
+	mutex_lock(&ioreq_lock);
+	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
+		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
+
+		mutex_lock(&kioreq->lock);
+		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
+			ioeventfd_free(kioeventfd);
+		mutex_unlock(&kioreq->lock);
+
+		ioreq_free(kioreq);
+	}
+	mutex_unlock(&ioreq_lock);
+}
 #else
 static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
 {
@@ -1093,7 +1460,16 @@  static inline int privcmd_irqfd_init(void)
 static inline void privcmd_irqfd_exit(void)
 {
 }
-#endif /* CONFIG_XEN_PRIVCMD_IRQFD */
+
+static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void privcmd_ioeventfd_exit(void)
+{
+}
+#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */
 
 static long privcmd_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long data)
@@ -1134,6 +1510,10 @@  static long privcmd_ioctl(struct file *file,
 		ret = privcmd_ioctl_irqfd(file, udata);
 		break;
 
+	case IOCTL_PRIVCMD_IOEVENTFD:
+		ret = privcmd_ioctl_ioeventfd(file, udata);
+		break;
+
 	default:
 		break;
 	}
@@ -1278,6 +1658,7 @@  static int __init privcmd_init(void)
 
 static void __exit privcmd_exit(void)
 {
+	privcmd_ioeventfd_exit();
 	privcmd_irqfd_exit();
 	misc_deregister(&privcmd_dev);
 	misc_deregister(&xen_privcmdbuf_dev);
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index 375718ba4ab6..ce71390f0024 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -110,6 +110,22 @@  struct privcmd_irqfd {
 	__u8 pad[2];
 };
 
+/* For privcmd_ioeventfd::flags */
+#define PRIVCMD_IOEVENTFD_FLAG_DEASSIGN (1 << 0)
+
+struct privcmd_ioeventfd {
+	void __user *ioreq;
+	unsigned int __user *ports;
+	__u64 addr;
+	__u32 addr_len;
+	__u32 event_fd;
+	__u32 vcpus;
+	__u32 vq;
+	__u32 flags;
+	domid_t dom;
+	__u8 pad[2];
+};
+
 /*
  * @cmd: IOCTL_PRIVCMD_HYPERCALL
  * @arg: &privcmd_hypercall_t
@@ -139,5 +155,7 @@  struct privcmd_irqfd {
 	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
 #define IOCTL_PRIVCMD_IRQFD					\
 	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
+#define IOCTL_PRIVCMD_IOEVENTFD					\
+	_IOC(_IOC_NONE, 'P', 9, sizeof(struct privcmd_ioeventfd))
 
 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
diff --git a/include/xen/interface/hvm/ioreq.h b/include/xen/interface/hvm/ioreq.h
new file mode 100644
index 000000000000..b02cfeae7eb5
--- /dev/null
+++ b/include/xen/interface/hvm/ioreq.h
@@ -0,0 +1,51 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_IOREQ_H__
+#define __XEN_PUBLIC_HVM_IOREQ_H__
+
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+
+#define STATE_IOREQ_NONE        0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+
+#define IOREQ_TYPE_PIO          0 /* pio */
+#define IOREQ_TYPE_COPY         1 /* mmio ops */
+#define IOREQ_TYPE_PCI_CONFIG   2
+#define IOREQ_TYPE_TIMEOFFSET   7
+#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
+
+/*
+ * VMExit dispatcher should cooperate with instruction decoder to
+ * prepare this structure and notify service OS and DM by sending
+ * virq.
+ *
+ * For I/O type IOREQ_TYPE_PCI_CONFIG, the physical address is formatted
+ * as follows:
+ *
+ * 63....48|47..40|39..35|34..32|31........0
+ * SEGMENT |BUS   |DEV   |FN    |OFFSET
+ */
+struct ioreq {
+	uint64_t addr;          /* physical address */
+	uint64_t data;          /* data (or paddr of data) */
+	uint32_t count;         /* for rep prefixes */
+	uint32_t size;          /* size in bytes */
+	uint32_t vp_eport;      /* evtchn for notifications to/from device model */
+	uint16_t _pad0;
+	uint8_t state:4;
+	uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr
+				 * of the real data to use. */
+	uint8_t dir:1;          /* 1=read, 0=write */
+	uint8_t df:1;
+	uint8_t _pad1:1;
+	uint8_t type;           /* I/O type */
+};
+
+#endif /* __XEN_PUBLIC_HVM_IOREQ_H__ */