diff mbox series

[2/8] fwctl: Basic ioctl dispatch for the character device

Message ID 2-v1-9912f1a11620+2a-fwctl_jgg@nvidia.com (mailing list archive)
State Superseded
Headers show
Series Introduce fwctl subystem | expand

Commit Message

Jason Gunthorpe June 3, 2024, 3:53 p.m. UTC
Each file descriptor gets a chunk of per-FD driver specific context that
allows the driver to attach a device specific struct to. The core code
takes care of the memory lifetime for this structure.

The ioctl dispatch and design is based on what was built for iommufd. The
ioctls have a struct which has a combined in/out behavior with a typical
'zero pad' scheme for future extension and backwards compatibility.

Like iommufd some shared logic does most of the ioctl marshalling and
compatibility work and tables diatches to some function pointers for
each unique iotcl.

This approach has proven to work quite well in the iommufd and rdma
subsystems.

Allocate an ioctl number space for the subsystem.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../userspace-api/ioctl/ioctl-number.rst      |   1 +
 MAINTAINERS                                   |   1 +
 drivers/fwctl/main.c                          | 124 +++++++++++++++++-
 include/linux/fwctl.h                         |  31 +++++
 include/uapi/fwctl/fwctl.h                    |  41 ++++++
 5 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/fwctl/fwctl.h

Comments

Zhu Yanjun June 4, 2024, 12:16 p.m. UTC | #1
On 03.06.24 17:53, Jason Gunthorpe wrote:
> Each file descriptor gets a chunk of per-FD driver specific context that
> allows the driver to attach a device specific struct to. The core code
> takes care of the memory lifetime for this structure.
> 
> The ioctl dispatch and design is based on what was built for iommufd. The
> ioctls have a struct which has a combined in/out behavior with a typical
> 'zero pad' scheme for future extension and backwards compatibility.
> 
> Like iommufd some shared logic does most of the ioctl marshalling and
> compatibility work and tables diatches to some function pointers for
> each unique iotcl.
> 
> This approach has proven to work quite well in the iommufd and rdma
> subsystems.
> 
> Allocate an ioctl number space for the subsystem.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   .../userspace-api/ioctl/ioctl-number.rst      |   1 +
>   MAINTAINERS                                   |   1 +
>   drivers/fwctl/main.c                          | 124 +++++++++++++++++-
>   include/linux/fwctl.h                         |  31 +++++
>   include/uapi/fwctl/fwctl.h                    |  41 ++++++
>   5 files changed, 196 insertions(+), 2 deletions(-)
>   create mode 100644 include/uapi/fwctl/fwctl.h
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index a141e8e65c5d3a..4d91c5a20b98c8 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -324,6 +324,7 @@ Code  Seq#    Include File                                           Comments
>   0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
>   0x99  00-0F                                                          537-Addinboard driver
>                                                                        <mailto:buk@buks.ipn.de>
> +0x9A  00-0F  include/uapi/fwctl/fwctl.h
>   0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
>                                                                        <mailto:kenji@bitgate.com>
>   0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 833b853808421e..94062161e9c4d7 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -9084,6 +9084,7 @@ S:	Maintained
>   F:	Documentation/userspace-api/fwctl.rst
>   F:	drivers/fwctl/
>   F:	include/linux/fwctl.h
> +F:	include/uapi/fwctl/
>   
>   GALAXYCORE GC0308 CAMERA SENSOR DRIVER
>   M:	Sebastian Reichel <sre@kernel.org>
> diff --git a/drivers/fwctl/main.c b/drivers/fwctl/main.c
> index ff9b7bad5a2b0d..7ecdabdd9dcb1e 100644
> --- a/drivers/fwctl/main.c
> +++ b/drivers/fwctl/main.c
> @@ -9,26 +9,131 @@
>   #include <linux/container_of.h>
>   #include <linux/fs.h>
>   
> +#include <uapi/fwctl/fwctl.h>
> +
>   enum {
>   	FWCTL_MAX_DEVICES = 256,
>   };
>   static dev_t fwctl_dev;
>   static DEFINE_IDA(fwctl_ida);
>   
> +struct fwctl_ucmd {
> +	struct fwctl_uctx *uctx;
> +	void __user *ubuffer;
> +	void *cmd;
> +	u32 user_size;
> +};
> +
> +/* On stack memory for the ioctl structs */
> +union ucmd_buffer {
> +};
> +
> +struct fwctl_ioctl_op {
> +	unsigned int size;
> +	unsigned int min_size;
> +	unsigned int ioctl_num;
> +	int (*execute)(struct fwctl_ucmd *ucmd);
> +};
> +
> +#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
> +	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \
> +		.size = sizeof(_struct) +                             \
> +			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
> +					  sizeof(_struct)),           \
> +		.min_size = offsetofend(_struct, _last),              \
> +		.ioctl_num = _ioctl,                                  \
> +		.execute = _fn,                                       \
> +	}
> +static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
> +};
> +
> +static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
> +			       unsigned long arg)
> +{
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	const struct fwctl_ioctl_op *op;
> +	struct fwctl_ucmd ucmd = {};
> +	union ucmd_buffer buf;
> +	unsigned int nr;
> +	int ret;
> +
> +	nr = _IOC_NR(cmd);
> +	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
> +		return -ENOIOCTLCMD;
> +	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
> +	if (op->ioctl_num != cmd)
> +		return -ENOIOCTLCMD;
> +
> +	ucmd.uctx = uctx;
> +	ucmd.cmd = &buf;
> +	ucmd.ubuffer = (void __user *)arg;
> +	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
> +	if (ret)
> +		return ret;
> +
> +	if (ucmd.user_size < op->min_size)
> +		return -EINVAL;
> +
> +	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
> +				    ucmd.user_size);
> +	if (ret)
> +		return ret;
> +
> +	guard(rwsem_read)(&uctx->fwctl->registration_lock);
> +	if (!uctx->fwctl->ops)
> +		return -ENODEV;
> +	return op->execute(&ucmd);
> +}
> +
>   static int fwctl_fops_open(struct inode *inode, struct file *filp)
>   {
>   	struct fwctl_device *fwctl =
>   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> +	int ret;
> +
> +	guard(rwsem_read)(&fwctl->registration_lock);
> +	if (!fwctl->ops)
> +		return -ENODEV;
> +
> +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> +	if (!uctx)
> +		return -ENOMEM;
> +
> +	uctx->fwctl = fwctl;
> +	ret = fwctl->ops->open_uctx(uctx);
> +	if (ret)
> +		return ret;

When something is wrong, uctx is freed in "fwctl->ops->open_uctx(uctx);"?

If not, the allocated memory uctx leaks here.

Zhu Yanjun

> +
> +	scoped_guard(mutex, &fwctl->uctx_list_lock) {
> +		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
> +	}
>   
>   	get_device(&fwctl->dev);
> -	filp->private_data = fwctl;
> +	filp->private_data = no_free_ptr(uctx);
>   	return 0;
>   }
>   
> +static void fwctl_destroy_uctx(struct fwctl_uctx *uctx)
> +{
> +	lockdep_assert_held(&uctx->fwctl->uctx_list_lock);
> +	list_del(&uctx->uctx_list_entry);
> +	uctx->fwctl->ops->close_uctx(uctx);
> +}
> +
>   static int fwctl_fops_release(struct inode *inode, struct file *filp)
>   {
> -	struct fwctl_device *fwctl = filp->private_data;
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	struct fwctl_device *fwctl = uctx->fwctl;
>   
> +	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> +		if (fwctl->ops) {
> +			guard(mutex)(&fwctl->uctx_list_lock);
> +			fwctl_destroy_uctx(uctx);
> +		}
> +	}
> +
> +	kfree(uctx);
>   	fwctl_put(fwctl);
>   	return 0;
>   }
> @@ -37,6 +142,7 @@ static const struct file_operations fwctl_fops = {
>   	.owner = THIS_MODULE,
>   	.open = fwctl_fops_open,
>   	.release = fwctl_fops_release,
> +	.unlocked_ioctl = fwctl_fops_ioctl,
>   };
>   
>   static void fwctl_device_release(struct device *device)
> @@ -46,6 +152,7 @@ static void fwctl_device_release(struct device *device)
>   
>   	if (fwctl->dev.devt)
>   		ida_free(&fwctl_ida, fwctl->dev.devt - fwctl_dev);
> +	mutex_destroy(&fwctl->uctx_list_lock);
>   	kfree(fwctl);
>   }
>   
> @@ -69,6 +176,9 @@ _alloc_device(struct device *parent, const struct fwctl_ops *ops, size_t size)
>   		return NULL;
>   	fwctl->dev.class = &fwctl_class;
>   	fwctl->dev.parent = parent;
> +	init_rwsem(&fwctl->registration_lock);
> +	mutex_init(&fwctl->uctx_list_lock);
> +	INIT_LIST_HEAD(&fwctl->uctx_list);
>   	device_initialize(&fwctl->dev);
>   	return_ptr(fwctl);
>   }
> @@ -134,8 +244,18 @@ EXPORT_SYMBOL_NS_GPL(fwctl_register, FWCTL);
>    */
>   void fwctl_unregister(struct fwctl_device *fwctl)
>   {
> +	struct fwctl_uctx *uctx;
> +
>   	cdev_device_del(&fwctl->cdev, &fwctl->dev);
>   
> +	/* Disable and free the driver's resources for any still open FDs. */
> +	guard(rwsem_write)(&fwctl->registration_lock);
> +	guard(mutex)(&fwctl->uctx_list_lock);
> +	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
> +						struct fwctl_uctx,
> +						uctx_list_entry)))
> +		fwctl_destroy_uctx(uctx);
> +
>   	/*
>   	 * The driver module may unload after this returns, the op pointer will
>   	 * not be valid.
> diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
> index ef4eaa87c945e4..1d9651de92fc19 100644
> --- a/include/linux/fwctl.h
> +++ b/include/linux/fwctl.h
> @@ -11,7 +11,20 @@
>   struct fwctl_device;
>   struct fwctl_uctx;
>   
> +/**
> + * struct fwctl_ops - Driver provided operations
> + * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
> + *	bytes of this memory will be a fwctl_uctx. The driver can use the
> + *	remaining bytes as its private memory.
> + * @open_uctx: Called when a file descriptor is opened before the uctx is ever
> + *	used.
> + * @close_uctx: Called when the uctx is destroyed, usually when the FD is
> + *	closed.
> + */
>   struct fwctl_ops {
> +	size_t uctx_size;
> +	int (*open_uctx)(struct fwctl_uctx *uctx);
> +	void (*close_uctx)(struct fwctl_uctx *uctx);
>   };
>   
>   /**
> @@ -26,6 +39,10 @@ struct fwctl_device {
>   	struct device dev;
>   	/* private: */
>   	struct cdev cdev;
> +
> +	struct rw_semaphore registration_lock;
> +	struct mutex uctx_list_lock;
> +	struct list_head uctx_list;
>   	const struct fwctl_ops *ops;
>   };
>   
> @@ -65,4 +82,18 @@ DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T));
>   int fwctl_register(struct fwctl_device *fwctl);
>   void fwctl_unregister(struct fwctl_device *fwctl);
>   
> +/**
> + * struct fwctl_uctx - Per user FD context
> + * @fwctl: fwctl instance that owns the context
> + *
> + * Every FD opened by userspace will get a unique context allocation. Any driver
> + * private data will follow immediately after.
> + */
> +struct fwctl_uctx {
> +	struct fwctl_device *fwctl;
> +	/* private: */
> +	/* Head at fwctl_device::uctx_list */
> +	struct list_head uctx_list_entry;
> +};
> +
>   #endif
> diff --git a/include/uapi/fwctl/fwctl.h b/include/uapi/fwctl/fwctl.h
> new file mode 100644
> index 00000000000000..0bdce95b6d69d9
> --- /dev/null
> +++ b/include/uapi/fwctl/fwctl.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#ifndef _UAPI_FWCTL_H
> +#define _UAPI_FWCTL_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define FWCTL_TYPE 0x9A
> +
> +/**
> + * DOC: General ioctl format
> + *
> + * The ioctl interface follows a general format to allow for extensibility. Each
> + * ioctl is passed in a structure pointer as the argument providing the size of
> + * the structure in the first u32. The kernel checks that any structure space
> + * beyond what it understands is 0. This allows userspace to use the backward
> + * compatible portion while consistently using the newer, larger, structures.
> + *
> + * ioctls use a standard meaning for common errnos:
> + *
> + *  - ENOTTY: The IOCTL number itself is not supported at all
> + *  - E2BIG: The IOCTL number is supported, but the provided structure has
> + *    non-zero in a part the kernel does not understand.
> + *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
> + *    understood, however a known field has a value the kernel does not
> + *    understand or support.
> + *  - EINVAL: Everything about the IOCTL was understood, but a field is not
> + *    correct.
> + *  - ENOMEM: Out of memory.
> + *  - ENODEV: The underlying device has been hot-unplugged and the FD is
> + *            orphaned.
> + *
> + * As well as additional errnos, within specific ioctls.
> + */
> +enum {
> +	FWCTL_CMD_BASE = 0,
> +};
> +
> +#endif
Leon Romanovsky June 4, 2024, 12:22 p.m. UTC | #2
On Tue, Jun 04, 2024 at 02:16:12PM +0200, Zhu Yanjun wrote:
> On 03.06.24 17:53, Jason Gunthorpe wrote:
> > Each file descriptor gets a chunk of per-FD driver specific context that
> > allows the driver to attach a device specific struct to. The core code
> > takes care of the memory lifetime for this structure.
> > 
> > The ioctl dispatch and design is based on what was built for iommufd. The
> > ioctls have a struct which has a combined in/out behavior with a typical
> > 'zero pad' scheme for future extension and backwards compatibility.
> > 
> > Like iommufd some shared logic does most of the ioctl marshalling and
> > compatibility work and tables diatches to some function pointers for
> > each unique iotcl.
> > 
> > This approach has proven to work quite well in the iommufd and rdma
> > subsystems.
> > 
> > Allocate an ioctl number space for the subsystem.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >   .../userspace-api/ioctl/ioctl-number.rst      |   1 +
> >   MAINTAINERS                                   |   1 +
> >   drivers/fwctl/main.c                          | 124 +++++++++++++++++-
> >   include/linux/fwctl.h                         |  31 +++++
> >   include/uapi/fwctl/fwctl.h                    |  41 ++++++
> >   5 files changed, 196 insertions(+), 2 deletions(-)
> >   create mode 100644 include/uapi/fwctl/fwctl.h

<...>

> >   static int fwctl_fops_open(struct inode *inode, struct file *filp)
> >   {
> >   	struct fwctl_device *fwctl =
> >   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> > +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> > +	int ret;
> > +
> > +	guard(rwsem_read)(&fwctl->registration_lock);
> > +	if (!fwctl->ops)
> > +		return -ENODEV;
> > +
> > +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> > +	if (!uctx)
> > +		return -ENOMEM;
> > +
> > +	uctx->fwctl = fwctl;
> > +	ret = fwctl->ops->open_uctx(uctx);
> > +	if (ret)
> > +		return ret;
> 
> When something is wrong, uctx is freed in "fwctl->ops->open_uctx(uctx);"?
> 
> If not, the allocated memory uctx leaks here.

See how uctx is declared:
struct fwctl_uctx *uctx __free(kfree) = NULL;

It will be released automatically.
See include/linux/cleanup.h for more details.

Thanks
Jonathan Cameron June 4, 2024, 4:50 p.m. UTC | #3
On Tue, 4 Jun 2024 15:22:21 +0300
Leon Romanovsky <leon@kernel.org> wrote:

> On Tue, Jun 04, 2024 at 02:16:12PM +0200, Zhu Yanjun wrote:
> > On 03.06.24 17:53, Jason Gunthorpe wrote:  
> > > Each file descriptor gets a chunk of per-FD driver specific context that
> > > allows the driver to attach a device specific struct to. The core code
> > > takes care of the memory lifetime for this structure.
> > > 
> > > The ioctl dispatch and design is based on what was built for iommufd. The
> > > ioctls have a struct which has a combined in/out behavior with a typical
> > > 'zero pad' scheme for future extension and backwards compatibility.
> > > 
> > > Like iommufd some shared logic does most of the ioctl marshalling and
> > > compatibility work and tables diatches to some function pointers for
> > > each unique iotcl.
> > > 
> > > This approach has proven to work quite well in the iommufd and rdma
> > > subsystems.
> > > 
> > > Allocate an ioctl number space for the subsystem.
> > > 
> > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > ---
> > >   .../userspace-api/ioctl/ioctl-number.rst      |   1 +
> > >   MAINTAINERS                                   |   1 +
> > >   drivers/fwctl/main.c                          | 124 +++++++++++++++++-
> > >   include/linux/fwctl.h                         |  31 +++++
> > >   include/uapi/fwctl/fwctl.h                    |  41 ++++++
> > >   5 files changed, 196 insertions(+), 2 deletions(-)
> > >   create mode 100644 include/uapi/fwctl/fwctl.h  
> 
> <...>
> 
> > >   static int fwctl_fops_open(struct inode *inode, struct file *filp)
> > >   {
> > >   	struct fwctl_device *fwctl =
> > >   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> > > +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> > > +	int ret;
> > > +
> > > +	guard(rwsem_read)(&fwctl->registration_lock);
> > > +	if (!fwctl->ops)
> > > +		return -ENODEV;
> > > +
> > > +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> > > +	if (!uctx)
> > > +		return -ENOMEM;
> > > +
> > > +	uctx->fwctl = fwctl;
> > > +	ret = fwctl->ops->open_uctx(uctx);
> > > +	if (ret)
> > > +		return ret;  
> > 
> > When something is wrong, uctx is freed in "fwctl->ops->open_uctx(uctx);"?
> > 
> > If not, the allocated memory uctx leaks here.  
> 
> See how uctx is declared:
> struct fwctl_uctx *uctx __free(kfree) = NULL;
> 
> It will be released automatically.
> See include/linux/cleanup.h for more details.

I'm lazy so not finding the discussion now, but Linus has been pretty clear
that he doesn't like this pattern because of possibility of additional cleanup
magic getting introduced and then the cleanup happening in an order that
causes problems. 

Preferred option is drag the declaration to where is initialized so break
with our tradition of declarations all at the top

struct fwctl_uctx *uctx __free(kfree) =
	kzalloc(...);
etc


> 
> Thanks
>
Jason Gunthorpe June 4, 2024, 4:58 p.m. UTC | #4
On Tue, Jun 04, 2024 at 05:50:23PM +0100, Jonathan Cameron wrote:

> > > >   static int fwctl_fops_open(struct inode *inode, struct file *filp)
> > > >   {
> > > >   	struct fwctl_device *fwctl =
> > > >   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> > > > +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> > > > +	int ret;
> > > > +
> > > > +	guard(rwsem_read)(&fwctl->registration_lock);
> > > > +	if (!fwctl->ops)
> > > > +		return -ENODEV;
> > > > +
> > > > +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> > > > +	if (!uctx)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	uctx->fwctl = fwctl;
> > > > +	ret = fwctl->ops->open_uctx(uctx);
> > > > +	if (ret)
> > > > +		return ret;  
> > > 
> > > When something is wrong, uctx is freed in "fwctl->ops->open_uctx(uctx);"?
> > > 
> > > If not, the allocated memory uctx leaks here.  
> > 
> > See how uctx is declared:
> > struct fwctl_uctx *uctx __free(kfree) = NULL;
> > 
> > It will be released automatically.
> > See include/linux/cleanup.h for more details.
> 
> I'm lazy so not finding the discussion now, but Linus has been pretty clear
> that he doesn't like this pattern because of possibility of additional cleanup
> magic getting introduced and then the cleanup happening in an order that
> causes problems. 

I saw that discussion, but I thought it was talking about the macro
behavior - ie guard() creates a variable hidden in the macro.

The point about order is interesting though - notice the above will
free the uctx after unlocking (which is the slightly more preferred
order here), but it is easy to imagine cases where that order would be
wrong.

> Preferred option is drag the declaration to where is initialized so break
> with our tradition of declarations all at the top
> 
> struct fwctl_uctx *uctx __free(kfree) =
> 	kzalloc(...);

I don't recall that dramatic conclusion in the discussion, but it does
make alot of sense to me.

Thanks,
Jason
Jonathan Cameron June 5, 2024, 11:07 a.m. UTC | #5
On Tue, 4 Jun 2024 13:58:44 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Jun 04, 2024 at 05:50:23PM +0100, Jonathan Cameron wrote:
> 
> > > > >   static int fwctl_fops_open(struct inode *inode, struct file *filp)
> > > > >   {
> > > > >   	struct fwctl_device *fwctl =
> > > > >   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> > > > > +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> > > > > +	int ret;
> > > > > +
> > > > > +	guard(rwsem_read)(&fwctl->registration_lock);
> > > > > +	if (!fwctl->ops)
> > > > > +		return -ENODEV;
> > > > > +
> > > > > +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> > > > > +	if (!uctx)
> > > > > +		return -ENOMEM;
> > > > > +
> > > > > +	uctx->fwctl = fwctl;
> > > > > +	ret = fwctl->ops->open_uctx(uctx);
> > > > > +	if (ret)
> > > > > +		return ret;    
> > > > 
> > > > When something is wrong, uctx is freed in "fwctl->ops->open_uctx(uctx);"?
> > > > 
> > > > If not, the allocated memory uctx leaks here.    
> > > 
> > > See how uctx is declared:
> > > struct fwctl_uctx *uctx __free(kfree) = NULL;
> > > 
> > > It will be released automatically.
> > > See include/linux/cleanup.h for more details.  
> > 
> > I'm lazy so not finding the discussion now, but Linus has been pretty clear
> > that he doesn't like this pattern because of possibility of additional cleanup
> > magic getting introduced and then the cleanup happening in an order that
> > causes problems.   
> 
> I saw that discussion, but I thought it was talking about the macro
> behavior - ie guard() creates a variable hidden in the macro.
> 
> The point about order is interesting though - notice the above will
> free the uctx after unlocking (which is the slightly more preferred
> order here), but it is easy to imagine cases where that order would be
> wrong.
> 
> > Preferred option is drag the declaration to where is initialized so break
> > with our tradition of declarations all at the top
> > 
> > struct fwctl_uctx *uctx __free(kfree) =
> > 	kzalloc(...);  
> 
> I don't recall that dramatic conclusion in the discussion, but it does
> make alot of sense to me.

I'll be less lazy (and today found the search foo to track it down).

https://lore.kernel.org/all/CAHk-=wicfvWPuRVDG5R1mZSxD8Xg=-0nLOiHay2T_UJ0yDX42g@mail.gmail.com/
Linus:
> IOW, my current thinking is "let's always have the constructor and
> destructor together", and see how it ends up going.

Not set in stone but I've not yet seen a suggestion of the opposite.

The example from Bartosz that got that response was
Bartosz:
> void foo(void)
> {
>     char *s __free(kfree) = NULL;
> 
>     do_stuff();
>     s = kmalloc(42, GFP_KERNEL);
> }
> 
> Or does it always have to be:
> 
> void foo(void)
> {
>     do_stuff();
>     char *s __free(kfree) = kmalloc(42, GFP_KERNEL);
> }
So option 2.

Jonathan

> 
> Thanks,
> Jason
Przemek Kitszel June 5, 2024, 3:42 p.m. UTC | #6
On 6/3/24 17:53, Jason Gunthorpe wrote:
> Each file descriptor gets a chunk of per-FD driver specific context that
> allows the driver to attach a device specific struct to. The core code
> takes care of the memory lifetime for this structure.
> 
> The ioctl dispatch and design is based on what was built for iommufd. The
> ioctls have a struct which has a combined in/out behavior with a typical
> 'zero pad' scheme for future extension and backwards compatibility.

I would go one step further and introduce a new syscall, that would
smooth out typical problems of ioctl, and base it on some TLV scheme
(similar to netlink, in some kind a way smaller brother of protobuf).
Perhaps with the name more broad than fw-knob-tuning.

Then I would go two steps back and a driver layer to interpert those
syscalls to have at least some sort of openness.

> 
> Like iommufd some shared logic does most of the ioctl marshalling and
> compatibility work and tables diatches to some function pointers for
> each unique iotcl.
> 
> This approach has proven to work quite well in the iommufd and rdma
> subsystems.
> 
> Allocate an ioctl number space for the subsystem.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>   .../userspace-api/ioctl/ioctl-number.rst      |   1 +
>   MAINTAINERS                                   |   1 +
>   drivers/fwctl/main.c                          | 124 +++++++++++++++++-
>   include/linux/fwctl.h                         |  31 +++++
>   include/uapi/fwctl/fwctl.h                    |  41 ++++++
>   5 files changed, 196 insertions(+), 2 deletions(-)
>   create mode 100644 include/uapi/fwctl/fwctl.h
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index a141e8e65c5d3a..4d91c5a20b98c8 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -324,6 +324,7 @@ Code  Seq#    Include File                                           Comments
>   0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
>   0x99  00-0F                                                          537-Addinboard driver
>                                                                        <mailto:buk@buks.ipn.de>
> +0x9A  00-0F  include/uapi/fwctl/fwctl.h
>   0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
>                                                                        <mailto:kenji@bitgate.com>
>   0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 833b853808421e..94062161e9c4d7 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -9084,6 +9084,7 @@ S:	Maintained
>   F:	Documentation/userspace-api/fwctl.rst
>   F:	drivers/fwctl/
>   F:	include/linux/fwctl.h
> +F:	include/uapi/fwctl/
>   
>   GALAXYCORE GC0308 CAMERA SENSOR DRIVER
>   M:	Sebastian Reichel <sre@kernel.org>
> diff --git a/drivers/fwctl/main.c b/drivers/fwctl/main.c
> index ff9b7bad5a2b0d..7ecdabdd9dcb1e 100644
> --- a/drivers/fwctl/main.c
> +++ b/drivers/fwctl/main.c
> @@ -9,26 +9,131 @@
>   #include <linux/container_of.h>
>   #include <linux/fs.h>
>   
> +#include <uapi/fwctl/fwctl.h>
> +
>   enum {
>   	FWCTL_MAX_DEVICES = 256,
>   };
>   static dev_t fwctl_dev;
>   static DEFINE_IDA(fwctl_ida);
>   
> +struct fwctl_ucmd {
> +	struct fwctl_uctx *uctx;
> +	void __user *ubuffer;
> +	void *cmd;
> +	u32 user_size;
> +};
> +
> +/* On stack memory for the ioctl structs */
> +union ucmd_buffer {
> +};
> +
> +struct fwctl_ioctl_op {
> +	unsigned int size;
> +	unsigned int min_size;
> +	unsigned int ioctl_num;
> +	int (*execute)(struct fwctl_ucmd *ucmd);
> +};
> +
> +#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
> +	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \
> +		.size = sizeof(_struct) +                             \
> +			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
> +					  sizeof(_struct)),           \
> +		.min_size = offsetofend(_struct, _last),              \
> +		.ioctl_num = _ioctl,                                  \
> +		.execute = _fn,                                       \
> +	}
> +static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
> +};
> +
> +static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
> +			       unsigned long arg)
> +{
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	const struct fwctl_ioctl_op *op;
> +	struct fwctl_ucmd ucmd = {};
> +	union ucmd_buffer buf;
> +	unsigned int nr;
> +	int ret;
> +
> +	nr = _IOC_NR(cmd);
> +	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
> +		return -ENOIOCTLCMD;
> +	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
> +	if (op->ioctl_num != cmd)
> +		return -ENOIOCTLCMD;
> +
> +	ucmd.uctx = uctx;
> +	ucmd.cmd = &buf;
> +	ucmd.ubuffer = (void __user *)arg;
> +	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
> +	if (ret)
> +		return ret;
> +
> +	if (ucmd.user_size < op->min_size)
> +		return -EINVAL;
> +
> +	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
> +				    ucmd.user_size);
> +	if (ret)
> +		return ret;
> +
> +	guard(rwsem_read)(&uctx->fwctl->registration_lock);
> +	if (!uctx->fwctl->ops)
> +		return -ENODEV;
> +	return op->execute(&ucmd);
> +}
> +
>   static int fwctl_fops_open(struct inode *inode, struct file *filp)
>   {
>   	struct fwctl_device *fwctl =
>   		container_of(inode->i_cdev, struct fwctl_device, cdev);
> +	struct fwctl_uctx *uctx __free(kfree) = NULL;
> +	int ret;
> +
> +	guard(rwsem_read)(&fwctl->registration_lock);
> +	if (!fwctl->ops)
> +		return -ENODEV;
> +
> +	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
> +	if (!uctx)
> +		return -ENOMEM;
> +
> +	uctx->fwctl = fwctl;
> +	ret = fwctl->ops->open_uctx(uctx);
> +	if (ret)
> +		return ret;
> +
> +	scoped_guard(mutex, &fwctl->uctx_list_lock) {
> +		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
> +	}
>   
>   	get_device(&fwctl->dev);
> -	filp->private_data = fwctl;
> +	filp->private_data = no_free_ptr(uctx);
>   	return 0;
>   }
>   
> +static void fwctl_destroy_uctx(struct fwctl_uctx *uctx)
> +{
> +	lockdep_assert_held(&uctx->fwctl->uctx_list_lock);
> +	list_del(&uctx->uctx_list_entry);
> +	uctx->fwctl->ops->close_uctx(uctx);
> +}
> +
>   static int fwctl_fops_release(struct inode *inode, struct file *filp)
>   {
> -	struct fwctl_device *fwctl = filp->private_data;
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	struct fwctl_device *fwctl = uctx->fwctl;
>   
> +	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> +		if (fwctl->ops) {
> +			guard(mutex)(&fwctl->uctx_list_lock);
> +			fwctl_destroy_uctx(uctx);
> +		}
> +	}
> +
> +	kfree(uctx);
>   	fwctl_put(fwctl);
>   	return 0;
>   }
> @@ -37,6 +142,7 @@ static const struct file_operations fwctl_fops = {
>   	.owner = THIS_MODULE,
>   	.open = fwctl_fops_open,
>   	.release = fwctl_fops_release,
> +	.unlocked_ioctl = fwctl_fops_ioctl,
>   };
>   
>   static void fwctl_device_release(struct device *device)
> @@ -46,6 +152,7 @@ static void fwctl_device_release(struct device *device)
>   
>   	if (fwctl->dev.devt)
>   		ida_free(&fwctl_ida, fwctl->dev.devt - fwctl_dev);
> +	mutex_destroy(&fwctl->uctx_list_lock);
>   	kfree(fwctl);
>   }
>   
> @@ -69,6 +176,9 @@ _alloc_device(struct device *parent, const struct fwctl_ops *ops, size_t size)
>   		return NULL;
>   	fwctl->dev.class = &fwctl_class;
>   	fwctl->dev.parent = parent;
> +	init_rwsem(&fwctl->registration_lock);
> +	mutex_init(&fwctl->uctx_list_lock);
> +	INIT_LIST_HEAD(&fwctl->uctx_list);
>   	device_initialize(&fwctl->dev);
>   	return_ptr(fwctl);
>   }
> @@ -134,8 +244,18 @@ EXPORT_SYMBOL_NS_GPL(fwctl_register, FWCTL);
>    */
>   void fwctl_unregister(struct fwctl_device *fwctl)
>   {
> +	struct fwctl_uctx *uctx;
> +
>   	cdev_device_del(&fwctl->cdev, &fwctl->dev);
>   
> +	/* Disable and free the driver's resources for any still open FDs. */
> +	guard(rwsem_write)(&fwctl->registration_lock);
> +	guard(mutex)(&fwctl->uctx_list_lock);
> +	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
> +						struct fwctl_uctx,
> +						uctx_list_entry)))
> +		fwctl_destroy_uctx(uctx);
> +
>   	/*
>   	 * The driver module may unload after this returns, the op pointer will
>   	 * not be valid.
> diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
> index ef4eaa87c945e4..1d9651de92fc19 100644
> --- a/include/linux/fwctl.h
> +++ b/include/linux/fwctl.h
> @@ -11,7 +11,20 @@
>   struct fwctl_device;
>   struct fwctl_uctx;
>   
> +/**
> + * struct fwctl_ops - Driver provided operations
> + * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
> + *	bytes of this memory will be a fwctl_uctx. The driver can use the
> + *	remaining bytes as its private memory.
> + * @open_uctx: Called when a file descriptor is opened before the uctx is ever
> + *	used.
> + * @close_uctx: Called when the uctx is destroyed, usually when the FD is
> + *	closed.
> + */
>   struct fwctl_ops {
> +	size_t uctx_size;
> +	int (*open_uctx)(struct fwctl_uctx *uctx);
> +	void (*close_uctx)(struct fwctl_uctx *uctx);
>   };
>   
>   /**
> @@ -26,6 +39,10 @@ struct fwctl_device {
>   	struct device dev;
>   	/* private: */
>   	struct cdev cdev;
> +
> +	struct rw_semaphore registration_lock;
> +	struct mutex uctx_list_lock;
> +	struct list_head uctx_list;
>   	const struct fwctl_ops *ops;
>   };
>   
> @@ -65,4 +82,18 @@ DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T));
>   int fwctl_register(struct fwctl_device *fwctl);
>   void fwctl_unregister(struct fwctl_device *fwctl);
>   
> +/**
> + * struct fwctl_uctx - Per user FD context
> + * @fwctl: fwctl instance that owns the context
> + *
> + * Every FD opened by userspace will get a unique context allocation. Any driver
> + * private data will follow immediately after.
> + */
> +struct fwctl_uctx {
> +	struct fwctl_device *fwctl;
> +	/* private: */
> +	/* Head at fwctl_device::uctx_list */
> +	struct list_head uctx_list_entry;
> +};
> +
>   #endif
> diff --git a/include/uapi/fwctl/fwctl.h b/include/uapi/fwctl/fwctl.h
> new file mode 100644
> index 00000000000000..0bdce95b6d69d9
> --- /dev/null
> +++ b/include/uapi/fwctl/fwctl.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#ifndef _UAPI_FWCTL_H
> +#define _UAPI_FWCTL_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define FWCTL_TYPE 0x9A
> +
> +/**
> + * DOC: General ioctl format
> + *
> + * The ioctl interface follows a general format to allow for extensibility. Each
> + * ioctl is passed in a structure pointer as the argument providing the size of
> + * the structure in the first u32. The kernel checks that any structure space
> + * beyond what it understands is 0. This allows userspace to use the backward
> + * compatible portion while consistently using the newer, larger, structures.
> + *
> + * ioctls use a standard meaning for common errnos:
> + *
> + *  - ENOTTY: The IOCTL number itself is not supported at all
> + *  - E2BIG: The IOCTL number is supported, but the provided structure has
> + *    non-zero in a part the kernel does not understand.
> + *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
> + *    understood, however a known field has a value the kernel does not
> + *    understand or support.
> + *  - EINVAL: Everything about the IOCTL was understood, but a field is not
> + *    correct.
> + *  - ENOMEM: Out of memory.
> + *  - ENODEV: The underlying device has been hot-unplugged and the FD is
> + *            orphaned.
> + *
> + * As well as additional errnos, within specific ioctls.
> + */
> +enum {
> +	FWCTL_CMD_BASE = 0,
> +};
> +
> +#endif
Jason Gunthorpe June 5, 2024, 3:49 p.m. UTC | #7
On Wed, Jun 05, 2024 at 05:42:51PM +0200, Przemek Kitszel wrote:
> On 6/3/24 17:53, Jason Gunthorpe wrote:
> > Each file descriptor gets a chunk of per-FD driver specific context that
> > allows the driver to attach a device specific struct to. The core code
> > takes care of the memory lifetime for this structure.
> > 
> > The ioctl dispatch and design is based on what was built for iommufd. The
> > ioctls have a struct which has a combined in/out behavior with a typical
> > 'zero pad' scheme for future extension and backwards compatibility.
> 
> I would go one step further and introduce a new syscall, that would
> smooth out typical problems of ioctl, and base it on some TLV scheme
> (similar to netlink, in some kind a way smaller brother of protobuf).
> Perhaps with the name more broad than fw-knob-tuning.

We did a TLV scheme like netlink for RDMA. It is very complex and
frankly I think it is overkill for what this wants to do. It suited
RDMA because the system call interface is so vast there.

If the kernel had a general TLV path as an alternative to ioctl it
could be very interesting. I thought about generalizing the RDMA stuff
once, and even gave a small talk at LPC on some of the ideas, but
didn't have the bravery or justification to actually try to do it.

> Then I would go two steps back and a driver layer to interpert those
> syscalls to have at least some sort of openness.

I don't envision having thick drivers marshaling and unmarshaling FW
messages to obfuscate the data flow. The purpose here is what it says
on the label, to be a thin and simple path to sends native commands
with a security apparatus.

Jason
Jason Gunthorpe June 5, 2024, 6:27 p.m. UTC | #8
On Wed, Jun 05, 2024 at 12:07:37PM +0100, Jonathan Cameron wrote:

> > I don't recall that dramatic conclusion in the discussion, but it does
> > make alot of sense to me.
> 
> I'll be less lazy (and today found the search foo to track it down).
> 
> https://lore.kernel.org/all/CAHk-=wicfvWPuRVDG5R1mZSxD8Xg=-0nLOiHay2T_UJ0yDX42g@mail.gmail.com/

Oh that is a bit different discussion than I was thinking of.. I fixed
all the cases to follow this advise and checked that all the free
functions are proper pairs of whatever is being allocated.

Thanks,
Jason
Jonathan Cameron June 6, 2024, 1:34 p.m. UTC | #9
On Wed, 5 Jun 2024 15:27:26 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Jun 05, 2024 at 12:07:37PM +0100, Jonathan Cameron wrote:
> 
> > > I don't recall that dramatic conclusion in the discussion, but it does
> > > make alot of sense to me.  
> > 
> > I'll be less lazy (and today found the search foo to track it down).
> > 
> > https://lore.kernel.org/all/CAHk-=wicfvWPuRVDG5R1mZSxD8Xg=-0nLOiHay2T_UJ0yDX42g@mail.gmail.com/  
> 
> Oh that is a bit different discussion than I was thinking of.. I fixed
> all the cases to follow this advise and checked that all the free
> functions are proper pairs of whatever is being allocated.

Yes. I think we are approaching the point where maybe we need
a 'best practice guide' somewhere. It is sort of coding style, but
it is perhaps rather complex perhaps to put in that doc.

I'm happy to help review such changes, but it would be too far down
my todo list if I took on writing one.

Maybe there is one I've missed?

Jonathan
> 
> Thanks,
> Jason
Randy Dunlap June 6, 2024, 3:37 p.m. UTC | #10
On 6/6/24 6:34 AM, Jonathan Cameron wrote:
> On Wed, 5 Jun 2024 15:27:26 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
>> On Wed, Jun 05, 2024 at 12:07:37PM +0100, Jonathan Cameron wrote:
>>
>>>> I don't recall that dramatic conclusion in the discussion, but it does
>>>> make alot of sense to me.  
>>>
>>> I'll be less lazy (and today found the search foo to track it down).
>>>
>>> https://lore.kernel.org/all/CAHk-=wicfvWPuRVDG5R1mZSxD8Xg=-0nLOiHay2T_UJ0yDX42g@mail.gmail.com/  
>>
>> Oh that is a bit different discussion than I was thinking of.. I fixed
>> all the cases to follow this advise and checked that all the free
>> functions are proper pairs of whatever is being allocated.
> 
> Yes. I think we are approaching the point where maybe we need
> a 'best practice guide' somewhere. It is sort of coding style, but
> it is perhaps rather complex perhaps to put in that doc.
> 
> I'm happy to help review such changes, but it would be too far down
> my todo list if I took on writing one.
> 
> Maybe there is one I've missed?


There is not a published one that I know of, other than one that I pasted into
an email in Dec-2023, in this post:

https://lore.kernel.org/lkml/34e27c57-fc18-4918-bf44-4f8a53825361@infradead.org/
diff mbox series

Patch

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a141e8e65c5d3a..4d91c5a20b98c8 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -324,6 +324,7 @@  Code  Seq#    Include File                                           Comments
 0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
 0x99  00-0F                                                          537-Addinboard driver
                                                                      <mailto:buk@buks.ipn.de>
+0x9A  00-0F  include/uapi/fwctl/fwctl.h
 0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
                                                                      <mailto:kenji@bitgate.com>
 0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
diff --git a/MAINTAINERS b/MAINTAINERS
index 833b853808421e..94062161e9c4d7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9084,6 +9084,7 @@  S:	Maintained
 F:	Documentation/userspace-api/fwctl.rst
 F:	drivers/fwctl/
 F:	include/linux/fwctl.h
+F:	include/uapi/fwctl/
 
 GALAXYCORE GC0308 CAMERA SENSOR DRIVER
 M:	Sebastian Reichel <sre@kernel.org>
diff --git a/drivers/fwctl/main.c b/drivers/fwctl/main.c
index ff9b7bad5a2b0d..7ecdabdd9dcb1e 100644
--- a/drivers/fwctl/main.c
+++ b/drivers/fwctl/main.c
@@ -9,26 +9,131 @@ 
 #include <linux/container_of.h>
 #include <linux/fs.h>
 
+#include <uapi/fwctl/fwctl.h>
+
 enum {
 	FWCTL_MAX_DEVICES = 256,
 };
 static dev_t fwctl_dev;
 static DEFINE_IDA(fwctl_ida);
 
+struct fwctl_ucmd {
+	struct fwctl_uctx *uctx;
+	void __user *ubuffer;
+	void *cmd;
+	u32 user_size;
+};
+
+/* On stack memory for the ioctl structs */
+union ucmd_buffer {
+};
+
+struct fwctl_ioctl_op {
+	unsigned int size;
+	unsigned int min_size;
+	unsigned int ioctl_num;
+	int (*execute)(struct fwctl_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
+	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \
+		.size = sizeof(_struct) +                             \
+			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+					  sizeof(_struct)),           \
+		.min_size = offsetofend(_struct, _last),              \
+		.ioctl_num = _ioctl,                                  \
+		.execute = _fn,                                       \
+	}
+static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
+};
+
+static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct fwctl_uctx *uctx = filp->private_data;
+	const struct fwctl_ioctl_op *op;
+	struct fwctl_ucmd ucmd = {};
+	union ucmd_buffer buf;
+	unsigned int nr;
+	int ret;
+
+	nr = _IOC_NR(cmd);
+	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
+		return -ENOIOCTLCMD;
+	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
+	if (op->ioctl_num != cmd)
+		return -ENOIOCTLCMD;
+
+	ucmd.uctx = uctx;
+	ucmd.cmd = &buf;
+	ucmd.ubuffer = (void __user *)arg;
+	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+	if (ret)
+		return ret;
+
+	if (ucmd.user_size < op->min_size)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+				    ucmd.user_size);
+	if (ret)
+		return ret;
+
+	guard(rwsem_read)(&uctx->fwctl->registration_lock);
+	if (!uctx->fwctl->ops)
+		return -ENODEV;
+	return op->execute(&ucmd);
+}
+
 static int fwctl_fops_open(struct inode *inode, struct file *filp)
 {
 	struct fwctl_device *fwctl =
 		container_of(inode->i_cdev, struct fwctl_device, cdev);
+	struct fwctl_uctx *uctx __free(kfree) = NULL;
+	int ret;
+
+	guard(rwsem_read)(&fwctl->registration_lock);
+	if (!fwctl->ops)
+		return -ENODEV;
+
+	uctx = kzalloc(fwctl->ops->uctx_size, GFP_KERNEL |  GFP_KERNEL_ACCOUNT);
+	if (!uctx)
+		return -ENOMEM;
+
+	uctx->fwctl = fwctl;
+	ret = fwctl->ops->open_uctx(uctx);
+	if (ret)
+		return ret;
+
+	scoped_guard(mutex, &fwctl->uctx_list_lock) {
+		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
+	}
 
 	get_device(&fwctl->dev);
-	filp->private_data = fwctl;
+	filp->private_data = no_free_ptr(uctx);
 	return 0;
 }
 
+static void fwctl_destroy_uctx(struct fwctl_uctx *uctx)
+{
+	lockdep_assert_held(&uctx->fwctl->uctx_list_lock);
+	list_del(&uctx->uctx_list_entry);
+	uctx->fwctl->ops->close_uctx(uctx);
+}
+
 static int fwctl_fops_release(struct inode *inode, struct file *filp)
 {
-	struct fwctl_device *fwctl = filp->private_data;
+	struct fwctl_uctx *uctx = filp->private_data;
+	struct fwctl_device *fwctl = uctx->fwctl;
 
+	scoped_guard(rwsem_read, &fwctl->registration_lock) {
+		if (fwctl->ops) {
+			guard(mutex)(&fwctl->uctx_list_lock);
+			fwctl_destroy_uctx(uctx);
+		}
+	}
+
+	kfree(uctx);
 	fwctl_put(fwctl);
 	return 0;
 }
@@ -37,6 +142,7 @@  static const struct file_operations fwctl_fops = {
 	.owner = THIS_MODULE,
 	.open = fwctl_fops_open,
 	.release = fwctl_fops_release,
+	.unlocked_ioctl = fwctl_fops_ioctl,
 };
 
 static void fwctl_device_release(struct device *device)
@@ -46,6 +152,7 @@  static void fwctl_device_release(struct device *device)
 
 	if (fwctl->dev.devt)
 		ida_free(&fwctl_ida, fwctl->dev.devt - fwctl_dev);
+	mutex_destroy(&fwctl->uctx_list_lock);
 	kfree(fwctl);
 }
 
@@ -69,6 +176,9 @@  _alloc_device(struct device *parent, const struct fwctl_ops *ops, size_t size)
 		return NULL;
 	fwctl->dev.class = &fwctl_class;
 	fwctl->dev.parent = parent;
+	init_rwsem(&fwctl->registration_lock);
+	mutex_init(&fwctl->uctx_list_lock);
+	INIT_LIST_HEAD(&fwctl->uctx_list);
 	device_initialize(&fwctl->dev);
 	return_ptr(fwctl);
 }
@@ -134,8 +244,18 @@  EXPORT_SYMBOL_NS_GPL(fwctl_register, FWCTL);
  */
 void fwctl_unregister(struct fwctl_device *fwctl)
 {
+	struct fwctl_uctx *uctx;
+
 	cdev_device_del(&fwctl->cdev, &fwctl->dev);
 
+	/* Disable and free the driver's resources for any still open FDs. */
+	guard(rwsem_write)(&fwctl->registration_lock);
+	guard(mutex)(&fwctl->uctx_list_lock);
+	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
+						struct fwctl_uctx,
+						uctx_list_entry)))
+		fwctl_destroy_uctx(uctx);
+
 	/*
 	 * The driver module may unload after this returns, the op pointer will
 	 * not be valid.
diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
index ef4eaa87c945e4..1d9651de92fc19 100644
--- a/include/linux/fwctl.h
+++ b/include/linux/fwctl.h
@@ -11,7 +11,20 @@ 
 struct fwctl_device;
 struct fwctl_uctx;
 
+/**
+ * struct fwctl_ops - Driver provided operations
+ * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
+ *	bytes of this memory will be a fwctl_uctx. The driver can use the
+ *	remaining bytes as its private memory.
+ * @open_uctx: Called when a file descriptor is opened before the uctx is ever
+ *	used.
+ * @close_uctx: Called when the uctx is destroyed, usually when the FD is
+ *	closed.
+ */
 struct fwctl_ops {
+	size_t uctx_size;
+	int (*open_uctx)(struct fwctl_uctx *uctx);
+	void (*close_uctx)(struct fwctl_uctx *uctx);
 };
 
 /**
@@ -26,6 +39,10 @@  struct fwctl_device {
 	struct device dev;
 	/* private: */
 	struct cdev cdev;
+
+	struct rw_semaphore registration_lock;
+	struct mutex uctx_list_lock;
+	struct list_head uctx_list;
 	const struct fwctl_ops *ops;
 };
 
@@ -65,4 +82,18 @@  DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T));
 int fwctl_register(struct fwctl_device *fwctl);
 void fwctl_unregister(struct fwctl_device *fwctl);
 
+/**
+ * struct fwctl_uctx - Per user FD context
+ * @fwctl: fwctl instance that owns the context
+ *
+ * Every FD opened by userspace will get a unique context allocation. Any driver
+ * private data will follow immediately after.
+ */
+struct fwctl_uctx {
+	struct fwctl_device *fwctl;
+	/* private: */
+	/* Head at fwctl_device::uctx_list */
+	struct list_head uctx_list_entry;
+};
+
 #endif
diff --git a/include/uapi/fwctl/fwctl.h b/include/uapi/fwctl/fwctl.h
new file mode 100644
index 00000000000000..0bdce95b6d69d9
--- /dev/null
+++ b/include/uapi/fwctl/fwctl.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_FWCTL_H
+#define _UAPI_FWCTL_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define FWCTL_TYPE 0x9A
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ *  - ENOTTY: The IOCTL number itself is not supported at all
+ *  - E2BIG: The IOCTL number is supported, but the provided structure has
+ *    non-zero in a part the kernel does not understand.
+ *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ *    understood, however a known field has a value the kernel does not
+ *    understand or support.
+ *  - EINVAL: Everything about the IOCTL was understood, but a field is not
+ *    correct.
+ *  - ENOMEM: Out of memory.
+ *  - ENODEV: The underlying device has been hot-unplugged and the FD is
+ *            orphaned.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+enum {
+	FWCTL_CMD_BASE = 0,
+};
+
+#endif