Message ID | 20240201153337.4033490-11-xin.zeng@intel.com (mailing list archive)
---|---
State | Changes Requested |
Delegated to: | Herbert Xu |
Series | crypto: qat - enable SRIOV VF live migration for …
On Thu, Feb 01, 2024 at 11:33:37PM +0800, Xin Zeng wrote: > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > + struct qat_vf_core_device, core_device.vdev); > + struct qat_migdev_ops *ops; > + struct qat_mig_dev *mdev; > + struct pci_dev *parent; > + int ret, vf_id; > + > + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; > + core_vdev->mig_ops = &qat_vf_pci_mig_ops; > + > + ret = vfio_pci_core_init_dev(core_vdev); > + if (ret) > + return ret; > + > + mutex_init(&qat_vdev->state_mutex); > + spin_lock_init(&qat_vdev->reset_lock); > + > + parent = qat_vdev->core_device.pdev->physfn; > + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); > + if (!parent || vf_id < 0) { > + ret = -ENODEV; > + goto err_rel; > + } > + > + mdev = qat_vfmig_create(parent, vf_id); > + if (IS_ERR(mdev)) { > + ret = PTR_ERR(mdev); > + goto err_rel; > + } > + > + ops = mdev->ops; > + if (!ops || !ops->init || !ops->cleanup || > + !ops->open || !ops->close || > + !ops->save_state || !ops->load_state || > + !ops->suspend || !ops->resume) { > + ret = -EIO; > + dev_err(&parent->dev, "Incomplete device migration ops structure!"); > + goto err_destroy; > + } Why are there ops pointers here? I would expect this should just be direct function calls to the PF QAT driver. > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) > +{ > + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > + > + if (!qat_vdev->core_device.vdev.mig_ops) > + return; > + > + /* > + * As the higher VFIO layers are holding locks across reset and using > + * those same locks with the mm_lock we need to prevent ABBA deadlock > + * with the state_mutex and mm_lock. > + * In case the state_mutex was taken already we defer the cleanup work > + * to the unlock flow of the other running context. > + */ > + spin_lock(&qat_vdev->reset_lock); > + qat_vdev->deferred_reset = true; > + if (!mutex_trylock(&qat_vdev->state_mutex)) { > + spin_unlock(&qat_vdev->reset_lock); > + return; > + } > + spin_unlock(&qat_vdev->reset_lock); > + qat_vf_state_mutex_unlock(qat_vdev); > +} Do you really need this? I thought this ugly thing was going to be a uniquely mlx5 thing.. Jason
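For illustration, a minimal sketch of the direct-call alternative Jason is pointing at, assuming the PF QAT driver exported named helpers instead of publishing an ops table. The qat_vfmig_open()/close()/... prototypes below are hypothetical, modelled on the qat_vfmig_create()/qat_vfmig_destroy() calls the patch already makes:

```c
/* Hypothetical exports from the PF QAT driver, replacing struct qat_migdev_ops */
struct qat_mig_dev *qat_vfmig_create(struct pci_dev *pdev, int vf_id);
void qat_vfmig_destroy(struct qat_mig_dev *mdev);
int qat_vfmig_init_device(struct qat_mig_dev *mdev);
void qat_vfmig_cleanup_device(struct qat_mig_dev *mdev);
int qat_vfmig_open(struct qat_mig_dev *mdev);
void qat_vfmig_close(struct qat_mig_dev *mdev);
int qat_vfmig_suspend(struct qat_mig_dev *mdev);
int qat_vfmig_resume(struct qat_mig_dev *mdev);
int qat_vfmig_save_state(struct qat_mig_dev *mdev);
int qat_vfmig_load_state(struct qat_mig_dev *mdev);

/* The variant driver then calls them directly, with no runtime NULL checks: */
static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct qat_vf_core_device *qat_vdev =
		container_of(core_vdev, struct qat_vf_core_device,
			     core_device.vdev);
	struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	ret = qat_vfmig_open(qat_vdev->mdev);
	if (ret) {
		vfio_pci_core_disable(vdev);
		return ret;
	}
	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;

	vfio_pci_core_finish_enable(vdev);

	return 0;
}
```

The module-level linkage (EXPORT_SYMBOL plus the existing MODULE_IMPORT_NS(CRYPTO_QAT)) already gives the variant driver a hard dependency on the PF driver, so the "incomplete ops" runtime check would become unnecessary.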
On Thu, 1 Feb 2024 23:33:37 +0800 Xin Zeng <xin.zeng@intel.com> wrote: > Add vfio pci driver for Intel QAT VF devices. > > This driver uses vfio_pci_core to register to the VFIO subsystem. It > acts as a vfio agent and interacts with the QAT PF driver to implement > VF live migration. > > Co-developed-by: Yahui Cao <yahui.cao@intel.com> > Signed-off-by: Yahui Cao <yahui.cao@intel.com> > Signed-off-by: Xin Zeng <xin.zeng@intel.com> > Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com> > --- > MAINTAINERS | 8 + > drivers/vfio/pci/Kconfig | 2 + > drivers/vfio/pci/Makefile | 2 + > drivers/vfio/pci/intel/qat/Kconfig | 13 + > drivers/vfio/pci/intel/qat/Makefile | 4 + > drivers/vfio/pci/intel/qat/main.c | 572 ++++++++++++++++++++++++++++ > 6 files changed, 601 insertions(+) > create mode 100644 drivers/vfio/pci/intel/qat/Kconfig > create mode 100644 drivers/vfio/pci/intel/qat/Makefile > create mode 100644 drivers/vfio/pci/intel/qat/main.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 8d1052fa6a69..c1d3e4cb3892 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -23095,6 +23095,14 @@ S: Maintained > F: Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst > F: drivers/vfio/pci/pds/ > > +VFIO QAT PCI DRIVER > +M: Xin Zeng <xin.zeng@intel.com> > +M: Giovanni Cabiddu <giovanni.cabiddu@intel.com> > +L: kvm@vger.kernel.org > +L: qat-linux@intel.com > +S: Supported > +F: drivers/vfio/pci/intel/qat/ > + > VFIO PLATFORM DRIVER > M: Eric Auger <eric.auger@redhat.com> > L: kvm@vger.kernel.org > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 18c397df566d..329d25c53274 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig" > > source "drivers/vfio/pci/virtio/Kconfig" > > +source "drivers/vfio/pci/intel/qat/Kconfig" > + > endmenu > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index 046139a4eca5..a87b6b43ce1c 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ > obj-$(CONFIG_PDS_VFIO_PCI) += pds/ > > obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ > + > +obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/ > diff --git a/drivers/vfio/pci/intel/qat/Kconfig b/drivers/vfio/pci/intel/qat/Kconfig > new file mode 100644 > index 000000000000..71b28ac0bf6a > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/Kconfig > @@ -0,0 +1,13 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +config QAT_VFIO_PCI > + tristate "VFIO support for QAT VF PCI devices" > + select VFIO_PCI_CORE > + depends on CRYPTO_DEV_QAT > + depends on CRYPTO_DEV_QAT_4XXX CRYPTO_DEV_QAT_4XXX selects CRYPTO_DEV_QAT, is it necessary to depend on CRYPTO_DEV_QAT here? > + help > + This provides migration support for Intel(R) QAT Virtual Function > + using the VFIO framework. > + > + To compile this as a module, choose M here: the module > + will be called qat_vfio_pci. If you don't know what to do here, > + say N. > diff --git a/drivers/vfio/pci/intel/qat/Makefile b/drivers/vfio/pci/intel/qat/Makefile > new file mode 100644 > index 000000000000..9289ae4c51bf > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/Makefile > @@ -0,0 +1,4 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o > +qat_vfio_pci-y := main.o > + Blank line at end of file. 
> diff --git a/drivers/vfio/pci/intel/qat/main.c b/drivers/vfio/pci/intel/qat/main.c > new file mode 100644 > index 000000000000..85d0ed701397 > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/main.c > @@ -0,0 +1,572 @@ ... > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > + struct qat_vf_core_device, core_device.vdev); > + struct qat_migdev_ops *ops; > + struct qat_mig_dev *mdev; > + struct pci_dev *parent; > + int ret, vf_id; > + > + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; AFAICT it's always good practice to support VFIO_MIGRATION_PRE_COPY, even if only to leave yourself an option to mark an incompatible ABI change in the data stream that can be checked before the stop-copy phase of migration. See for example the mtty sample driver. Thanks, Alex > + core_vdev->mig_ops = &qat_vf_pci_mig_ops; > + > + ret = vfio_pci_core_init_dev(core_vdev); > + if (ret) > + return ret; > + > + mutex_init(&qat_vdev->state_mutex); > + spin_lock_init(&qat_vdev->reset_lock); > + > + parent = qat_vdev->core_device.pdev->physfn; > + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); > + if (!parent || vf_id < 0) { > + ret = -ENODEV; > + goto err_rel; > + } > + > + mdev = qat_vfmig_create(parent, vf_id); > + if (IS_ERR(mdev)) { > + ret = PTR_ERR(mdev); > + goto err_rel; > + } > + > + ops = mdev->ops; > + if (!ops || !ops->init || !ops->cleanup || > + !ops->open || !ops->close || > + !ops->save_state || !ops->load_state || > + !ops->suspend || !ops->resume) { > + ret = -EIO; > + dev_err(&parent->dev, "Incomplete device migration ops structure!"); > + goto err_destroy; > + } > + ret = ops->init(mdev); > + if (ret) > + goto err_destroy; > + > + qat_vdev->mdev = mdev; > + > + return 0; > + > +err_destroy: > + qat_vfmig_destroy(mdev); > +err_rel: > + vfio_pci_core_release_dev(core_vdev); > + return ret; > +}
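A hedged sketch of what Alex's suggestion would mean for this driver: the only source-level change in init is the extra flag; the behavioural requirement is that the saving file returns some initial data while the device is in the PRE_COPY states and answers VFIO_MIG_GET_PRECOPY_INFO. The contents of that initial data are an assumption here, not part of the posted patch:

```c
/* In qat_vf_pci_init_dev(), additionally advertise pre-copy support: */
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
			     VFIO_MIGRATION_P2P |
			     VFIO_MIGRATION_PRE_COPY;
```

With the flag set, userspace can enter PRE_COPY before stop-copy, see how much initial data is pending via VFIO_MIG_GET_PRECOPY_INFO, and abort the migration early if, for example, a magic/version header carried in those initial bytes does not match what the destination expects; this is the pattern the mtty sample demonstrates.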
> -----Original Message----- > From: Xin Zeng <xin.zeng@intel.com> > Sent: Thursday, February 1, 2024 3:34 PM > To: herbert@gondor.apana.org.au; alex.williamson@redhat.com; > jgg@nvidia.com; yishaih@nvidia.com; Shameerali Kolothum Thodi > <shameerali.kolothum.thodi@huawei.com>; kevin.tian@intel.com > Cc: linux-crypto@vger.kernel.org; kvm@vger.kernel.org; qat- > linux@intel.com; Xin Zeng <xin.zeng@intel.com>; Yahui Cao > <yahui.cao@intel.com> > Subject: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices > > Add vfio pci driver for Intel QAT VF devices. > > This driver uses vfio_pci_core to register to the VFIO subsystem. It > acts as a vfio agent and interacts with the QAT PF driver to implement > VF live migration. > > Co-developed-by: Yahui Cao <yahui.cao@intel.com> > Signed-off-by: Yahui Cao <yahui.cao@intel.com> > Signed-off-by: Xin Zeng <xin.zeng@intel.com> > Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com> > --- > MAINTAINERS | 8 + > drivers/vfio/pci/Kconfig | 2 + > drivers/vfio/pci/Makefile | 2 + > drivers/vfio/pci/intel/qat/Kconfig | 13 + > drivers/vfio/pci/intel/qat/Makefile | 4 + > drivers/vfio/pci/intel/qat/main.c | 572 ++++++++++++++++++++++++++++ > 6 files changed, 601 insertions(+) > create mode 100644 drivers/vfio/pci/intel/qat/Kconfig > create mode 100644 drivers/vfio/pci/intel/qat/Makefile > create mode 100644 drivers/vfio/pci/intel/qat/main.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 8d1052fa6a69..c1d3e4cb3892 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -23095,6 +23095,14 @@ S: Maintained > F: > Documentation/networking/device_drivers/ethernet/amd/pds_vfio_ > pci.rst > F: drivers/vfio/pci/pds/ > > +VFIO QAT PCI DRIVER > +M: Xin Zeng <xin.zeng@intel.com> > +M: Giovanni Cabiddu <giovanni.cabiddu@intel.com> > +L: kvm@vger.kernel.org > +L: qat-linux@intel.com > +S: Supported > +F: drivers/vfio/pci/intel/qat/ > + > VFIO PLATFORM DRIVER > M: Eric Auger <eric.auger@redhat.com> > L: kvm@vger.kernel.org > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 18c397df566d..329d25c53274 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig" > > source "drivers/vfio/pci/virtio/Kconfig" > > +source "drivers/vfio/pci/intel/qat/Kconfig" > + > endmenu > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index 046139a4eca5..a87b6b43ce1c 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ > obj-$(CONFIG_PDS_VFIO_PCI) += pds/ > > obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ > + > +obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/ > diff --git a/drivers/vfio/pci/intel/qat/Kconfig > b/drivers/vfio/pci/intel/qat/Kconfig > new file mode 100644 > index 000000000000..71b28ac0bf6a > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/Kconfig > @@ -0,0 +1,13 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +config QAT_VFIO_PCI > + tristate "VFIO support for QAT VF PCI devices" > + select VFIO_PCI_CORE > + depends on CRYPTO_DEV_QAT > + depends on CRYPTO_DEV_QAT_4XXX > + help > + This provides migration support for Intel(R) QAT Virtual Function > + using the VFIO framework. > + > + To compile this as a module, choose M here: the module > + will be called qat_vfio_pci. If you don't know what to do here, > + say N. 
> diff --git a/drivers/vfio/pci/intel/qat/Makefile > b/drivers/vfio/pci/intel/qat/Makefile > new file mode 100644 > index 000000000000..9289ae4c51bf > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/Makefile > @@ -0,0 +1,4 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o > +qat_vfio_pci-y := main.o > + > diff --git a/drivers/vfio/pci/intel/qat/main.c > b/drivers/vfio/pci/intel/qat/main.c > new file mode 100644 > index 000000000000..85d0ed701397 > --- /dev/null > +++ b/drivers/vfio/pci/intel/qat/main.c > @@ -0,0 +1,572 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright(c) 2024 Intel Corporation */ > + > +#include <linux/anon_inodes.h> > +#include <linux/container_of.h> > +#include <linux/device.h> > +#include <linux/file.h> > +#include <linux/init.h> > +#include <linux/kernel.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/pci.h> > +#include <linux/sizes.h> > +#include <linux/types.h> > +#include <linux/uaccess.h> > +#include <linux/vfio_pci_core.h> > +#include <linux/qat/qat_mig_dev.h> > + > +struct qat_vf_migration_file { > + struct file *filp; > + /* protects migration region context */ > + struct mutex lock; > + bool disabled; > + struct qat_mig_dev *mdev; > +}; > + > +struct qat_vf_core_device { > + struct vfio_pci_core_device core_device; > + struct qat_mig_dev *mdev; > + /* protects migration state */ > + struct mutex state_mutex; > + enum vfio_device_mig_state mig_state; > + /* protects reset down flow */ > + spinlock_t reset_lock; > + bool deferred_reset; > + struct qat_vf_migration_file *resuming_migf; > + struct qat_vf_migration_file *saving_migf; > +}; > + > +static int qat_vf_pci_open_device(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = > + container_of(core_vdev, struct qat_vf_core_device, > + core_device.vdev); > + struct vfio_pci_core_device *vdev = &qat_vdev->core_device; > + int ret; > + > + ret = vfio_pci_core_enable(vdev); > + if (ret) > + return ret; > + > + ret = qat_vdev->mdev->ops->open(qat_vdev->mdev); > + if (ret) { > + vfio_pci_core_disable(vdev); > + return ret; > + } > + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; > + > + vfio_pci_core_finish_enable(vdev); > + > + return 0; > +} > + > +static void qat_vf_disable_fd(struct qat_vf_migration_file *migf) > +{ > + mutex_lock(&migf->lock); > + migf->disabled = true; > + migf->filp->f_pos = 0; > + mutex_unlock(&migf->lock); > +} > + > +static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev) > +{ > + if (qat_vdev->resuming_migf) { > + qat_vf_disable_fd(qat_vdev->resuming_migf); > + fput(qat_vdev->resuming_migf->filp); > + qat_vdev->resuming_migf = NULL; > + } > + > + if (qat_vdev->saving_migf) { > + qat_vf_disable_fd(qat_vdev->saving_migf); > + fput(qat_vdev->saving_migf->filp); > + qat_vdev->saving_migf = NULL; > + } > +} > + > +static void qat_vf_pci_close_device(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > + struct qat_vf_core_device, core_device.vdev); > + > + qat_vdev->mdev->ops->close(qat_vdev->mdev); > + qat_vf_disable_fds(qat_vdev); > + vfio_pci_core_close_device(core_vdev); > +} > + > +static ssize_t qat_vf_save_read(struct file *filp, char __user *buf, > + size_t len, loff_t *pos) > +{ > + struct qat_vf_migration_file *migf = filp->private_data; > + ssize_t done = 0; > + loff_t *offs; > + int ret; > + > + if (pos) > + return -ESPIPE; > + offs = &filp->f_pos; > + > + mutex_lock(&migf->lock); > + if (*offs > 
migf->mdev->state_size || *offs < 0) { > + done = -EINVAL; > + goto out_unlock; > + } > + > + if (migf->disabled) { > + done = -ENODEV; > + goto out_unlock; > + } > + > + len = min_t(size_t, migf->mdev->state_size - *offs, len); > + if (len) { > + ret = copy_to_user(buf, migf->mdev->state + *offs, len); > + if (ret) { > + done = -EFAULT; > + goto out_unlock; > + } > + *offs += len; > + done = len; > + } > + > +out_unlock: > + mutex_unlock(&migf->lock); > + return done; > +} > + > +static int qat_vf_release_file(struct inode *inode, struct file *filp) > +{ > + struct qat_vf_migration_file *migf = filp->private_data; > + > + qat_vf_disable_fd(migf); > + mutex_destroy(&migf->lock); > + kfree(migf); > + > + return 0; > +} > + > +static const struct file_operations qat_vf_save_fops = { > + .owner = THIS_MODULE, > + .read = qat_vf_save_read, > + .release = qat_vf_release_file, > + .llseek = no_llseek, > +}; > + > +static struct qat_vf_migration_file * > +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev) > +{ > + struct qat_vf_migration_file *migf; > + int ret; > + > + migf = kzalloc(sizeof(*migf), GFP_KERNEL); > + if (!migf) > + return ERR_PTR(-ENOMEM); > + > + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, > migf, O_RDONLY); > + ret = PTR_ERR_OR_ZERO(migf->filp); > + if (ret) { > + kfree(migf); > + return ERR_PTR(ret); > + } > + > + stream_open(migf->filp->f_inode, migf->filp); > + mutex_init(&migf->lock); > + > + ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev); > + if (ret) { > + fput(migf->filp); > + kfree(migf); Probably don't need that kfree(migf) here as fput() --> qat_vf_release_file () will do that. > + return ERR_PTR(ret); > + } > + > + migf->mdev = qat_vdev->mdev; > + > + return migf; > +} > + > +static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, > + size_t len, loff_t *pos) > +{ > + struct qat_vf_migration_file *migf = filp->private_data; > + loff_t end, *offs; > + ssize_t done = 0; > + int ret; > + > + if (pos) > + return -ESPIPE; > + offs = &filp->f_pos; > + > + if (*offs < 0 || > + check_add_overflow((loff_t)len, *offs, &end)) > + return -EOVERFLOW; > + > + if (end > migf->mdev->state_size) > + return -ENOMEM; > + > + mutex_lock(&migf->lock); > + if (migf->disabled) { > + done = -ENODEV; > + goto out_unlock; > + } > + > + ret = copy_from_user(migf->mdev->state + *offs, buf, len); > + if (ret) { > + done = -EFAULT; > + goto out_unlock; > + } > + *offs += len; > + done = len; > + > +out_unlock: > + mutex_unlock(&migf->lock); > + return done; > +} > + > +static const struct file_operations qat_vf_resume_fops = { > + .owner = THIS_MODULE, > + .write = qat_vf_resume_write, > + .release = qat_vf_release_file, > + .llseek = no_llseek, > +}; > + > +static struct qat_vf_migration_file * > +qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev) > +{ > + struct qat_vf_migration_file *migf; > + int ret; > + > + migf = kzalloc(sizeof(*migf), GFP_KERNEL); > + if (!migf) > + return ERR_PTR(-ENOMEM); > + > + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, > migf, O_WRONLY); > + ret = PTR_ERR_OR_ZERO(migf->filp); > + if (ret) { > + kfree(migf); > + return ERR_PTR(ret); > + } > + > + migf->mdev = qat_vdev->mdev; > + stream_open(migf->filp->f_inode, migf->filp); > + mutex_init(&migf->lock); > + > + return migf; > +} > + > +static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev) > +{ > + return qat_vdev->mdev->ops->load_state(qat_vdev->mdev); > +} > + > +static struct file 
*qat_vf_pci_step_device_state(struct qat_vf_core_device > *qat_vdev, u32 new) > +{ > + u32 cur = qat_vdev->mig_state; > + int ret; > + > + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == > VFIO_DEVICE_STATE_RUNNING_P2P)) { > + ret = qat_vdev->mdev->ops->suspend(qat_vdev->mdev); > + if (ret) > + return ERR_PTR(ret); > + return NULL; > + } > + > + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == > VFIO_DEVICE_STATE_STOP) || > + (cur == VFIO_DEVICE_STATE_STOP && new == > VFIO_DEVICE_STATE_RUNNING_P2P)) > + return NULL; > + > + if (cur == VFIO_DEVICE_STATE_STOP && new == > VFIO_DEVICE_STATE_STOP_COPY) { > + struct qat_vf_migration_file *migf; > + > + migf = qat_vf_save_device_data(qat_vdev); > + if (IS_ERR(migf)) > + return ERR_CAST(migf); > + get_file(migf->filp); > + qat_vdev->saving_migf = migf; > + return migf->filp; > + } > + > + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == > VFIO_DEVICE_STATE_STOP) { > + qat_vf_disable_fds(qat_vdev); > + return NULL; > + } > + > + if (cur == VFIO_DEVICE_STATE_STOP && new == > VFIO_DEVICE_STATE_RESUMING) { > + struct qat_vf_migration_file *migf; > + > + migf = qat_vf_resume_device_data(qat_vdev); > + if (IS_ERR(migf)) > + return ERR_CAST(migf); > + get_file(migf->filp); > + qat_vdev->resuming_migf = migf; > + return migf->filp; > + } > + > + if (cur == VFIO_DEVICE_STATE_RESUMING && new == > VFIO_DEVICE_STATE_STOP) { > + ret = qat_vf_load_device_data(qat_vdev); > + if (ret) > + return ERR_PTR(ret); > + > + qat_vf_disable_fds(qat_vdev); > + return NULL; > + } > + > + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == > VFIO_DEVICE_STATE_RUNNING) { > + qat_vdev->mdev->ops->resume(qat_vdev->mdev); > + return NULL; > + } > + > + /* vfio_mig_get_next_state() does not use arcs other than the above > */ > + WARN_ON(true); > + return ERR_PTR(-EINVAL); > +} > + > +static void qat_vf_state_mutex_unlock(struct qat_vf_core_device > *qat_vdev) > +{ > +again: > + spin_lock(&qat_vdev->reset_lock); > + if (qat_vdev->deferred_reset) { > + qat_vdev->deferred_reset = false; > + spin_unlock(&qat_vdev->reset_lock); > + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; > + qat_vf_disable_fds(qat_vdev); > + goto again; > + } > + mutex_unlock(&qat_vdev->state_mutex); > + spin_unlock(&qat_vdev->reset_lock); > +} > + > +static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev, > + enum vfio_device_mig_state > new_state) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(vdev, > + struct qat_vf_core_device, core_device.vdev); > + enum vfio_device_mig_state next_state; > + struct file *res = NULL; > + int ret; > + > + mutex_lock(&qat_vdev->state_mutex); > + while (new_state != qat_vdev->mig_state) { > + ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state, > + new_state, &next_state); > + if (ret) { > + res = ERR_PTR(ret); > + break; > + } > + res = qat_vf_pci_step_device_state(qat_vdev, next_state); > + if (IS_ERR(res)) > + break; > + qat_vdev->mig_state = next_state; > + if (WARN_ON(res && new_state != qat_vdev->mig_state)) { > + fput(res); > + res = ERR_PTR(-EINVAL); > + break; > + } > + } > + qat_vf_state_mutex_unlock(qat_vdev); > + > + return res; > +} > + > +static int qat_vf_pci_get_device_state(struct vfio_device *vdev, > + enum vfio_device_mig_state *curr_state) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(vdev, > + struct qat_vf_core_device, core_device.vdev); > + > + mutex_lock(&qat_vdev->state_mutex); > + *curr_state = qat_vdev->mig_state; > + qat_vf_state_mutex_unlock(qat_vdev); > + > + return 0; > +} > + > +static int 
qat_vf_pci_get_data_size(struct vfio_device *vdev, > + unsigned long *stop_copy_length) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(vdev, > + struct qat_vf_core_device, core_device.vdev); > + > + *stop_copy_length = qat_vdev->mdev->state_size; Do we need a lock here or this is not changing? > + return 0; > +} > + > +static const struct vfio_migration_ops qat_vf_pci_mig_ops = { > + .migration_set_state = qat_vf_pci_set_device_state, > + .migration_get_state = qat_vf_pci_get_device_state, > + .migration_get_data_size = qat_vf_pci_get_data_size, > +}; > + > +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > + struct qat_vf_core_device, core_device.vdev); > + > + qat_vdev->mdev->ops->cleanup(qat_vdev->mdev); > + qat_vfmig_destroy(qat_vdev->mdev); > + mutex_destroy(&qat_vdev->state_mutex); > + vfio_pci_core_release_dev(core_vdev); > +} > + > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) > +{ > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > + struct qat_vf_core_device, core_device.vdev); > + struct qat_migdev_ops *ops; > + struct qat_mig_dev *mdev; > + struct pci_dev *parent; > + int ret, vf_id; > + > + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | > VFIO_MIGRATION_P2P; > + core_vdev->mig_ops = &qat_vf_pci_mig_ops; > + > + ret = vfio_pci_core_init_dev(core_vdev); > + if (ret) > + return ret; > + > + mutex_init(&qat_vdev->state_mutex); > + spin_lock_init(&qat_vdev->reset_lock); > + > + parent = qat_vdev->core_device.pdev->physfn; Can we use pci_physfn() here? > + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); > + if (!parent || vf_id < 0) { Also if the pci_iov_vf_id() return success I don't think you need to check for parent and can use directly below. > + ret = -ENODEV; > + goto err_rel; > + } > + > + mdev = qat_vfmig_create(parent, vf_id); > + if (IS_ERR(mdev)) { > + ret = PTR_ERR(mdev); > + goto err_rel; > + } > + > + ops = mdev->ops; > + if (!ops || !ops->init || !ops->cleanup || > + !ops->open || !ops->close || > + !ops->save_state || !ops->load_state || > + !ops->suspend || !ops->resume) { > + ret = -EIO; > + dev_err(&parent->dev, "Incomplete device migration ops > structure!"); > + goto err_destroy; > + } If all these ops are a must why cant we move the check inside the qat_vfmig_create()? Or rather call them explicitly as suggested by Jason. 
Thanks, Shameer > + ret = ops->init(mdev); > + if (ret) > + goto err_destroy; > + > + qat_vdev->mdev = mdev; > + > + return 0; > + > +err_destroy: > + qat_vfmig_destroy(mdev); > +err_rel: > + vfio_pci_core_release_dev(core_vdev); > + return ret; > +} > + > +static const struct vfio_device_ops qat_vf_pci_ops = { > + .name = "qat-vf-vfio-pci", > + .init = qat_vf_pci_init_dev, > + .release = qat_vf_pci_release_dev, > + .open_device = qat_vf_pci_open_device, > + .close_device = qat_vf_pci_close_device, > + .ioctl = vfio_pci_core_ioctl, > + .read = vfio_pci_core_read, > + .write = vfio_pci_core_write, > + .mmap = vfio_pci_core_mmap, > + .request = vfio_pci_core_request, > + .match = vfio_pci_core_match, > + .bind_iommufd = vfio_iommufd_physical_bind, > + .unbind_iommufd = vfio_iommufd_physical_unbind, > + .attach_ioas = vfio_iommufd_physical_attach_ioas, > + .detach_ioas = vfio_iommufd_physical_detach_ioas, > +}; > + > +static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev) > +{ > + struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev); > + > + return container_of(core_device, struct qat_vf_core_device, > core_device); > +} > + > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) > +{ > + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > + > + if (!qat_vdev->core_device.vdev.mig_ops) > + return; > + > + /* > + * As the higher VFIO layers are holding locks across reset and using > + * those same locks with the mm_lock we need to prevent ABBA > deadlock > + * with the state_mutex and mm_lock. > + * In case the state_mutex was taken already we defer the cleanup > work > + * to the unlock flow of the other running context. > + */ > + spin_lock(&qat_vdev->reset_lock); > + qat_vdev->deferred_reset = true; > + if (!mutex_trylock(&qat_vdev->state_mutex)) { > + spin_unlock(&qat_vdev->reset_lock); > + return; > + } > + spin_unlock(&qat_vdev->reset_lock); > + qat_vf_state_mutex_unlock(qat_vdev); > +} > + > +static int > +qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > +{ > + struct device *dev = &pdev->dev; > + struct qat_vf_core_device *qat_vdev; > + int ret; > + > + qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, > dev, &qat_vf_pci_ops); > + if (IS_ERR(qat_vdev)) > + return PTR_ERR(qat_vdev); > + > + pci_set_drvdata(pdev, &qat_vdev->core_device); > + ret = vfio_pci_core_register_device(&qat_vdev->core_device); > + if (ret) > + goto out_put_device; > + > + return 0; > + > +out_put_device: > + vfio_put_device(&qat_vdev->core_device.vdev); > + return ret; > +} > + > +static void qat_vf_vfio_pci_remove(struct pci_dev *pdev) > +{ > + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > + > + vfio_pci_core_unregister_device(&qat_vdev->core_device); > + vfio_put_device(&qat_vdev->core_device.vdev); > +} > + > +static const struct pci_device_id qat_vf_vfio_pci_table[] = { > + /* Intel QAT GEN4 4xxx VF device */ > + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, > 0x4941) }, > + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, > 0x4943) }, > + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, > 0x4945) }, > + {} > +}; > +MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table); > + > +static const struct pci_error_handlers qat_vf_err_handlers = { > + .reset_done = qat_vf_pci_aer_reset_done, > + .error_detected = vfio_pci_core_aer_err_detected, > +}; > + > +static struct pci_driver qat_vf_vfio_pci_driver = { > + .name = "qat_vfio_pci", > + .id_table = qat_vf_vfio_pci_table, > + .probe = 
qat_vf_vfio_pci_probe, > + .remove = qat_vf_vfio_pci_remove, > + .err_handler = &qat_vf_err_handlers, > + .driver_managed_dma = true, > +}; > +module_pci_driver(qat_vf_vfio_pci_driver) > + > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Intel Corporation"); > +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration > support for Intel(R) QAT GEN4 device family"); > +MODULE_IMPORT_NS(CRYPTO_QAT); > -- > 2.18.2
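For illustration, a sketch of qat_vf_pci_init_dev() with Shameer's points folded in (pci_physfn(), no separate parent check) and the ops table replaced by a direct call as Jason suggested; qat_vfmig_init_device() is a hypothetical name for that direct call:

```c
static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
			struct qat_vf_core_device, core_device.vdev);
	struct pci_dev *vf_dev, *parent;
	struct qat_mig_dev *mdev;
	int ret, vf_id;

	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
	core_vdev->mig_ops = &qat_vf_pci_mig_ops;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mutex_init(&qat_vdev->state_mutex);
	spin_lock_init(&qat_vdev->reset_lock);

	vf_dev = qat_vdev->core_device.pdev;
	parent = pci_physfn(vf_dev);
	vf_id = pci_iov_vf_id(vf_dev);
	if (vf_id < 0) {		/* fails for non-VF devices, so no !parent check */
		ret = -ENODEV;
		goto err_rel;
	}

	mdev = qat_vfmig_create(parent, vf_id);
	if (IS_ERR(mdev)) {
		ret = PTR_ERR(mdev);
		goto err_rel;
	}

	ret = qat_vfmig_init_device(mdev);	/* hypothetical direct call, was ops->init() */
	if (ret)
		goto err_destroy;

	qat_vdev->mdev = mdev;
	return 0;

err_destroy:
	qat_vfmig_destroy(mdev);
err_rel:
	vfio_pci_core_release_dev(core_vdev);
	return ret;
}
```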
Thanks for your comments, Jason. On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote: > > + > > + ops = mdev->ops; > > + if (!ops || !ops->init || !ops->cleanup || > > + !ops->open || !ops->close || > > + !ops->save_state || !ops->load_state || > > + !ops->suspend || !ops->resume) { > > + ret = -EIO; > > + dev_err(&parent->dev, "Incomplete device migration ops > structure!"); > > + goto err_destroy; > > + } > > Why are there ops pointers here? I would expect this should just be > direct function calls to the PF QAT driver. I indeed had a version where the direct function calls are Implemented in QAT driver, while when I look at the functions, most of them only translate the interface to the ops pointer. That's why I put ops pointers directly into vfio variant driver. > > > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) > > +{ > > + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > > + > > + if (!qat_vdev->core_device.vdev.mig_ops) > > + return; > > + > > + /* > > + * As the higher VFIO layers are holding locks across reset and using > > + * those same locks with the mm_lock we need to prevent ABBA > deadlock > > + * with the state_mutex and mm_lock. > > + * In case the state_mutex was taken already we defer the cleanup work > > + * to the unlock flow of the other running context. > > + */ > > + spin_lock(&qat_vdev->reset_lock); > > + qat_vdev->deferred_reset = true; > > + if (!mutex_trylock(&qat_vdev->state_mutex)) { > > + spin_unlock(&qat_vdev->reset_lock); > > + return; > > + } > > + spin_unlock(&qat_vdev->reset_lock); > > + qat_vf_state_mutex_unlock(qat_vdev); > > +} > > Do you really need this? I thought this ugly thing was going to be a > uniquely mlx5 thing.. I think that's still required to make the migration state synchronized if the VF is reset by other VFIO emulation paths. Is it the case? BTW, this implementation is not only in mlx5 driver, but also in other Vfio pci variant drivers such as hisilicon acc driver and pds driver. Thanks, Xin
On Fri, Feb 09, 2024 at 08:23:32AM +0000, Zeng, Xin wrote: > Thanks for your comments, Jason. > On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote: > > > + > > > + ops = mdev->ops; > > > + if (!ops || !ops->init || !ops->cleanup || > > > + !ops->open || !ops->close || > > > + !ops->save_state || !ops->load_state || > > > + !ops->suspend || !ops->resume) { > > > + ret = -EIO; > > > + dev_err(&parent->dev, "Incomplete device migration ops > > structure!"); > > > + goto err_destroy; > > > + } > > > > Why are there ops pointers here? I would expect this should just be > > direct function calls to the PF QAT driver. > > I indeed had a version where the direct function calls are Implemented in > QAT driver, while when I look at the functions, most of them > only translate the interface to the ops pointer. That's why I put > ops pointers directly into vfio variant driver. But why is there an ops indirection at all? Are there more than one ops? > > > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) > > > +{ > > > + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > > > + > > > + if (!qat_vdev->core_device.vdev.mig_ops) > > > + return; > > > + > > > + /* > > > + * As the higher VFIO layers are holding locks across reset and using > > > + * those same locks with the mm_lock we need to prevent ABBA > > deadlock > > > + * with the state_mutex and mm_lock. > > > + * In case the state_mutex was taken already we defer the cleanup work > > > + * to the unlock flow of the other running context. > > > + */ > > > + spin_lock(&qat_vdev->reset_lock); > > > + qat_vdev->deferred_reset = true; > > > + if (!mutex_trylock(&qat_vdev->state_mutex)) { > > > + spin_unlock(&qat_vdev->reset_lock); > > > + return; > > > + } > > > + spin_unlock(&qat_vdev->reset_lock); > > > + qat_vf_state_mutex_unlock(qat_vdev); > > > +} > > > > Do you really need this? I thought this ugly thing was going to be a > > uniquely mlx5 thing.. > > I think that's still required to make the migration state synchronized > if the VF is reset by other VFIO emulation paths. Is it the case? > BTW, this implementation is not only in mlx5 driver, but also in other > Vfio pci variant drivers such as hisilicon acc driver and pds > driver. It had to specifically do with the mm lock interaction that, I thought, was going to be unique to the mlx driver. Otherwise you could just directly hold the state_mutex here. Yishai do you remember the exact trace for the mmlock entanglement? Jason
Thanks for your comments, Alex. On Wednesday, February 7, 2024 5:15 AM, Alex Williamson <alex.williamson@redhat.com> wrote: > > diff --git a/drivers/vfio/pci/intel/qat/Kconfig > b/drivers/vfio/pci/intel/qat/Kconfig > > new file mode 100644 > > index 000000000000..71b28ac0bf6a > > --- /dev/null > > +++ b/drivers/vfio/pci/intel/qat/Kconfig > > @@ -0,0 +1,13 @@ > > +# SPDX-License-Identifier: GPL-2.0-only > > +config QAT_VFIO_PCI > > + tristate "VFIO support for QAT VF PCI devices" > > + select VFIO_PCI_CORE > > + depends on CRYPTO_DEV_QAT > > + depends on CRYPTO_DEV_QAT_4XXX > > CRYPTO_DEV_QAT_4XXX selects CRYPTO_DEV_QAT, is it necessary to depend > on CRYPTO_DEV_QAT here? You are right, we don't need it, will update it in next version. > > > + help > > + This provides migration support for Intel(R) QAT Virtual Function > > + using the VFIO framework. > > + > > + To compile this as a module, choose M here: the module > > + will be called qat_vfio_pci. If you don't know what to do here, > > + say N. > > diff --git a/drivers/vfio/pci/intel/qat/Makefile > b/drivers/vfio/pci/intel/qat/Makefile > > new file mode 100644 > > index 000000000000..9289ae4c51bf > > --- /dev/null > > +++ b/drivers/vfio/pci/intel/qat/Makefile > > @@ -0,0 +1,4 @@ > > +# SPDX-License-Identifier: GPL-2.0-only > > +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o > > +qat_vfio_pci-y := main.o > > + > > Blank line at end of file. Good catch, will update it in next version. > > > diff --git a/drivers/vfio/pci/intel/qat/main.c > b/drivers/vfio/pci/intel/qat/main.c > > new file mode 100644 > > index 000000000000..85d0ed701397 > > --- /dev/null > > +++ b/drivers/vfio/pci/intel/qat/main.c > > @@ -0,0 +1,572 @@ > ... > > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) > > +{ > > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > > + struct qat_vf_core_device, core_device.vdev); > > + struct qat_migdev_ops *ops; > > + struct qat_mig_dev *mdev; > > + struct pci_dev *parent; > > + int ret, vf_id; > > + > > + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | > VFIO_MIGRATION_P2P; > > AFAICT it's always good practice to support VFIO_MIGRATION_PRE_COPY, > even if only to leave yourself an option to mark an incompatible ABI > change in the data stream that can be checked before the stop-copy > phase of migration. See for example the mtty sample driver. Thanks, > Alex If the good practice is to check migration stream compatibility, then we do incorporate additional information as part of device state for ABI compatibility testing such as magic, version and device capability in stop copy phase. Can we ignore the optional VFIO_MIGRATION_PRE_COPY support? Thanks, Xin
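For reference, a purely illustrative sketch of the kind of compatibility data Xin describes, placed at the start of the saved state and checked on the destination before the rest of the state is applied; the structure layout, constants and helper name are assumptions, not the actual QAT state format:

```c
#define QAT_VF_MIG_MAGIC	0x51415456	/* hypothetical values */
#define QAT_VF_MIG_VERSION	1

struct qat_vf_mig_header {
	__le32 magic;
	__le32 version;
	__le64 caps;		/* device capability bits */
};

/* Called from the load path before applying the remaining device state. */
static int qat_vf_check_mig_header(struct qat_mig_dev *mdev)
{
	struct qat_vf_mig_header *hdr = (struct qat_vf_mig_header *)mdev->state;

	if (le32_to_cpu(hdr->magic) != QAT_VF_MIG_MAGIC ||
	    le32_to_cpu(hdr->version) != QAT_VF_MIG_VERSION)
		return -EINVAL;
	/* caps would additionally be checked against the destination device */
	return 0;
}
```

The drawback Alex is pointing at is that without PRE_COPY this check can only run during the stop-copy/resume phase, i.e. after the source VM has already been stopped.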
On 09/02/2024 14:10, Jason Gunthorpe wrote: > On Fri, Feb 09, 2024 at 08:23:32AM +0000, Zeng, Xin wrote: >> Thanks for your comments, Jason. >> On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote: >>>> + >>>> + ops = mdev->ops; >>>> + if (!ops || !ops->init || !ops->cleanup || >>>> + !ops->open || !ops->close || >>>> + !ops->save_state || !ops->load_state || >>>> + !ops->suspend || !ops->resume) { >>>> + ret = -EIO; >>>> + dev_err(&parent->dev, "Incomplete device migration ops >>> structure!"); >>>> + goto err_destroy; >>>> + } >>> >>> Why are there ops pointers here? I would expect this should just be >>> direct function calls to the PF QAT driver. >> >> I indeed had a version where the direct function calls are Implemented in >> QAT driver, while when I look at the functions, most of them >> only translate the interface to the ops pointer. That's why I put >> ops pointers directly into vfio variant driver. > > But why is there an ops indirection at all? Are there more than one > ops? > >>>> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) >>>> +{ >>>> + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); >>>> + >>>> + if (!qat_vdev->core_device.vdev.mig_ops) >>>> + return; >>>> + >>>> + /* >>>> + * As the higher VFIO layers are holding locks across reset and using >>>> + * those same locks with the mm_lock we need to prevent ABBA >>> deadlock >>>> + * with the state_mutex and mm_lock. >>>> + * In case the state_mutex was taken already we defer the cleanup work >>>> + * to the unlock flow of the other running context. >>>> + */ >>>> + spin_lock(&qat_vdev->reset_lock); >>>> + qat_vdev->deferred_reset = true; >>>> + if (!mutex_trylock(&qat_vdev->state_mutex)) { >>>> + spin_unlock(&qat_vdev->reset_lock); >>>> + return; >>>> + } >>>> + spin_unlock(&qat_vdev->reset_lock); >>>> + qat_vf_state_mutex_unlock(qat_vdev); >>>> +} >>> >>> Do you really need this? I thought this ugly thing was going to be a >>> uniquely mlx5 thing.. >> >> I think that's still required to make the migration state synchronized >> if the VF is reset by other VFIO emulation paths. Is it the case? >> BTW, this implementation is not only in mlx5 driver, but also in other >> Vfio pci variant drivers such as hisilicon acc driver and pds >> driver. > > It had to specifically do with the mm lock interaction that, I > thought, was going to be unique to the mlx driver. Otherwise you could > just directly hold the state_mutex here. > > Yishai do you remember the exact trace for the mmlock entanglement? > I found in my in-box (from more than 2.5 years ago) the below [1] lockdep WARNING when the state_mutex was used directly. Once we moved to the 'deferred_reset' mode it solved that. [1] [ +1.063822] ====================================================== [ +0.000732] WARNING: possible circular locking dependency detected [ +0.000747] 5.15.0-rc3 #236 Not tainted [ +0.000556] ------------------------------------------------------ [ +0.000714] qemu-system-x86/7731 is trying to acquire lock: [ +0.000659] ffff888126c64b78 (&vdev->vma_lock){+.+.}-{3:3}, at: vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.001127] but task is already holding lock: [ +0.000803] ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1] [ +0.001119] which lock already depends on the new lock. 
[ +0.001086] the existing dependency chain (in reverse order) is: [ +0.000910] -> #3 (&mm->mmap_lock#2){++++}-{3:3}: [ +0.000844] __might_fault+0x56/0x80 [ +0.000572] _copy_to_user+0x1e/0x80 [ +0.000556] mlx5vf_pci_mig_rw.cold+0xa1/0x24f [mlx5_vfio_pci] [ +0.000732] vfs_read+0xa8/0x1c0 [ +0.000547] __x64_sys_pread64+0x8c/0xc0 [ +0.000580] do_syscall_64+0x3d/0x90 [ +0.000566] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000682] -> #2 (&mvdev->state_mutex){+.+.}-{3:3}: [ +0.000899] __mutex_lock+0x80/0x9d0 [ +0.000566] mlx5vf_reset_done+0x2c/0x40 [mlx5_vfio_pci] [ +0.000697] vfio_pci_core_ioctl+0x585/0x1020 [vfio_pci_core] [ +0.000721] __x64_sys_ioctl+0x436/0x9a0 [ +0.000588] do_syscall_64+0x3d/0x90 [ +0.000584] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000674] -> #1 (&vdev->memory_lock){+.+.}-{3:3}: [ +0.000843] down_write+0x38/0x70 [ +0.000544] vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30 [vfio_pci_core] [ +0.003705] vfio_basic_config_write+0x1e7/0x280 [vfio_pci_core] [ +0.000744] vfio_pci_config_rw+0x1b7/0x3af [vfio_pci_core] [ +0.000716] vfs_write+0xe6/0x390 [ +0.000539] __x64_sys_pwrite64+0x8c/0xc0 [ +0.000603] do_syscall_64+0x3d/0x90 [ +0.000572] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000661] -> #0 (&vdev->vma_lock){+.+.}-{3:3}: [ +0.000828] __lock_acquire+0x1244/0x2280 [ +0.000596] lock_acquire+0xc2/0x2c0 [ +0.000556] __mutex_lock+0x80/0x9d0 [ +0.000580] vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.000709] __do_fault+0x32/0xa0 [ +0.000556] __handle_mm_fault+0xbe8/0x1450 [ +0.000606] handle_mm_fault+0x6c/0x140 [ +0.000624] fixup_user_fault+0x6b/0x100 [ +0.000600] vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1] [ +0.000726] vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1] [ +0.000736] vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1] [ +0.000752] __x64_sys_ioctl+0x436/0x9a0 [ +0.000588] do_syscall_64+0x3d/0x90 [ +0.000571] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000677] other info that might help us debug this: [ +0.001073] Chain exists of: &vdev->vma_lock --> &mvdev->state_mutex --> &mm->mmap_lock#2 [ +0.001285] Possible unsafe locking scenario: [ +0.000808] CPU0 CPU1 [ +0.000599] ---- ---- [ +0.000593] lock(&mm->mmap_lock#2); [ +0.000530] lock(&mvdev->state_mutex); [ +0.000725] lock(&mm->mmap_lock#2); [ +0.000712] lock(&vdev->vma_lock); [ +0.000532] *** DEADLOCK *** [ +0.000922] 2 locks held by qemu-system-x86/7731: [ +0.000597] #0: ffff88810c9bec88 (&iommu->lock#2){+.+.}-{3:3}, at: vfio_iommu_type1_ioctl+0x189/0x15a0 [vfio_iommu_type1] [ +0.001177] #1: ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1] [ +0.001153] stack backtrace: [ +0.000689] CPU: 1 PID: 7731 Comm: qemu-system-x86 Not tainted 5.15.0-rc3 #236 [ +0.000932] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ +0.001192] Call Trace: [ +0.000454] dump_stack_lvl+0x45/0x59 [ +0.000527] check_noncircular+0xf2/0x110 [ +0.000557] __lock_acquire+0x1244/0x2280 [ +0.002462] lock_acquire+0xc2/0x2c0 [ +0.000534] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.000684] ? lock_is_held_type+0x98/0x110 [ +0.000565] __mutex_lock+0x80/0x9d0 [ +0.000542] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.000676] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.000681] ? mark_held_locks+0x49/0x70 [ +0.000551] ? 
lock_is_held_type+0x98/0x110 [ +0.000575] vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] [ +0.000679] __do_fault+0x32/0xa0 [ +0.000700] __handle_mm_fault+0xbe8/0x1450 [ +0.000571] handle_mm_fault+0x6c/0x140 [ +0.000564] fixup_user_fault+0x6b/0x100 [ +0.000556] vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1] [ +0.000666] ? lock_is_held_type+0x98/0x110 [ +0.000572] vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1] [ +0.000710] vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1] [ +0.001050] ? find_held_lock+0x2b/0x80 [ +0.000561] ? lock_release+0xc2/0x2a0 [ +0.000537] ? __fget_files+0xdc/0x1d0 [ +0.000541] __x64_sys_ioctl+0x436/0x9a0 [ +0.000556] ? lockdep_hardirqs_on_prepare+0xd4/0x170 [ +0.000641] do_syscall_64+0x3d/0x90 [ +0.000529] entry_SYSCALL_64_after_hwframe+0x44/0xae [ +0.000670] RIP: 0033:0x7f54255bb45b [ +0.000541] Code: 0f 1e fa 48 8b 05 2d aa 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd a9 2c 00 f7 d8 64 89 01 48 [ +0.001843] RSP: 002b:00007f5415ca3d38 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 [ +0.000929] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f54255bb45b [ +0.000781] RDX: 00007f5415ca3d70 RSI: 0000000000003b71 RDI: 0000000000000012 [ +0.000775] RBP: 00007f5415ca3da0 R08: 0000000000000000 R09: 0000000000000000 [ +0.000777] R10: 00000000fe002000 R11: 0000000000000206 R12: 0000000000000004 [ +0.000775] R13: 0000000000000000 R14: 00000000fe000000 R15: 00007f5415ca47c0 Yishai
Thanks for the comments, Shameer. > -----Original Message----- > From: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> > Sent: Thursday, February 8, 2024 8:17 PM > To: Zeng, Xin <xin.zeng@intel.com>; herbert@gondor.apana.org.au; > alex.williamson@redhat.com; jgg@nvidia.com; yishaih@nvidia.com; Tian, Kevin > <kevin.tian@intel.com> > Cc: linux-crypto@vger.kernel.org; kvm@vger.kernel.org; qat-linux <qat- > linux@intel.com>; Cao, Yahui <yahui.cao@intel.com> > Subject: RE: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices > > > > +static struct qat_vf_migration_file * > > +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev) > > +{ > > + struct qat_vf_migration_file *migf; > > + int ret; > > + > > + migf = kzalloc(sizeof(*migf), GFP_KERNEL); > > + if (!migf) > > + return ERR_PTR(-ENOMEM); > > + > > + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, > > migf, O_RDONLY); > > + ret = PTR_ERR_OR_ZERO(migf->filp); > > + if (ret) { > > + kfree(migf); > > + return ERR_PTR(ret); > > + } > > + > > + stream_open(migf->filp->f_inode, migf->filp); > > + mutex_init(&migf->lock); > > + > > + ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev); > > + if (ret) { > > + fput(migf->filp); > > + kfree(migf); > > Probably don't need that kfree(migf) here as fput() --> qat_vf_release_file () will > do that. Thanks, it's redundant, will update it in next version. > > > +static int qat_vf_pci_get_data_size(struct vfio_device *vdev, > > + unsigned long *stop_copy_length) > > +{ > > + struct qat_vf_core_device *qat_vdev = container_of(vdev, > > + struct qat_vf_core_device, core_device.vdev); > > + > > + *stop_copy_length = qat_vdev->mdev->state_size; > > Do we need a lock here or this is not changing? Yes, will update it in next version. > > > + return 0; > > +} > > + > > +static const struct vfio_migration_ops qat_vf_pci_mig_ops = { > > + .migration_set_state = qat_vf_pci_set_device_state, > > + .migration_get_state = qat_vf_pci_get_device_state, > > + .migration_get_data_size = qat_vf_pci_get_data_size, > > +}; > > + > > +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev) > > +{ > > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > > + struct qat_vf_core_device, core_device.vdev); > > + > > + qat_vdev->mdev->ops->cleanup(qat_vdev->mdev); > > + qat_vfmig_destroy(qat_vdev->mdev); > > + mutex_destroy(&qat_vdev->state_mutex); > > + vfio_pci_core_release_dev(core_vdev); > > +} > > + > > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) > > +{ > > + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, > > + struct qat_vf_core_device, core_device.vdev); > > + struct qat_migdev_ops *ops; > > + struct qat_mig_dev *mdev; > > + struct pci_dev *parent; > > + int ret, vf_id; > > + > > + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | > > VFIO_MIGRATION_P2P; > > + core_vdev->mig_ops = &qat_vf_pci_mig_ops; > > + > > + ret = vfio_pci_core_init_dev(core_vdev); > > + if (ret) > > + return ret; > > + > > + mutex_init(&qat_vdev->state_mutex); > > + spin_lock_init(&qat_vdev->reset_lock); > > + > > + parent = qat_vdev->core_device.pdev->physfn; > > Can we use pci_physfn() here? Sure, will update it in next version. > > > + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); > > + if (!parent || vf_id < 0) { > > Also if the pci_iov_vf_id() return success I don't think you need to > check for parent and can use directly below. OK, will update it in next version. 
> > > + ret = -ENODEV; > > + goto err_rel; > > + } > > + > > + mdev = qat_vfmig_create(parent, vf_id); > > + if (IS_ERR(mdev)) { > > + ret = PTR_ERR(mdev); > > + goto err_rel; > > + } > > + > > + ops = mdev->ops; > > + if (!ops || !ops->init || !ops->cleanup || > > + !ops->open || !ops->close || > > + !ops->save_state || !ops->load_state || > > + !ops->suspend || !ops->resume) { > > + ret = -EIO; > > + dev_err(&parent->dev, "Incomplete device migration ops > > structure!"); > > + goto err_destroy; > > + } > > If all these ops are a must why cant we move the check inside the > qat_vfmig_create()? > Or rather call them explicitly as suggested by Jason. We can do it, but it might make sense to leave the check to the APIs' user as some of these ops interfaces might be optional for other QAT variant driver in future. Thanks, Xin
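A sketch of the two fixes acknowledged above, keeping the rest of the posted code as is: the save error path relies on fput() reaching qat_vf_release_file() to free migf, and the stop-copy size is read under state_mutex (released through the driver's existing unlock helper):

```c
static struct qat_vf_migration_file *
qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev)
{
	struct qat_vf_migration_file *migf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops,
					migf, O_RDONLY);
	ret = PTR_ERR_OR_ZERO(migf->filp);
	if (ret) {
		kfree(migf);
		return ERR_PTR(ret);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);

	ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev);
	if (ret) {
		/* qat_vf_release_file() frees migf, no explicit kfree() here */
		fput(migf->filp);
		return ERR_PTR(ret);
	}

	migf->mdev = qat_vdev->mdev;
	return migf;
}

static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct qat_vf_core_device *qat_vdev = container_of(vdev,
			struct qat_vf_core_device, core_device.vdev);

	mutex_lock(&qat_vdev->state_mutex);
	*stop_copy_length = qat_vdev->mdev->state_size;
	qat_vf_state_mutex_unlock(qat_vdev);
	return 0;
}
```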
On Tue, Feb 13, 2024 at 01:08:47PM +0000, Zeng, Xin wrote: > > > + ret = -ENODEV; > > > + goto err_rel; > > > + } > > > + > > > + mdev = qat_vfmig_create(parent, vf_id); > > > + if (IS_ERR(mdev)) { > > > + ret = PTR_ERR(mdev); > > > + goto err_rel; > > > + } > > > + > > > + ops = mdev->ops; > > > + if (!ops || !ops->init || !ops->cleanup || > > > + !ops->open || !ops->close || > > > + !ops->save_state || !ops->load_state || > > > + !ops->suspend || !ops->resume) { > > > + ret = -EIO; > > > + dev_err(&parent->dev, "Incomplete device migration ops > > > structure!"); > > > + goto err_destroy; > > > + } > > > > If all these ops are a must why cant we move the check inside the > > qat_vfmig_create()? > > Or rather call them explicitly as suggested by Jason. > > We can do it, but it might make sense to leave the check to the APIs' user > as some of these ops interfaces might be optional for other QAT variant driver > in future. If you have patches already that need ops then you can leave them, but otherwise they should be removed and added back if you come with future patches that require another implementation. Jason
On Sunday, February 11, 2024 4:17 PM, Yishai Hadas <yishaih@nvidia.com> wrote: > >>>> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) > >>>> +{ > >>>> + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); > >>>> + > >>>> + if (!qat_vdev->core_device.vdev.mig_ops) > >>>> + return; > >>>> + > >>>> + /* > >>>> + * As the higher VFIO layers are holding locks across reset and using > >>>> + * those same locks with the mm_lock we need to prevent ABBA > >>> deadlock > >>>> + * with the state_mutex and mm_lock. > >>>> + * In case the state_mutex was taken already we defer the cleanup work > >>>> + * to the unlock flow of the other running context. > >>>> + */ > >>>> + spin_lock(&qat_vdev->reset_lock); > >>>> + qat_vdev->deferred_reset = true; > >>>> + if (!mutex_trylock(&qat_vdev->state_mutex)) { > >>>> + spin_unlock(&qat_vdev->reset_lock); > >>>> + return; > >>>> + } > >>>> + spin_unlock(&qat_vdev->reset_lock); > >>>> + qat_vf_state_mutex_unlock(qat_vdev); > >>>> +} > >>> > >>> Do you really need this? I thought this ugly thing was going to be a > >>> uniquely mlx5 thing.. > >> > >> I think that's still required to make the migration state synchronized > >> if the VF is reset by other VFIO emulation paths. Is it the case? > >> BTW, this implementation is not only in mlx5 driver, but also in other > >> Vfio pci variant drivers such as hisilicon acc driver and pds > >> driver. > > > > It had to specifically do with the mm lock interaction that, I > > thought, was going to be unique to the mlx driver. Otherwise you could > > just directly hold the state_mutex here. > > > > Yishai do you remember the exact trace for the mmlock entanglement? > > > > I found in my in-box (from more than 2.5 years ago) the below [1] > lockdep WARNING when the state_mutex was used directly. > > Once we moved to the 'deferred_reset' mode it solved that. > > [1] > [ +1.063822] ====================================================== > [ +0.000732] WARNING: possible circular locking dependency detected > [ +0.000747] 5.15.0-rc3 #236 Not tainted > [ +0.000556] ------------------------------------------------------ > [ +0.000714] qemu-system-x86/7731 is trying to acquire lock: > [ +0.000659] ffff888126c64b78 (&vdev->vma_lock){+.+.}-{3:3}, at: > vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.001127] > but task is already holding lock: > [ +0.000803] ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: > vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1] > [ +0.001119] > which lock already depends on the new lock. 
> > [ +0.001086] > the existing dependency chain (in reverse order) is: > [ +0.000910] > -> #3 (&mm->mmap_lock#2){++++}-{3:3}: > [ +0.000844] __might_fault+0x56/0x80 > [ +0.000572] _copy_to_user+0x1e/0x80 > [ +0.000556] mlx5vf_pci_mig_rw.cold+0xa1/0x24f [mlx5_vfio_pci] > [ +0.000732] vfs_read+0xa8/0x1c0 > [ +0.000547] __x64_sys_pread64+0x8c/0xc0 > [ +0.000580] do_syscall_64+0x3d/0x90 > [ +0.000566] entry_SYSCALL_64_after_hwframe+0x44/0xae > [ +0.000682] > -> #2 (&mvdev->state_mutex){+.+.}-{3:3}: > [ +0.000899] __mutex_lock+0x80/0x9d0 > [ +0.000566] mlx5vf_reset_done+0x2c/0x40 [mlx5_vfio_pci] > [ +0.000697] vfio_pci_core_ioctl+0x585/0x1020 [vfio_pci_core] > [ +0.000721] __x64_sys_ioctl+0x436/0x9a0 > [ +0.000588] do_syscall_64+0x3d/0x90 > [ +0.000584] entry_SYSCALL_64_after_hwframe+0x44/0xae > [ +0.000674] > -> #1 (&vdev->memory_lock){+.+.}-{3:3}: > [ +0.000843] down_write+0x38/0x70 > [ +0.000544] vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30 > [vfio_pci_core] > [ +0.003705] vfio_basic_config_write+0x1e7/0x280 [vfio_pci_core] > [ +0.000744] vfio_pci_config_rw+0x1b7/0x3af [vfio_pci_core] > [ +0.000716] vfs_write+0xe6/0x390 > [ +0.000539] __x64_sys_pwrite64+0x8c/0xc0 > [ +0.000603] do_syscall_64+0x3d/0x90 > [ +0.000572] entry_SYSCALL_64_after_hwframe+0x44/0xae > [ +0.000661] > -> #0 (&vdev->vma_lock){+.+.}-{3:3}: > [ +0.000828] __lock_acquire+0x1244/0x2280 > [ +0.000596] lock_acquire+0xc2/0x2c0 > [ +0.000556] __mutex_lock+0x80/0x9d0 > [ +0.000580] vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.000709] __do_fault+0x32/0xa0 > [ +0.000556] __handle_mm_fault+0xbe8/0x1450 > [ +0.000606] handle_mm_fault+0x6c/0x140 > [ +0.000624] fixup_user_fault+0x6b/0x100 > [ +0.000600] vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1] > [ +0.000726] vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1] > [ +0.000736] vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1] > [ +0.000752] __x64_sys_ioctl+0x436/0x9a0 > [ +0.000588] do_syscall_64+0x3d/0x90 > [ +0.000571] entry_SYSCALL_64_after_hwframe+0x44/0xae > [ +0.000677] > other info that might help us debug this: > > [ +0.001073] Chain exists of: > &vdev->vma_lock --> &mvdev->state_mutex --> > &mm->mmap_lock#2 > > [ +0.001285] Possible unsafe locking scenario: > > [ +0.000808] CPU0 CPU1 > [ +0.000599] ---- ---- > [ +0.000593] lock(&mm->mmap_lock#2); > [ +0.000530] lock(&mvdev->state_mutex); > [ +0.000725] lock(&mm->mmap_lock#2); > [ +0.000712] lock(&vdev->vma_lock); > [ +0.000532] > *** DEADLOCK *** Thanks for this information, but this flow is not clear to me why it cause deadlock. From this flow, CPU0 is not waiting for any resource held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue to run. Am I missing something? Meanwhile, it seems the trace above is triggered with migration protocol v1, the context of CPU1 listed also seems protocol v1 specific. Is it the case? 
Thanks, Xin > > [ +0.000922] 2 locks held by qemu-system-x86/7731: > [ +0.000597] #0: ffff88810c9bec88 (&iommu->lock#2){+.+.}-{3:3}, at: > vfio_iommu_type1_ioctl+0x189/0x15a0 [vfio_iommu_type1] > [ +0.001177] #1: ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: > vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1] > [ +0.001153] > stack backtrace: > [ +0.000689] CPU: 1 PID: 7731 Comm: qemu-system-x86 Not tainted > 5.15.0-rc3 #236 > [ +0.000932] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS > rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 > [ +0.001192] Call Trace: > [ +0.000454] dump_stack_lvl+0x45/0x59 > [ +0.000527] check_noncircular+0xf2/0x110 > [ +0.000557] __lock_acquire+0x1244/0x2280 > [ +0.002462] lock_acquire+0xc2/0x2c0 > [ +0.000534] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.000684] ? lock_is_held_type+0x98/0x110 > [ +0.000565] __mutex_lock+0x80/0x9d0 > [ +0.000542] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.000676] ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.000681] ? mark_held_locks+0x49/0x70 > [ +0.000551] ? lock_is_held_type+0x98/0x110 > [ +0.000575] vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core] > [ +0.000679] __do_fault+0x32/0xa0 > [ +0.000700] __handle_mm_fault+0xbe8/0x1450 > [ +0.000571] handle_mm_fault+0x6c/0x140 > [ +0.000564] fixup_user_fault+0x6b/0x100 > [ +0.000556] vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1] > [ +0.000666] ? lock_is_held_type+0x98/0x110 > [ +0.000572] vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1] > [ +0.000710] vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1] > [ +0.001050] ? find_held_lock+0x2b/0x80 > [ +0.000561] ? lock_release+0xc2/0x2a0 > [ +0.000537] ? __fget_files+0xdc/0x1d0 > [ +0.000541] __x64_sys_ioctl+0x436/0x9a0 > [ +0.000556] ? lockdep_hardirqs_on_prepare+0xd4/0x170 > [ +0.000641] do_syscall_64+0x3d/0x90 > [ +0.000529] entry_SYSCALL_64_after_hwframe+0x44/0xae > [ +0.000670] RIP: 0033:0x7f54255bb45b > [ +0.000541] Code: 0f 1e fa 48 8b 05 2d aa 2c 00 64 c7 00 26 00 00 00 > 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f > 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd a9 2c 00 f7 d8 64 89 01 48 > [ +0.001843] RSP: 002b:00007f5415ca3d38 EFLAGS: 00000206 ORIG_RAX: > 0000000000000010 > [ +0.000929] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: > 00007f54255bb45b > [ +0.000781] RDX: 00007f5415ca3d70 RSI: 0000000000003b71 RDI: > 0000000000000012 > [ +0.000775] RBP: 00007f5415ca3da0 R08: 0000000000000000 R09: > 0000000000000000 > [ +0.000777] R10: 00000000fe002000 R11: 0000000000000206 R12: > 0000000000000004 > [ +0.000775] R13: 0000000000000000 R14: 00000000fe000000 R15: > 00007f5415ca47c0 > > Yishai
On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: > Thanks for this information, but this flow is not clear to me why it > cause deadlock. From this flow, CPU0 is not waiting for any resource > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue > to run. Am I missing something? At some point it was calling copy_to_user() under the state mutex. These days it doesn't. copy_to_user() would nest the mm_lock under the state mutex which is a locking inversion. So I wonder if we still have this problem now that the copy_to_user() is not under the mutex? Jason
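[Editorial note: the inversion described above amounts to two opposite lock orderings. The sketch below uses made-up foo_* names purely for illustration; it is not code from mlx5, QAT, or this series.]

/*
 * Illustration only: a hypothetical foo driver.  Lockdep ends up seeing two
 * opposite orderings between a driver state mutex and mm->mmap_lock.
 */
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct foo_dev {
	struct mutex state_mutex;
	void *state;
	size_t state_size;
};

/* Ordering 1: state_mutex -> mmap_lock, because copy_to_user() may fault. */
static long foo_read_state(struct foo_dev *foo, void __user *ubuf)
{
	long ret = 0;

	mutex_lock(&foo->state_mutex);
	if (copy_to_user(ubuf, foo->state, foo->state_size))
		ret = -EFAULT;
	mutex_unlock(&foo->state_mutex);
	return ret;
}

/*
 * Ordering 2: mmap_lock -> ... -> state_mutex.  Faulting a vfio-pci BAR
 * mapping takes vfio-pci internal locks under mmap_lock, and those same
 * locks are held around paths (device reset, config writes) that end up
 * taking the migration state mutex.  Together with ordering 1 this closes
 * the cycle reported in the lockdep splat quoted earlier in the thread.
 */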
On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: > To: Zeng, Xin <xin.zeng@intel.com> > Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; > alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; Tian, > Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; > kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui > <yahui.cao@intel.com> > Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: > > > Thanks for this information, but this flow is not clear to me why it > > cause deadlock. From this flow, CPU0 is not waiting for any resource > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue > > to run. Am I missing something? > > At some point it was calling copy_to_user() under the state > mutex. These days it doesn't. > > copy_to_user() would nest the mm_lock under the state mutex which is a > locking inversion. > > So I wonder if we still have this problem now that the copy_to_user() > is not under the mutex? In protocol v2, we still have the scenario in precopy_ioctl where copy_to_user is called under state_mutex. Thanks, Xin
On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote: > On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: > > To: Zeng, Xin <xin.zeng@intel.com> > > Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; > > alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; Tian, > > Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; > > kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui > > <yahui.cao@intel.com> > > Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices > > > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: > > > > > Thanks for this information, but this flow is not clear to me why it > > > cause deadlock. From this flow, CPU0 is not waiting for any resource > > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue > > > to run. Am I missing something? > > > > At some point it was calling copy_to_user() under the state > > mutex. These days it doesn't. > > > > copy_to_user() would nest the mm_lock under the state mutex which is a > > locking inversion. > > > > So I wonder if we still have this problem now that the copy_to_user() > > is not under the mutex? > > In protocol v2, we still have the scenario in precopy_ioctl where copy_to_user is > called under state_mutex. Why? Does mlx5 do that? It looked Ok to me: mlx5vf_state_mutex_unlock(mvdev); if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; Jason
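[Editorial note: the pattern in the quoted mlx5 snippet is to gather everything that needs the state lock, drop it through the deferred-reset-aware unlock helper, and only then touch user memory. A rough sketch of that shape, reusing the hypothetical foo_dev from the earlier sketch; foo_state_mutex_unlock() stands in for a helper like mlx5vf_state_mutex_unlock() and is not a real API.]

#include <linux/vfio.h>

static long foo_precopy_ioctl(struct foo_dev *foo, void __user *arg,
			      unsigned long minsz)
{
	struct vfio_precopy_info info = {};

	mutex_lock(&foo->state_mutex);
	/* Collect the sizes while the migration state cannot change. */
	info.initial_bytes = foo->state_size;
	info.dirty_bytes = 0;
	foo_state_mutex_unlock(foo);	/* deferred-reset-aware unlock */

	/*
	 * The user copy runs with no state lock held, so copy_to_user()
	 * taking mmap_lock on a fault cannot invert the ordering.
	 */
	if (copy_to_user(arg, &info, minsz))
		return -EFAULT;
	return 0;
}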
On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote: > On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote: > > On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: > > > To: Zeng, Xin <xin.zeng@intel.com> > > > Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; > > > alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; > Tian, > > > Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; > > > kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui > > > <yahui.cao@intel.com> > > > Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF > devices > > > > > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: > > > > > > > Thanks for this information, but this flow is not clear to me why it > > > > cause deadlock. From this flow, CPU0 is not waiting for any resource > > > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue > > > > to run. Am I missing something? > > > > > > At some point it was calling copy_to_user() under the state > > > mutex. These days it doesn't. > > > > > > copy_to_user() would nest the mm_lock under the state mutex which is > a > > > locking inversion. > > > > > > So I wonder if we still have this problem now that the copy_to_user() > > > is not under the mutex? > > > > In protocol v2, we still have the scenario in precopy_ioctl where > copy_to_user is > > called under state_mutex. > > Why? Does mlx5 do that? It looked Ok to me: > > mlx5vf_state_mutex_unlock(mvdev); > if (copy_to_user((void __user *)arg, &info, minsz)) > return -EFAULT; Indeed, thanks, Jason. BTW, is there any reason why was "deferred_reset" mode still implemented in mlx5 driver given this deadlock condition has been avoided with migration protocol v2 implementation. Anyway, I'll use state_mutex directly instead of the "deferred_reset" mode in qat variant driver and update it in next version soon, please help to review. Thanks, Xin
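[Editorial note: one possible shape of the simplification Xin describes, sketched under the assumption that nothing in the QAT variant driver copies to or from user space while holding state_mutex. It reuses the helpers from the posted patch below and is only an illustration, not necessarily what the next revision will contain.]

/*
 * Sketch: reset_done() taking state_mutex directly, so reset_lock and
 * deferred_reset are no longer needed.
 */
static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);

	if (!qat_vdev->core_device.vdev.mig_ops)
		return;

	mutex_lock(&qat_vdev->state_mutex);
	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	qat_vf_disable_fds(qat_vdev);
	mutex_unlock(&qat_vdev->state_mutex);
}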
On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote: > On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote: > > On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote: > > > On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: > > > > To: Zeng, Xin <xin.zeng@intel.com> > > > > Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; > > > > alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; > > Tian, > > > > Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; > > > > kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui > > > > <yahui.cao@intel.com> > > > > Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF > > devices > > > > > > > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: > > > > > > > > > Thanks for this information, but this flow is not clear to me why it > > > > > cause deadlock. From this flow, CPU0 is not waiting for any resource > > > > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue > > > > > to run. Am I missing something? > > > > > > > > At some point it was calling copy_to_user() under the state > > > > mutex. These days it doesn't. > > > > > > > > copy_to_user() would nest the mm_lock under the state mutex which is > > a > > > > locking inversion. > > > > > > > > So I wonder if we still have this problem now that the copy_to_user() > > > > is not under the mutex? > > > > > > In protocol v2, we still have the scenario in precopy_ioctl where > > copy_to_user is > > > called under state_mutex. > > > > Why? Does mlx5 do that? It looked Ok to me: > > > > mlx5vf_state_mutex_unlock(mvdev); > > if (copy_to_user((void __user *)arg, &info, minsz)) > > return -EFAULT; > > Indeed, thanks, Jason. BTW, is there any reason why was "deferred_reset" mode > still implemented in mlx5 driver given this deadlock condition has been avoided > with migration protocol v2 implementation. I do not remember. Yishai? Jason
On 21/02/2024 15:18, Jason Gunthorpe wrote: > On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote: >> On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote: >>> On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote: >>>> On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: >>>>> To: Zeng, Xin <xin.zeng@intel.com> >>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; >>>>> alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; >>> Tian, >>>>> Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; >>>>> kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui >>>>> <yahui.cao@intel.com> >>>>> Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF >>> devices >>>>> >>>>> On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: >>>>> >>>>>> Thanks for this information, but this flow is not clear to me why it >>>>>> cause deadlock. From this flow, CPU0 is not waiting for any resource >>>>>> held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue >>>>>> to run. Am I missing something? >>>>> >>>>> At some point it was calling copy_to_user() under the state >>>>> mutex. These days it doesn't. >>>>> >>>>> copy_to_user() would nest the mm_lock under the state mutex which is >>> a >>>>> locking inversion. >>>>> >>>>> So I wonder if we still have this problem now that the copy_to_user() >>>>> is not under the mutex? >>>> >>>> In protocol v2, we still have the scenario in precopy_ioctl where >>> copy_to_user is >>>> called under state_mutex. >>> >>> Why? Does mlx5 do that? It looked Ok to me: >>> >>> mlx5vf_state_mutex_unlock(mvdev); >>> if (copy_to_user((void __user *)arg, &info, minsz)) >>> return -EFAULT; >> >> Indeed, thanks, Jason. BTW, is there any reason why was "deferred_reset" mode >> still implemented in mlx5 driver given this deadlock condition has been avoided >> with migration protocol v2 implementation. > > I do not remember. Yishai? > Long time passed.., I also don't fully remember whether this was the only potential problem here, maybe Yes. My plan is to prepare a cleanup patch for mlx5 and put it into our regression for a while, if all will be good, I may send it for the next kernel cycle. By the way, there are other drivers around (i.e. hisi and mtty) that still run copy_to_user() under the state mutex and might hit the problem without the 'deferred_rest', see here[1]. If we'll reach to the conclusion that the only reason for that mechanism was the copy_to_user() under the state_mutex, those drivers can change their code easily and also send a patch to cleanup the 'deferred_reset'. [1] https://elixir.bootlin.com/linux/v6.8-rc5/source/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c#L808 [2] https://elixir.bootlin.com/linux/v6.8-rc5/source/samples/vfio-mdev/mtty.c#L878 Yishai
On 21/02/2024 17:11, Yishai Hadas wrote: > On 21/02/2024 15:18, Jason Gunthorpe wrote: >> On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote: >>> On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote: >>>> On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote: >>>>> On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote: >>>>>> To: Zeng, Xin <xin.zeng@intel.com> >>>>>> Cc: Yishai Hadas <yishaih@nvidia.com>; herbert@gondor.apana.org.au; >>>>>> alex.williamson@redhat.com; shameerali.kolothum.thodi@huawei.com; >>>> Tian, >>>>>> Kevin <kevin.tian@intel.com>; linux-crypto@vger.kernel.org; >>>>>> kvm@vger.kernel.org; qat-linux <qat-linux@intel.com>; Cao, Yahui >>>>>> <yahui.cao@intel.com> >>>>>> Subject: Re: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel >>>>>> QAT VF >>>> devices >>>>>> >>>>>> On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote: >>>>>> >>>>>>> Thanks for this information, but this flow is not clear to me why it >>>>>>> cause deadlock. From this flow, CPU0 is not waiting for any resource >>>>>>> held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue >>>>>>> to run. Am I missing something? >>>>>> >>>>>> At some point it was calling copy_to_user() under the state >>>>>> mutex. These days it doesn't. >>>>>> >>>>>> copy_to_user() would nest the mm_lock under the state mutex which is >>>> a >>>>>> locking inversion. >>>>>> >>>>>> So I wonder if we still have this problem now that the copy_to_user() >>>>>> is not under the mutex? >>>>> >>>>> In protocol v2, we still have the scenario in precopy_ioctl where >>>> copy_to_user is >>>>> called under state_mutex. >>>> >>>> Why? Does mlx5 do that? It looked Ok to me: >>>> >>>> mlx5vf_state_mutex_unlock(mvdev); >>>> if (copy_to_user((void __user *)arg, &info, minsz)) >>>> return -EFAULT; >>> >>> Indeed, thanks, Jason. BTW, is there any reason why was >>> "deferred_reset" mode >>> still implemented in mlx5 driver given this deadlock condition has >>> been avoided >>> with migration protocol v2 implementation. >> >> I do not remember. Yishai? >> > > Long time passed.., I also don't fully remember whether this was the > only potential problem here, maybe Yes. > > My plan is to prepare a cleanup patch for mlx5 and put it into our > regression for a while, if all will be good, I may send it for the next > kernel cycle. > > By the way, there are other drivers around (i.e. hisi and mtty) that > still run copy_to_user() under the state mutex and might hit the problem > without the 'deferred_rest', see here[1]. > > If we'll reach to the conclusion that the only reason for that mechanism > was the copy_to_user() under the state_mutex, those drivers can change > their code easily and also send a patch to cleanup the 'deferred_reset'. > > [1] > https://elixir.bootlin.com/linux/v6.8-rc5/source/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c#L808 > [2] > https://elixir.bootlin.com/linux/v6.8-rc5/source/samples/vfio-mdev/mtty.c#L878 > > Yishai > From an extra code review around, it looks as we still need the 'deferred_reset' flow in mlx5. For example, we call copy_from_user() as part of mlx5vf_resume_read_header() which is called by mlx5vf_resume_write() under the state mutex. So, no change is expected in mlx5. Yishai
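[Editorial note: the mlx5 resume path Yishai refers to has roughly the following shape, per his description; this is a simplified sketch with hypothetical foo_* names, not the actual mlx5 code. The device state lock is held across a copy_from_user(), which is exactly the state-mutex-then-mmap_lock nesting that keeps the deferred_reset scheme necessary there.]

static ssize_t foo_resume_write(struct foo_dev *foo, const char __user *buf,
				size_t len)
{
	ssize_t done = -EFAULT;

	mutex_lock(&foo->state_mutex);
	/* User copy under the state lock: may take mmap_lock on a fault. */
	if (!copy_from_user(foo->state, buf, len))
		done = len;
	mutex_unlock(&foo->state_mutex);

	/*
	 * With this ordering present, reset_done() cannot simply take
	 * state_mutex, so the deferred_reset + trylock scheme stays.
	 */
	return done;
}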
diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a69..c1d3e4cb3892 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23095,6 +23095,14 @@ S: Maintained F: Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst F: drivers/vfio/pci/pds/ +VFIO QAT PCI DRIVER +M: Xin Zeng <xin.zeng@intel.com> +M: Giovanni Cabiddu <giovanni.cabiddu@intel.com> +L: kvm@vger.kernel.org +L: qat-linux@intel.com +S: Supported +F: drivers/vfio/pci/intel/qat/ + VFIO PLATFORM DRIVER M: Eric Auger <eric.auger@redhat.com> L: kvm@vger.kernel.org diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 18c397df566d..329d25c53274 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig" source "drivers/vfio/pci/virtio/Kconfig" +source "drivers/vfio/pci/intel/qat/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 046139a4eca5..a87b6b43ce1c 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ + +obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/ diff --git a/drivers/vfio/pci/intel/qat/Kconfig b/drivers/vfio/pci/intel/qat/Kconfig new file mode 100644 index 000000000000..71b28ac0bf6a --- /dev/null +++ b/drivers/vfio/pci/intel/qat/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +config QAT_VFIO_PCI + tristate "VFIO support for QAT VF PCI devices" + select VFIO_PCI_CORE + depends on CRYPTO_DEV_QAT + depends on CRYPTO_DEV_QAT_4XXX + help + This provides migration support for Intel(R) QAT Virtual Function + using the VFIO framework. + + To compile this as a module, choose M here: the module + will be called qat_vfio_pci. If you don't know what to do here, + say N. 
diff --git a/drivers/vfio/pci/intel/qat/Makefile b/drivers/vfio/pci/intel/qat/Makefile new file mode 100644 index 000000000000..9289ae4c51bf --- /dev/null +++ b/drivers/vfio/pci/intel/qat/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o +qat_vfio_pci-y := main.o + diff --git a/drivers/vfio/pci/intel/qat/main.c b/drivers/vfio/pci/intel/qat/main.c new file mode 100644 index 000000000000..85d0ed701397 --- /dev/null +++ b/drivers/vfio/pci/intel/qat/main.c @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation */ + +#include <linux/anon_inodes.h> +#include <linux/container_of.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/sizes.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio_pci_core.h> +#include <linux/qat/qat_mig_dev.h> + +struct qat_vf_migration_file { + struct file *filp; + /* protects migration region context */ + struct mutex lock; + bool disabled; + struct qat_mig_dev *mdev; +}; + +struct qat_vf_core_device { + struct vfio_pci_core_device core_device; + struct qat_mig_dev *mdev; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protects reset down flow */ + spinlock_t reset_lock; + bool deferred_reset; + struct qat_vf_migration_file *resuming_migf; + struct qat_vf_migration_file *saving_migf; +}; + +static int qat_vf_pci_open_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = + container_of(core_vdev, struct qat_vf_core_device, + core_device.vdev); + struct vfio_pci_core_device *vdev = &qat_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + ret = qat_vdev->mdev->ops->open(qat_vdev->mdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void qat_vf_disable_fd(struct qat_vf_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev) +{ + if (qat_vdev->resuming_migf) { + qat_vf_disable_fd(qat_vdev->resuming_migf); + fput(qat_vdev->resuming_migf->filp); + qat_vdev->resuming_migf = NULL; + } + + if (qat_vdev->saving_migf) { + qat_vf_disable_fd(qat_vdev->saving_migf); + fput(qat_vdev->saving_migf->filp); + qat_vdev->saving_migf = NULL; + } +} + +static void qat_vf_pci_close_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vdev->mdev->ops->close(qat_vdev->mdev); + qat_vf_disable_fds(qat_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static ssize_t qat_vf_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + ssize_t done = 0; + loff_t *offs; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + mutex_lock(&migf->lock); + if (*offs > migf->mdev->state_size || *offs < 0) { + done = -EINVAL; + goto out_unlock; + } + + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + len = min_t(size_t, migf->mdev->state_size - *offs, len); + if (len) { + ret = copy_to_user(buf, migf->mdev->state 
+ *offs, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + done = len; + } + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static int qat_vf_release_file(struct inode *inode, struct file *filp) +{ + struct qat_vf_migration_file *migf = filp->private_data; + + qat_vf_disable_fd(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static const struct file_operations qat_vf_save_fops = { + .owner = THIS_MODULE, + .read = qat_vf_save_read, + .release = qat_vf_release_file, + .llseek = no_llseek, +}; + +static struct qat_vf_migration_file * +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, migf, O_RDONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev); + if (ret) { + fput(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + migf->mdev = qat_vdev->mdev; + + return migf; +} + +static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + loff_t end, *offs; + ssize_t done = 0; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + if (*offs < 0 || + check_add_overflow((loff_t)len, *offs, &end)) + return -EOVERFLOW; + + if (end > migf->mdev->state_size) + return -ENOMEM; + + mutex_lock(&migf->lock); + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + ret = copy_from_user(migf->mdev->state + *offs, buf, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + done = len; + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static const struct file_operations qat_vf_resume_fops = { + .owner = THIS_MODULE, + .write = qat_vf_resume_write, + .release = qat_vf_release_file, + .llseek = no_llseek, +}; + +static struct qat_vf_migration_file * +qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + migf->mdev = qat_vdev->mdev; + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + return migf; +} + +static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev) +{ + return qat_vdev->mdev->ops->load_state(qat_vdev->mdev); +} + +static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new) +{ + u32 cur = qat_vdev->mig_state; + int ret; + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { + ret = qat_vdev->mdev->ops->suspend(qat_vdev->mdev); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_save_device_data(qat_vdev); + if (IS_ERR(migf)) + return 
ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_resume_device_data(qat_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->resuming_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + ret = qat_vf_load_device_data(qat_vdev); + if (ret) + return ERR_PTR(ret); + + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + qat_vdev->mdev->ops->resume(qat_vdev->mdev); + return NULL; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static void qat_vf_state_mutex_unlock(struct qat_vf_core_device *qat_vdev) +{ +again: + spin_lock(&qat_vdev->reset_lock); + if (qat_vdev->deferred_reset) { + qat_vdev->deferred_reset = false; + spin_unlock(&qat_vdev->reset_lock); + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + qat_vf_disable_fds(qat_vdev); + goto again; + } + mutex_unlock(&qat_vdev->state_mutex); + spin_unlock(&qat_vdev->reset_lock); +} + +static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *res = NULL; + int ret; + + mutex_lock(&qat_vdev->state_mutex); + while (new_state != qat_vdev->mig_state) { + ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state, + new_state, &next_state); + if (ret) { + res = ERR_PTR(ret); + break; + } + res = qat_vf_pci_step_device_state(qat_vdev, next_state); + if (IS_ERR(res)) + break; + qat_vdev->mig_state = next_state; + if (WARN_ON(res && new_state != qat_vdev->mig_state)) { + fput(res); + res = ERR_PTR(-EINVAL); + break; + } + } + qat_vf_state_mutex_unlock(qat_vdev); + + return res; +} + +static int qat_vf_pci_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *curr_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + mutex_lock(&qat_vdev->state_mutex); + *curr_state = qat_vdev->mig_state; + qat_vf_state_mutex_unlock(qat_vdev); + + return 0; +} + +static int qat_vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + *stop_copy_length = qat_vdev->mdev->state_size; + return 0; +} + +static const struct vfio_migration_ops qat_vf_pci_mig_ops = { + .migration_set_state = qat_vf_pci_set_device_state, + .migration_get_state = qat_vf_pci_get_device_state, + .migration_get_data_size = qat_vf_pci_get_data_size, +}; + +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vdev->mdev->ops->cleanup(qat_vdev->mdev); + qat_vfmig_destroy(qat_vdev->mdev); + mutex_destroy(&qat_vdev->state_mutex); + vfio_pci_core_release_dev(core_vdev); +} + +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device 
*qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + struct qat_migdev_ops *ops; + struct qat_mig_dev *mdev; + struct pci_dev *parent; + int ret, vf_id; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + core_vdev->mig_ops = &qat_vf_pci_mig_ops; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + mutex_init(&qat_vdev->state_mutex); + spin_lock_init(&qat_vdev->reset_lock); + + parent = qat_vdev->core_device.pdev->physfn; + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); + if (!parent || vf_id < 0) { + ret = -ENODEV; + goto err_rel; + } + + mdev = qat_vfmig_create(parent, vf_id); + if (IS_ERR(mdev)) { + ret = PTR_ERR(mdev); + goto err_rel; + } + + ops = mdev->ops; + if (!ops || !ops->init || !ops->cleanup || + !ops->open || !ops->close || + !ops->save_state || !ops->load_state || + !ops->suspend || !ops->resume) { + ret = -EIO; + dev_err(&parent->dev, "Incomplete device migration ops structure!"); + goto err_destroy; + } + ret = ops->init(mdev); + if (ret) + goto err_destroy; + + qat_vdev->mdev = mdev; + + return 0; + +err_destroy: + qat_vfmig_destroy(mdev); +err_rel: + vfio_pci_core_release_dev(core_vdev); + return ret; +} + +static const struct vfio_device_ops qat_vf_pci_ops = { + .name = "qat-vf-vfio-pci", + .init = qat_vf_pci_init_dev, + .release = qat_vf_pci_release_dev, + .open_device = qat_vf_pci_open_device, + .close_device = qat_vf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev); + + return container_of(core_device, struct qat_vf_core_device, core_device); +} + +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + if (!qat_vdev->core_device.vdev.mig_ops) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock we need to prevent ABBA deadlock + * with the state_mutex and mm_lock. + * In case the state_mutex was taken already we defer the cleanup work + * to the unlock flow of the other running context. 
+ */ + spin_lock(&qat_vdev->reset_lock); + qat_vdev->deferred_reset = true; + if (!mutex_trylock(&qat_vdev->state_mutex)) { + spin_unlock(&qat_vdev->reset_lock); + return; + } + spin_unlock(&qat_vdev->reset_lock); + qat_vf_state_mutex_unlock(qat_vdev); +} + +static int +qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = &pdev->dev; + struct qat_vf_core_device *qat_vdev; + int ret; + + qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops); + if (IS_ERR(qat_vdev)) + return PTR_ERR(qat_vdev); + + pci_set_drvdata(pdev, &qat_vdev->core_device); + ret = vfio_pci_core_register_device(&qat_vdev->core_device); + if (ret) + goto out_put_device; + + return 0; + +out_put_device: + vfio_put_device(&qat_vdev->core_device.vdev); + return ret; +} + +static void qat_vf_vfio_pci_remove(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + vfio_pci_core_unregister_device(&qat_vdev->core_device); + vfio_put_device(&qat_vdev->core_device.vdev); +} + +static const struct pci_device_id qat_vf_vfio_pci_table[] = { + /* Intel QAT GEN4 4xxx VF device */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) }, + {} +}; +MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table); + +static const struct pci_error_handlers qat_vf_err_handlers = { + .reset_done = qat_vf_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver qat_vf_vfio_pci_driver = { + .name = "qat_vfio_pci", + .id_table = qat_vf_vfio_pci_table, + .probe = qat_vf_vfio_pci_probe, + .remove = qat_vf_vfio_pci_remove, + .err_handler = &qat_vf_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(qat_vf_vfio_pci_driver) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family"); +MODULE_IMPORT_NS(CRYPTO_QAT);