
[V2,mlx5-next,12/14] vfio/mlx5: Implement vfio_pci driver for mlx5 devices

Message ID 20211019105838.227569-13-yishaih@nvidia.com (mailing list archive)
State New, archived
Series Add mlx5 live migration driver

Commit Message

Yishai Hadas Oct. 19, 2021, 10:58 a.m. UTC
This patch adds support for vfio_pci driver for mlx5 devices.

It uses vfio_pci_core to register to the VFIO subsystem and then
implements the mlx5 specific logic in the migration area.

The migration implementation follows the definition from uapi/vfio.h and
uses the mlx5 VF->PF command channel to achieve it.

This patch implements the suspend/resume flows.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 MAINTAINERS                    |   6 +
 drivers/vfio/pci/Kconfig       |   3 +
 drivers/vfio/pci/Makefile      |   2 +
 drivers/vfio/pci/mlx5/Kconfig  |  10 +
 drivers/vfio/pci/mlx5/Makefile |   4 +
 drivers/vfio/pci/mlx5/main.c   | 696 +++++++++++++++++++++++++++++++++
 6 files changed, 721 insertions(+)
 create mode 100644 drivers/vfio/pci/mlx5/Kconfig
 create mode 100644 drivers/vfio/pci/mlx5/Makefile
 create mode 100644 drivers/vfio/pci/mlx5/main.c

Comments

Alex Williamson Oct. 19, 2021, 6:43 p.m. UTC | #1
On Tue, 19 Oct 2021 13:58:36 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> This patch adds support for vfio_pci driver for mlx5 devices.
> 
> It uses vfio_pci_core to register to the VFIO subsystem and then
> implements the mlx5 specific logic in the migration area.
> 
> The migration implementation follows the definition from uapi/vfio.h and
> uses the mlx5 VF->PF command channel to achieve it.
> 
> This patch implements the suspend/resume flows.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  MAINTAINERS                    |   6 +
>  drivers/vfio/pci/Kconfig       |   3 +
>  drivers/vfio/pci/Makefile      |   2 +
>  drivers/vfio/pci/mlx5/Kconfig  |  10 +
>  drivers/vfio/pci/mlx5/Makefile |   4 +
>  drivers/vfio/pci/mlx5/main.c   | 696 +++++++++++++++++++++++++++++++++
>  6 files changed, 721 insertions(+)
>  create mode 100644 drivers/vfio/pci/mlx5/Kconfig
>  create mode 100644 drivers/vfio/pci/mlx5/Makefile
>  create mode 100644 drivers/vfio/pci/mlx5/main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index abdcbcfef73d..e824bfab4a01 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -19699,6 +19699,12 @@ L:	kvm@vger.kernel.org
>  S:	Maintained
>  F:	drivers/vfio/platform/
>  
> +VFIO MLX5 PCI DRIVER
> +M:	Yishai Hadas <yishaih@nvidia.com>
> +L:	kvm@vger.kernel.org
> +S:	Maintained
> +F:	drivers/vfio/pci/mlx5/
> +
>  VGA_SWITCHEROO
>  R:	Lukas Wunner <lukas@wunner.de>
>  S:	Maintained
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 860424ccda1b..187b9c259944 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -43,4 +43,7 @@ config VFIO_PCI_IGD
>  
>  	  To enable Intel IGD assignment through vfio-pci, say Y.
>  endif
> +
> +source "drivers/vfio/pci/mlx5/Kconfig"
> +
>  endif
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index 349d68d242b4..ed9d6f2e0555 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -7,3 +7,5 @@ obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
>  vfio-pci-y := vfio_pci.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
>  obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
> +
> +obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
> diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
> new file mode 100644
> index 000000000000..119712656400
> --- /dev/null
> +++ b/drivers/vfio/pci/mlx5/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config MLX5_VFIO_PCI
> +	tristate "VFIO support for MLX5 PCI devices"
> +	depends on MLX5_CORE
> +	select VFIO_PCI_CORE
> +	help
> +	  This provides a migration support for MLX5 devices using the VFIO

s/ a//


> +	  framework.
> +
> +	  If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile
> new file mode 100644
> index 000000000000..689627da7ff5
> --- /dev/null
> +++ b/drivers/vfio/pci/mlx5/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o
> +mlx5-vfio-pci-y := main.o cmd.o
> +
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> new file mode 100644
> index 000000000000..621b7fc60544
> --- /dev/null
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -0,0 +1,696 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> + */
> +
> +#include <linux/device.h>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> +#include <linux/interrupt.h>
> +#include <linux/iommu.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/pci.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/sched/mm.h>
> +#include <linux/vfio_pci_core.h>
> +
> +#include "cmd.h"
> +
> +enum {
> +	MLX5VF_PCI_FREEZED = 1 << 0,
> +};
> +
> +enum {
> +	MLX5VF_REGION_PENDING_BYTES = 1 << 0,
> +	MLX5VF_REGION_DATA_SIZE = 1 << 1,
> +};
> +
> +enum {
> +	MLX5VF_SUPPORTED_DEVICE_STATES = VFIO_DEVICE_STATE_RUNNING |
> +					 VFIO_DEVICE_STATE_SAVING |
> +					 VFIO_DEVICE_STATE_RESUMING,
> +};
> +
> +#define MLX5VF_MIG_REGION_DATA_SIZE SZ_128K
> +/* Data section offset from migration region */
> +#define MLX5VF_MIG_REGION_DATA_OFFSET                                          \
> +	(sizeof(struct vfio_device_migration_info))
> +
> +#define VFIO_DEVICE_MIGRATION_OFFSET(x)                                        \
> +	(offsetof(struct vfio_device_migration_info, x))
> +
> +struct mlx5vf_pci_migration_info {
> +	u32 vfio_dev_state; /* VFIO_DEVICE_STATE_XXX */
> +	u32 dev_state; /* device migration state */
> +	u32 region_state; /* Use MLX5VF_REGION_XXX */
> +	u16 vhca_id;
> +	struct mlx5_vhca_state_data vhca_state_data;
> +};
> +
> +struct mlx5vf_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +	u8 migrate_cap:1;
> +	/* protect migartion state */

s/migartion/migration/

> +	struct mutex state_mutex;
> +	struct mlx5vf_pci_migration_info vmig;
> +};
> +
> +static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	return mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev,
> +				      mvdev->vmig.vhca_id,
> +				      MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_MASTER);
> +}
> +
> +static int mlx5vf_pci_quiesce_device(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	return mlx5vf_cmd_suspend_vhca(
> +		mvdev->core_device.pdev, mvdev->vmig.vhca_id,
> +		MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_MASTER);
> +}
> +
> +static int mlx5vf_pci_unfreeze_device(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	int ret;
> +
> +	ret = mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev,
> +				     mvdev->vmig.vhca_id,
> +				     MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_SLAVE);
> +	if (ret)
> +		return ret;
> +
> +	mvdev->vmig.dev_state &= ~MLX5VF_PCI_FREEZED;
> +	return 0;
> +}
> +
> +static int mlx5vf_pci_freeze_device(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	int ret;
> +
> +	ret = mlx5vf_cmd_suspend_vhca(
> +		mvdev->core_device.pdev, mvdev->vmig.vhca_id,
> +		MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_SLAVE);
> +	if (ret)
> +		return ret;
> +
> +	mvdev->vmig.dev_state |= MLX5VF_PCI_FREEZED;
> +	return 0;
> +}
> +
> +static int mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	u32 state_size = 0;
> +	int ret;
> +
> +	if (!(mvdev->vmig.dev_state & MLX5VF_PCI_FREEZED))
> +		return -EFAULT;
> +
> +	/* If we already read state no reason to re-read */
> +	if (mvdev->vmig.vhca_state_data.state_size)
> +		return 0;
> +
> +	ret = mlx5vf_cmd_query_vhca_migration_state(
> +		mvdev->core_device.pdev, mvdev->vmig.vhca_id, &state_size);
> +	if (ret)
> +		return ret;
> +
> +	return mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev,
> +					  mvdev->vmig.vhca_id, state_size,
> +					  &mvdev->vmig.vhca_state_data);
> +}
> +
> +static int mlx5vf_pci_new_write_window(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data;
> +	u32 num_pages_needed;
> +	u64 allocated_ready;
> +	u32 bytes_needed;
> +
> +	/* Check how many bytes are available from previous flows */
> +	WARN_ON(state_data->num_pages * PAGE_SIZE <
> +		state_data->win_start_offset);
> +	allocated_ready = (state_data->num_pages * PAGE_SIZE) -
> +			  state_data->win_start_offset;
> +	WARN_ON(allocated_ready > MLX5VF_MIG_REGION_DATA_SIZE);
> +
> +	bytes_needed = MLX5VF_MIG_REGION_DATA_SIZE - allocated_ready;
> +	if (!bytes_needed)
> +		return 0;
> +
> +	num_pages_needed = DIV_ROUND_UP_ULL(bytes_needed, PAGE_SIZE);
> +	return mlx5vf_add_migration_pages(state_data, num_pages_needed);
> +}
> +
> +static ssize_t
> +mlx5vf_pci_handle_migration_data_size(struct mlx5vf_pci_core_device *mvdev,
> +				      char __user *buf, bool iswrite)
> +{
> +	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
> +	u64 data_size;
> +	int ret;
> +
> +	if (iswrite) {
> +		/* data_size is writable only during resuming state */
> +		if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_RESUMING)
> +			return -EINVAL;
> +
> +		ret = copy_from_user(&data_size, buf, sizeof(data_size));
> +		if (ret)
> +			return -EFAULT;
> +
> +		vmig->vhca_state_data.state_size += data_size;
> +		vmig->vhca_state_data.win_start_offset += data_size;
> +		ret = mlx5vf_pci_new_write_window(mvdev);
> +		if (ret)
> +			return ret;
> +
> +	} else {
> +		if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_SAVING)
> +			return -EINVAL;
> +
> +		data_size = min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE,
> +				  vmig->vhca_state_data.state_size -
> +				  vmig->vhca_state_data.win_start_offset);
> +		ret = copy_to_user(buf, &data_size, sizeof(data_size));
> +		if (ret)
> +			return -EFAULT;
> +	}
> +
> +	vmig->region_state |= MLX5VF_REGION_DATA_SIZE;
> +	return sizeof(data_size);
> +}
> +
> +static ssize_t
> +mlx5vf_pci_handle_migration_data_offset(struct mlx5vf_pci_core_device *mvdev,
> +					char __user *buf, bool iswrite)
> +{
> +	static const u64 data_offset = MLX5VF_MIG_REGION_DATA_OFFSET;
> +	int ret;
> +
> +	/* RO field */
> +	if (iswrite)
> +		return -EFAULT;
> +
> +	ret = copy_to_user(buf, &data_offset, sizeof(data_offset));
> +	if (ret)
> +		return -EFAULT;
> +
> +	return sizeof(data_offset);
> +}
> +
> +static ssize_t
> +mlx5vf_pci_handle_migration_pending_bytes(struct mlx5vf_pci_core_device *mvdev,
> +					  char __user *buf, bool iswrite)
> +{
> +	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
> +	u64 pending_bytes;
> +	int ret;
> +
> +	/* RO field */
> +	if (iswrite)
> +		return -EFAULT;
> +
> +	if (vmig->vfio_dev_state == (VFIO_DEVICE_STATE_SAVING |
> +				     VFIO_DEVICE_STATE_RUNNING)) {
> +		/* In pre-copy state we have no data to return for now,
> +		 * return 0 pending bytes
> +		 */
> +		pending_bytes = 0;
> +	} else {
> +		if (!vmig->vhca_state_data.state_size)
> +			return 0;
> +		pending_bytes = vmig->vhca_state_data.state_size -
> +				vmig->vhca_state_data.win_start_offset;
> +	}
> +
> +	ret = copy_to_user(buf, &pending_bytes, sizeof(pending_bytes));
> +	if (ret)
> +		return -EFAULT;
> +
> +	/* Window moves forward once data from previous iteration was read */
> +	if (vmig->region_state & MLX5VF_REGION_DATA_SIZE)
> +		vmig->vhca_state_data.win_start_offset +=
> +			min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, pending_bytes);
> +
> +	WARN_ON(vmig->vhca_state_data.win_start_offset >
> +		vmig->vhca_state_data.state_size);
> +
> +	/* New iteration started */
> +	vmig->region_state = MLX5VF_REGION_PENDING_BYTES;
> +	return sizeof(pending_bytes);
> +}
> +
> +static int mlx5vf_load_state(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	if (!mvdev->vmig.vhca_state_data.state_size)
> +		return 0;
> +
> +	return mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev,
> +					  mvdev->vmig.vhca_id,
> +					  &mvdev->vmig.vhca_state_data);
> +}
> +
> +static void mlx5vf_reset_mig_state(struct mlx5vf_pci_core_device *mvdev)
> +{
> +	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
> +
> +	vmig->region_state = 0;
> +	mlx5vf_reset_vhca_state(&vmig->vhca_state_data);
> +}
> +
> +static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
> +				       u32 state)
> +{
> +	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
> +	u32 old_state = vmig->vfio_dev_state;
> +	int ret = 0;
> +
> +	if (old_state == VFIO_DEVICE_STATE_ERROR ||
> +	    !VFIO_DEVICE_STATE_VALID(state) ||
> +	    (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> +		return -EINVAL;
> +
> +	/* Running switches off */
> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
> +	    (old_state & VFIO_DEVICE_STATE_RUNNING)) {
> +		ret = mlx5vf_pci_quiesce_device(mvdev);
> +		if (ret)
> +			return ret;
> +		ret = mlx5vf_pci_freeze_device(mvdev);
> +		if (ret) {
> +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> +			return ret;
> +		}
> +	}
> +
> +	/* Resuming switches off */
> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
> +	    (old_state & VFIO_DEVICE_STATE_RESUMING)) {
> +		/* deserialize state into the device */
> +		ret = mlx5vf_load_state(mvdev);
> +		if (ret) {
> +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> +			return ret;
> +		}
> +	}
> +
> +	/* Resuming switches on */
> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
> +	    (state & VFIO_DEVICE_STATE_RESUMING)) {
> +		mlx5vf_reset_mig_state(mvdev);
> +		ret = mlx5vf_pci_new_write_window(mvdev);
> +		if (ret)
> +			return ret;
> +	}

A couple nits here...

Perhaps:

	if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) {
		/* Resuming bit cleared */
		if (old_state & VFIO_DEVICE_STATE_RESUMING) {
			...
		} else { /* Resuming bit set */
			...
		}
	}

Also

	u32 flipped_bits = old_state ^ state;

or similar would simplify all these cases slightly.
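
For illustration, one possible shape of that simplification for the two
resuming cases above (a sketch only, reusing the names from the patch; not
the actual V3 code):

	u32 flipped_bits = old_state ^ state;

	if (flipped_bits & VFIO_DEVICE_STATE_RESUMING) {
		if (old_state & VFIO_DEVICE_STATE_RESUMING) {
			/* Resuming bit cleared: deserialize state into the device */
			ret = mlx5vf_load_state(mvdev);
			if (ret) {
				vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
				return ret;
			}
		} else {
			/* Resuming bit set: reset and open a fresh write window */
			mlx5vf_reset_mig_state(mvdev);
			ret = mlx5vf_pci_new_write_window(mvdev);
			if (ret)
				return ret;
		}
	}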


> +
> +	/* Saving switches on */
> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) &&
> +	    (state & VFIO_DEVICE_STATE_SAVING)) {
> +		if (!(state & VFIO_DEVICE_STATE_RUNNING)) {
> +			/* serialize post copy */
> +			ret = mlx5vf_pci_save_device_data(mvdev);
> +			if (ret)
> +				return ret;
> +		}
> +	}

This doesn't catch all the cases, and in fact misses the most expected
case where userspace clears the _RUNNING bit while _SAVING is already
enabled.  Does that mean this hasn't actually been tested with QEMU?

It seems like there also needs to be a clause in the case where
_RUNNING switches off to test if _SAVING is already set and has not
toggled.
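
As a sketch of the clause being described (one possible form only; Yishai
proposes a nested variant of this further down the thread):

	/* Running switches off while saving stays on: serialize stop-and-copy */
	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
	    !(state & VFIO_DEVICE_STATE_RUNNING) &&
	    (state & VFIO_DEVICE_STATE_SAVING) &&
	    !((old_state ^ state) & VFIO_DEVICE_STATE_SAVING)) {
		ret = mlx5vf_pci_save_device_data(mvdev);
		if (ret)
			return ret;
	}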


> +
> +	/* Running switches on */
> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
> +	    (state & VFIO_DEVICE_STATE_RUNNING)) {
> +		ret = mlx5vf_pci_unfreeze_device(mvdev);
> +		if (ret)
> +			return ret;
> +		ret = mlx5vf_pci_unquiesce_device(mvdev);
> +		if (ret) {
> +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> +			return ret;
> +		}
> +	}

Per previous discussion, I understand that freeze and quiesce are
loosely stop-responding-to-dma and stop-sending-dma, respectively.
Once we're quiesced and frozen, device state doesn't change.  What are
the implications to userspace that we don't expose a quiesce state
(yet)?  I'm wondering if this needs to be resolved before we introduce
our first in-tree user of the uAPI (and before QEMU support becomes
non-experimental).  Thanks,

Alex
Jason Gunthorpe Oct. 19, 2021, 7:23 p.m. UTC | #2
On Tue, Oct 19, 2021 at 12:43:52PM -0600, Alex Williamson wrote:
> > +	/* Running switches on */
> > +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
> > +	    (state & VFIO_DEVICE_STATE_RUNNING)) {
> > +		ret = mlx5vf_pci_unfreeze_device(mvdev);
> > +		if (ret)
> > +			return ret;
> > +		ret = mlx5vf_pci_unquiesce_device(mvdev);
> > +		if (ret) {
> > +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> > +			return ret;
> > +		}
> > +	}
> 
> Per previous discussion, I understand that freeze and quiesce are
> loosely stop-responding-to-dma and stop-sending-dma, respectively.
> Once we're quiesced and frozen, device state doesn't change.  What are
> the implications to userspace that we don't expose a quiesce state
> (yet)?  I'm wondering if this needs to be resolved before we introduce
> our first in-tree user of the uAPI (and before QEMU support becomes
> non-experimental).  Thanks,

The prototype patch I saw added a 4th bit to the state which was
   1 == 'not dma initiating'
As you suggested I think a cap bit someplace should be defined if the
driver supports the 4th bit.

Otherwise, I think it is backwards compatible, the new logic would be
two ifs

 if ((flipped & STATE_NDMA) &&
      (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_NDMA | STATE_RUNNING)
      mlx5vf_pci_quiesce_device()

 [..]

 if ((flipped == (STATE_NDMA)) &&
      (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_RUNNING)
      mlx5vf_pci_unquiesce_device()

Sequenced before/after the other calls to quiesce_device

So if userspace doesn't use it then the same driver behavior is kept,
as it never sees STATE_NDMA flip

Asking for STATE_NDMA !STATE_RUNNING is just ignored because !RUNNING
already implies NDMA

.. and some optimization of the logic to avoid duplicated work
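
Spelled out with full names, a loose reading of those two predicates might
look as follows (a sketch only; VFIO_DEVICE_STATE_NDMA is hypothetical and
not part of uapi/vfio.h at this point):

	u32 flipped = old_state ^ state;

	/* NDMA flips on while the device stays RUNNING: quiesce only */
	if ((flipped & VFIO_DEVICE_STATE_NDMA) &&
	    (state & (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) ==
	    (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) {
		ret = mlx5vf_pci_quiesce_device(mvdev);
		if (ret)
			return ret;
	}

	/* NDMA flips off while the device stays RUNNING: unquiesce */
	if ((flipped & VFIO_DEVICE_STATE_NDMA) &&
	    (state & (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) ==
	    VFIO_DEVICE_STATE_RUNNING) {
		ret = mlx5vf_pci_unquiesce_device(mvdev);
		if (ret)
			return ret;
	}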

Jason
Alex Williamson Oct. 19, 2021, 8:58 p.m. UTC | #3
On Tue, 19 Oct 2021 16:23:28 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Oct 19, 2021 at 12:43:52PM -0600, Alex Williamson wrote:
> > > +	/* Running switches on */
> > > +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
> > > +	    (state & VFIO_DEVICE_STATE_RUNNING)) {
> > > +		ret = mlx5vf_pci_unfreeze_device(mvdev);
> > > +		if (ret)
> > > +			return ret;
> > > +		ret = mlx5vf_pci_unquiesce_device(mvdev);
> > > +		if (ret) {
> > > +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> > > +			return ret;
> > > +		}
> > > +	}  
> > 
> > Per previous discussion, I understand that freeze and quiesce are
> > loosely stop-responding-to-dma and stop-sending-dma, respectively.
> > Once we're quiesced and frozen, device state doesn't change.  What are
> > the implications to userspace that we don't expose a quiesce state
> > (yet)?  I'm wondering if this needs to be resolved before we introduce
> > our first in-tree user of the uAPI (and before QEMU support becomes
> > non-experimental).  Thanks,  
> 
> The prototype patch I saw added a 4th bit to the state which was
>    1 == 'not dma initiating'
> As you suggested I think a cap bit someplace should be defined if the
> driver supports the 4th bit.
> 
> Otherwise, I think it is backwards compatible, the new logic would be
> two ifs
> 
>  if ((flipped & STATE_NDMA) &&
>       (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_NDMA | STATE_RUNNING)
>       mlx5vf_pci_quiesce_device()
> 
>  [..]
> 
>  if ((flipped == (STATE_NDMA)) &&
>       (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_RUNNING)
>       mlx5vf_pci_unquiesce_device()
> 
> Sequenced before/after the other calls to quiesce_device
> 
> So if userspace doesn't use it then the same driver behavior is kept,
> as it never sees STATE_NDMA flip
> 
> Asking for STATE_NDMA !STATE_RUNNING is just ignored because !RUNNING
> already implies NDMA
> 
> .. and some optimization of the logic to avoid duplicated work

Ok, so this new bit just augments how the device interprets _RUNNING;
it's essentially a don't-care relative to _SAVING or _RESUMING.

I think that gives us this table:

|   NDMA   | RESUMING |  SAVING  |  RUNNING |
+----------+----------+----------+----------+ ---
|     X    |     0    |     0    |     0    |  ^
+----------+----------+----------+----------+  |
|     0    |     0    |     0    |     1    |  |
+----------+----------+----------+----------+  |
|     X    |     0    |     1    |     0    |
+----------+----------+----------+----------+  NDMA value is either compatible
|     0    |     0    |     1    |     1    |  to existing behavior or don't
+----------+----------+----------+----------+  care due to redundancy vs
|     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
+----------+----------+----------+----------+
|     X    |     1    |     0    |     1    |  |
+----------+----------+----------+----------+  |
|     X    |     1    |     1    |     0    |  |
+----------+----------+----------+----------+  |
|     X    |     1    |     1    |     1    |  v
+----------+----------+----------+----------+ ---
|     1    |     0    |     0    |     1    |  ^
+----------+----------+----------+----------+  Desired new useful cases
|     1    |     0    |     1    |     1    |  v
+----------+----------+----------+----------+ ---

Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
set which are simply redundant to the NDMA = 0 cases.  Row 6 remains
invalid due to lack of support for pre-copy (_RESUMING | _RUNNING) and
therefore cannot be set by userspace.  Rows 7 & 8 are error states and
cannot be set by userspace.
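
If such a bit were added, one minimal sketch of the validity rule implied
by the table (hypothetical; VFIO_DEVICE_STATE_NDMA is not defined in
uapi/vfio.h) could treat NDMA as a don't-care and keep the existing
RESUMING exclusivity check:

	#define VFIO_DEVICE_STATE_NDMA	(1 << 3)	/* hypothetical bit */

	/* Rows 6-8 stay invalid; NDMA alone never invalidates a state */
	#define VFIO_DEVICE_STATE_VALID_NDMA(state) \
		VFIO_DEVICE_STATE_VALID((state) & ~VFIO_DEVICE_STATE_NDMA)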

Like other bits, setting the bit should be effective at the completion
of writing device state.  Therefore the device would need to flush any
outbound DMA queues before returning.

The question I was really trying to get to though is whether we have a
supportable interface without such an extension.  There's currently
only an experimental version of vfio migration support for PCI devices
in QEMU (afaik), so it seems like we could make use of the bus-master
bit to fill this gap in QEMU currently, before we claim
non-experimental support, but this new device agnostic extension would
be required for non-PCI device support (and PCI support should adopt it
as available).  Does that sound right?  Thanks,

Alex
Jason Gunthorpe Oct. 19, 2021, 11:04 p.m. UTC | #4
On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:
> I think that gives us this table:
> 
> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> +----------+----------+----------+----------+ ---
> |     X    |     0    |     0    |     0    |  ^
> +----------+----------+----------+----------+  |
> |     0    |     0    |     0    |     1    |  |
> +----------+----------+----------+----------+  |
> |     X    |     0    |     1    |     0    |
> +----------+----------+----------+----------+  NDMA value is either compatible
> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> +----------+----------+----------+----------+  care due to redundancy vs
> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> +----------+----------+----------+----------+
> |     X    |     1    |     0    |     1    |  |
> +----------+----------+----------+----------+  |
> |     X    |     1    |     1    |     0    |  |
> +----------+----------+----------+----------+  |
> |     X    |     1    |     1    |     1    |  v
> +----------+----------+----------+----------+ ---
> |     1    |     0    |     0    |     1    |  ^
> +----------+----------+----------+----------+  Desired new useful cases
> |     1    |     0    |     1    |     1    |  v
> +----------+----------+----------+----------+ ---
> 
> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> set which are simply redundant to the NDMA = 0 cases.  

It seems right

> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> are error states and cannot be set by userspace.

I wonder, did Yishai's series capture this row 6 restriction? Yishai?

> Like other bits, setting the bit should be effective at the completion
> of writing device state.  Therefore the device would need to flush any
> outbound DMA queues before returning.

Yes, the device commands are expected to achieve this.

> The question I was really trying to get to though is whether we have a
> supportable interface without such an extension.  There's currently
> only an experimental version of vfio migration support for PCI devices
> in QEMU (afaik), 

If I recall this only matters if you have a VM that is causing
migratable devices to interact with each other. So long as the devices
are only interacting with the CPU this extra step is not strictly
needed.

So, single device cases can be fine as-is

IMHO the multi-device case the VMM should probably demand this support
from the migration drivers, otherwise it cannot know if it is safe for
sure.

A config option to override the block if the admin knows there is no
use case to cause devices to interact - eg two NVMe devices without
CMB do not have a useful interaction.

> so it seems like we could make use of the bus-master bit to fill
> this gap in QEMU currently, before we claim non-experimental
> support, but this new device agnostic extension would be required
> for non-PCI device support (and PCI support should adopt it as
> available).  Does that sound right?  Thanks,

I don't think the bus master support is really a substitute, tripping
bus master will stop DMA but it will not do so in a clean way and is
likely to be non-transparent to the VM's driver.

The single-device-assigned case is a cleaner restriction, IMHO.

Alternatively we can add the 4th bit and insist that migration drivers
support all the states. I'm just unsure what other HW can do, I get
the feeling people have been designing to the migration description in
the header file for a while and this is a new idea.

Jason
Yishai Hadas Oct. 20, 2021, 8:01 a.m. UTC | #5
On 10/19/2021 9:43 PM, Alex Williamson wrote:
>
>> +
>> +	/* Resuming switches off */
>> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
>> +	    (old_state & VFIO_DEVICE_STATE_RESUMING)) {
>> +		/* deserialize state into the device */
>> +		ret = mlx5vf_load_state(mvdev);
>> +		if (ret) {
>> +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
>> +			return ret;
>> +		}
>> +	}
>> +
>> +	/* Resuming switches on */
>> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
>> +	    (state & VFIO_DEVICE_STATE_RESUMING)) {
>> +		mlx5vf_reset_mig_state(mvdev);
>> +		ret = mlx5vf_pci_new_write_window(mvdev);
>> +		if (ret)
>> +			return ret;
>> +	}
> A couple nits here...
>
> Perhaps:
>
> 	if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) {
> 		/* Resuming bit cleared */
> 		if (old_state & VFIO_DEVICE_STATE_RESUMING) {
> 			...
> 		} else { /* Resuming bit set */
> 			...
> 		}
> 	}

I tried to avoid nested 'if's based on some earlier review notes.

The two 'resuming' cases are already handled above, so functionally the
code covers this.

Jason/Alex,

Please recommend the preferred way; both options seem fine to me.

>
> Also
>
> 	u32 flipped_bits = old_state ^ state;
>
> or similar would simplify all these cases slightly.
>

Sure, will use it in V3.

>> +
>> +	/* Saving switches on */
>> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) &&
>> +	    (state & VFIO_DEVICE_STATE_SAVING)) {
>> +		if (!(state & VFIO_DEVICE_STATE_RUNNING)) {
>> +			/* serialize post copy */
>> +			ret = mlx5vf_pci_save_device_data(mvdev);
>> +			if (ret)
>> +				return ret;
>> +		}
>> +	}
> This doesn't catch all the cases, and in fact misses the most expected
> case where userspace clears the _RUNNING bit while _SAVING is already
> enabled.  Does that mean this hasn't actually been tested with QEMU?


I ran QEMU with 'x-pre-copy-dirty-page-tracking=off' as the current driver
doesn't support dirty-page tracking.

As such, it seems that this flow wasn't triggered by QEMU in my save/load
test.

> It seems like there also needs to be a clause in the case where
> _RUNNING switches off to test if _SAVING is already set and has not
> toggled.
>

This can be achieved by adding the below to the current code; this assumes
that we are fine with nested 'if's.

Seems OK?

@@ -269,6 +269,7 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
 {
        struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
        u32 old_state = vmig->vfio_dev_state;
+       u32 flipped_bits = old_state ^ state;
        int ret = 0;

        if (old_state == VFIO_DEVICE_STATE_ERROR ||
@@ -277,7 +278,7 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
                return -EINVAL;

        /* Running switches off */
-       if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
+       if ((flipped_bits & VFIO_DEVICE_STATE_RUNNING) &&
            (old_state & VFIO_DEVICE_STATE_RUNNING)) {
                ret = mlx5vf_pci_quiesce_device(mvdev);
                if (ret)
@@ -287,10 +288,18 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
                        vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
                        return ret;
                }
+               if (state & VFIO_DEVICE_STATE_SAVING) {
+                       /* serialize post copy */
+                       ret = mlx5vf_pci_save_device_data(mvdev);
+                       if (ret) {
+                               vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
+                               return ret;
+                       }
+               }
        }


Yishai
Yishai Hadas Oct. 20, 2021, 8:28 a.m. UTC | #6
On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:
> On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:
>> I think that gives us this table:
>>
>> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
>> +----------+----------+----------+----------+ ---
>> |     X    |     0    |     0    |     0    |  ^
>> +----------+----------+----------+----------+  |
>> |     0    |     0    |     0    |     1    |  |
>> +----------+----------+----------+----------+  |
>> |     X    |     0    |     1    |     0    |
>> +----------+----------+----------+----------+  NDMA value is either compatible
>> |     0    |     0    |     1    |     1    |  to existing behavior or don't
>> +----------+----------+----------+----------+  care due to redundancy vs
>> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
>> +----------+----------+----------+----------+
>> |     X    |     1    |     0    |     1    |  |
>> +----------+----------+----------+----------+  |
>> |     X    |     1    |     1    |     0    |  |
>> +----------+----------+----------+----------+  |
>> |     X    |     1    |     1    |     1    |  v
>> +----------+----------+----------+----------+ ---
>> |     1    |     0    |     0    |     1    |  ^
>> +----------+----------+----------+----------+  Desired new useful cases
>> |     1    |     0    |     1    |     1    |  v
>> +----------+----------+----------+----------+ ---
>>
>> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
>> set which are simply redundant to the NDMA = 0 cases.
> It seems right
>
>> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
>> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
>> are error states and cannot be set by userspace.
> I wonder, did Yishai's series capture this row 6 restriction? Yishai?


It seems so, by using the below check, which includes the
!VFIO_DEVICE_STATE_VALID clause.

if (old_state == VFIO_DEVICE_STATE_ERROR ||
         !VFIO_DEVICE_STATE_VALID(state) ||
         (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
         return -EINVAL;

Which is:

#define VFIO_DEVICE_STATE_VALID(state) \
     (state & VFIO_DEVICE_STATE_RESUMING ? \
     (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
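
Concretely, for row 6 (_RESUMING | _RUNNING): _RESUMING is set, so the
macro requires (state & VFIO_DEVICE_STATE_MASK) to equal _RESUMING, but the
masked value is _RESUMING | _RUNNING, so VFIO_DEVICE_STATE_VALID() is false
and the set_device_state check above returns -EINVAL.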

>
>> Like other bits, setting the bit should be effective at the completion
>> of writing device state.  Therefore the device would need to flush any
>> outbound DMA queues before returning.
> Yes, the device commands are expected to achieve this.
>
>> The question I was really trying to get to though is whether we have a
>> supportable interface without such an extension.  There's currently
>> only an experimental version of vfio migration support for PCI devices
>> in QEMU (afaik),
> If I recall this only matters if you have a VM that is causing
> migratable devices to interact with each other. So long as the devices
> are only interacting with the CPU this extra step is not strictly
> needed.
>
> So, single device cases can be fine as-is
>
> IMHO the multi-device case the VMM should probably demand this support
> from the migration drivers, otherwise it cannot know if it is safe for
> sure.
>
> A config option to override the block if the admin knows there is no
> use case to cause devices to interact - eg two NVMe devices without
> CMB do not have a useful interaction.
>
>> so it seems like we could make use of the bus-master bit to fill
>> this gap in QEMU currently, before we claim non-experimental
>> support, but this new device agnostic extension would be required
>> for non-PCI device support (and PCI support should adopt it as
>> available).  Does that sound right?  Thanks,
> I don't think the bus master support is really a substitute, tripping
> bus master will stop DMA but it will not do so in a clean way and is
> likely to be non-transparent to the VM's driver.
>
> The single-device-assigned case is a cleaner restriction, IMHO.
>
> Alternatively we can add the 4th bit and insist that migration drivers
> support all the states. I'm just unsure what other HW can do, I get
> the feeling people have been designing to the migration description in
> the header file for a while and this is a new idea.
>
> Jason

Just to be sure,

We are referring here to future functionality enabled by this extra 4th
bit, but it doesn't require any change in the submitted code, right?

The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause,
which rejects any use of an unsupported bit such as this one.

if (old_state == VFIO_DEVICE_STATE_ERROR ||
         !VFIO_DEVICE_STATE_VALID(state) ||
         (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
         return -EINVAL;

Yishai
Jason Gunthorpe Oct. 20, 2021, 4:25 p.m. UTC | #7
On Wed, Oct 20, 2021 at 11:01:01AM +0300, Yishai Hadas wrote:
> On 10/19/2021 9:43 PM, Alex Williamson wrote:
> > 
> > > +
> > > +	/* Resuming switches off */
> > > +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
> > > +	    (old_state & VFIO_DEVICE_STATE_RESUMING)) {
> > > +		/* deserialize state into the device */
> > > +		ret = mlx5vf_load_state(mvdev);
> > > +		if (ret) {
> > > +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
> > > +			return ret;
> > > +		}
> > > +	}
> > > +
> > > +	/* Resuming switches on */
> > > +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
> > > +	    (state & VFIO_DEVICE_STATE_RESUMING)) {
> > > +		mlx5vf_reset_mig_state(mvdev);
> > > +		ret = mlx5vf_pci_new_write_window(mvdev);
> > > +		if (ret)
> > > +			return ret;
> > > +	}
> > A couple nits here...
> > 
> > Perhaps:
> > 
> > 	if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) {
> > 		/* Resuming bit cleared */
> > 		if (old_state & VFIO_DEVICE_STATE_RESUMING) {
> > 			...
> > 		} else { /* Resuming bit set */
> > 			...
> > 		}
> > 	}
> 
> I tried to avoid nested 'if's as of some previous notes.

The layout of the if blocks must follow a logical progression of
the actions toward the chip:

 start precopy tracking
 quiesce
 freeze
 stop precopy tracking
 save state to buffer
 clear state buffer
 load state from buffer
 unfreeze
 unquiesce

The order of the if blocks sets the precedence of the requested
actions, because the bit-field view means userspace can request
multiple actions concurrently.

When adding the precopy actions into the above list we can see that
the current patch's ordering is backwards; save/load should be
swapped.

Idiomatically, each action needs a single edge-triggered predicate,
listed in the right order.

The predicates we define here will become the true ABI for qemu to
follow to operate the HW

Also, the ordering of the predicates influences what usable state
transitions exist.

So, it is really important to get this right.

> I run QEMU with 'x-pre-copy-dirty-page-tracking=off' as current driver
> doesn't support dirty-pages.
> 
> As so, it seems that this flow wasn't triggered by QEMU in my save/load
> test.
> 
> > It seems like there also needs to be a clause in the case where
> > _RUNNING switches off to test if _SAVING is already set and has not
> > toggled.
> > 
> 
> This can be achieved by adding the below to current code, this assumes that
> we are fine with nested 'if's coding.

Like this:

if ((flipped_bits & (RUNNING | SAVING)) &&
     ((old_state & (RUNNING | SAVING)) == (RUNNING|SAVING))
    /* enter pre-copy state */

if ((flipped_bits & (RUNNING | SAVING)) &&
     ((old_state & (RUNNING | SAVING)) != (RUNNING|SAVING))
    /* exit pre-copy state */

if ((flipped_bits & (RUNNING | SAVING)) &&
     ((old_state & (RUNNING | SAVING)) == SAVING))
    mlx5vf_pci_save_device_data()

Jason
Alex Williamson Oct. 20, 2021, 4:52 p.m. UTC | #8
[Cc +dgilbert, +cohuck]

On Wed, 20 Oct 2021 11:28:04 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:
> > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:  
> >> I think that gives us this table:
> >>
> >> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> >> +----------+----------+----------+----------+ ---
> >> |     X    |     0    |     0    |     0    |  ^
> >> +----------+----------+----------+----------+  |
> >> |     0    |     0    |     0    |     1    |  |
> >> +----------+----------+----------+----------+  |
> >> |     X    |     0    |     1    |     0    |
> >> +----------+----------+----------+----------+  NDMA value is either compatible
> >> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> >> +----------+----------+----------+----------+  care due to redundancy vs
> >> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> >> +----------+----------+----------+----------+
> >> |     X    |     1    |     0    |     1    |  |
> >> +----------+----------+----------+----------+  |
> >> |     X    |     1    |     1    |     0    |  |
> >> +----------+----------+----------+----------+  |
> >> |     X    |     1    |     1    |     1    |  v
> >> +----------+----------+----------+----------+ ---
> >> |     1    |     0    |     0    |     1    |  ^
> >> +----------+----------+----------+----------+  Desired new useful cases
> >> |     1    |     0    |     1    |     1    |  v
> >> +----------+----------+----------+----------+ ---
> >>
> >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> >> set which are simply redundant to the NDMA = 0 cases.  
> > It seems right
> >  
> >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> >> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> >> are error states and cannot be set by userspace.  
> > I wonder, did Yishai's series capture this row 6 restriction? Yishai?  
> 
> 
> It seems so,  by using the below check which includes the 
> !VFIO_DEVICE_STATE_VALID clause.
> 
> if (old_state == VFIO_DEVICE_STATE_ERROR ||
>          !VFIO_DEVICE_STATE_VALID(state) ||
>          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
>          return -EINVAL;
> 
> Which is:
> 
> #define VFIO_DEVICE_STATE_VALID(state) \
>      (state & VFIO_DEVICE_STATE_RESUMING ? \
>      (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
> 
> >  
> >> Like other bits, setting the bit should be effective at the completion
> >> of writing device state.  Therefore the device would need to flush any
> >> outbound DMA queues before returning.  
> > Yes, the device commands are expected to achieve this.
> >  
> >> The question I was really trying to get to though is whether we have a
> >> supportable interface without such an extension.  There's currently
> >> only an experimental version of vfio migration support for PCI devices
> >> in QEMU (afaik),  
> > If I recall this only matters if you have a VM that is causing
> > migratable devices to interact with each other. So long as the devices
> > are only interacting with the CPU this extra step is not strictly
> > needed.
> >
> > So, single device cases can be fine as-is
> >
> > IMHO the multi-device case the VMM should probably demand this support
> > from the migration drivers, otherwise it cannot know if it is safe for
> > sure.
> >
> > A config option to override the block if the admin knows there is no
> > use case to cause devices to interact - eg two NVMe devices without
> > CMB do not have a useful interaction.
> >  
> >> so it seems like we could make use of the bus-master bit to fill
> >> this gap in QEMU currently, before we claim non-experimental
> >> support, but this new device agnostic extension would be required
> >> for non-PCI device support (and PCI support should adopt it as
> >> available).  Does that sound right?  Thanks,  
> > I don't think the bus master support is really a substitute, tripping
> > bus master will stop DMA but it will not do so in a clean way and is
> > likely to be non-transparent to the VM's driver.
> >
> > The single-device-assigned case is a cleaner restriction, IMHO.
> >
> > Alternatively we can add the 4th bit and insist that migration drivers
> > support all the states. I'm just unsure what other HW can do, I get
> > the feeling people have been designing to the migration description in
> > the header file for a while and this is a new idea.

I'm wondering if we're imposing extra requirements on the !_RUNNING
state that don't need to be there.  For example, if we can assume that
all devices within a userspace context are !_RUNNING before any of the
devices begin to retrieve final state, then clearing of the _RUNNING
bit becomes the device quiesce point and the beginning of reading
device data is the point at which the device state is frozen and
serialized.  No new states required and essentially works with a slight
rearrangement of the callbacks in this series.  Why can't we do that?

Maybe a clarification of the uAPI spec is sufficient to achieve this,
ex. !_RUNNING devices may still update their internal state machine
based on external access.  Userspace is expected to quiesce all external
access prior to initiating the retrieval of the final device state from
the data section of the migration region.  Failure to do so may result
in inconsistent device state or optionally the device driver may induce
a fault if a quiescent state is not maintained.

> Just to be sure,
> 
> We refer here to some future functionality support with this extra 4th 
> bit but it doesn't enforce any change in the submitted code, right ?
> 
> The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause 
> which fails any usage of a non-supported bit as of this one.
> 
> if (old_state == VFIO_DEVICE_STATE_ERROR ||
>          !VFIO_DEVICE_STATE_VALID(state) ||
>          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
>          return -EINVAL;

Correct, userspace shouldn't be setting any extra bits unless we
advertise support, such as via a capability or flag.  Drivers need to
continue to sanitize user input to validate yet-to-be-defined bits are
not accepted from userspace or else we risk not being able to define
them later without breaking userspace.  Thanks,

Alex
Jason Gunthorpe Oct. 20, 2021, 6:59 p.m. UTC | #9
On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote:

> I'm wondering if we're imposing extra requirements on the !_RUNNING
> state that don't need to be there.  For example, if we can assume that
> all devices within a userspace context are !_RUNNING before any of the
> devices begin to retrieve final state, then clearing of the _RUNNING
> bit becomes the device quiesce point and the beginning of reading
> device data is the point at which the device state is frozen and
> serialized.  No new states required and essentially works with a slight
> rearrangement of the callbacks in this series.  Why can't we do that?

It sounds worth checking carefully. I didn't come up with a major
counter scenario.

We would need to specifically define which user action triggers the
device to freeze and serialize. Reading pending_bytes I suppose?

Since freeze is a device command we need to be able to report failure
and to move the state to error, that feels bad hidden inside reading
pending_bytes.

> Maybe a clarification of the uAPI spec is sufficient to achieve this,
> ex. !_RUNNING devices may still update their internal state machine
> based on external access.  Userspace is expected to quiesce all external
> access prior to initiating the retrieval of the final device state from
> the data section of the migration region.  Failure to do so may result
> in inconsistent device state or optionally the device driver may induce
> a fault if a quiescent state is not maintained.

But on the other hand this seems such a subtle and tricky way to design a
uAPI that devices and users are unlikely to implement it completely
correctly.

IMHO the point of the state is to control the current behavior of the
device - yes we can control behavior on other operations, but it feels
obfuscated.

Especially when the userspace that needs to know about this isn't even
deployed yet, let's just do it cleanly?

Jason
Alex Williamson Oct. 20, 2021, 9:07 p.m. UTC | #10
On Wed, 20 Oct 2021 15:59:19 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote:
> 
> > I'm wondering if we're imposing extra requirements on the !_RUNNING
> > state that don't need to be there.  For example, if we can assume that
> > all devices within a userspace context are !_RUNNING before any of the
> > devices begin to retrieve final state, then clearing of the _RUNNING
> > bit becomes the device quiesce point and the beginning of reading
> > device data is the point at which the device state is frozen and
> > serialized.  No new states required and essentially works with a slight
> > rearrangement of the callbacks in this series.  Why can't we do that?  
> 
> It sounds worth checking carefully. I didn't come up with a major
> counter scenario.
> 
> We would need to specifically define which user action triggers the
> device to freeze and serialize. Reading pending_bytes I suppose?

The first read of pending_bytes after clearing the _RUNNING bit would
be the logical place to do this since that's what we define as the start
of the cycle for reading the device state.

"Freezing" the device is a valid implementation, but I don't think it's
strictly required per the uAPI.  For instance there's no requirement
that pending_bytes is reduced by data_size on each iteration; we
specifically only define that the state is complete when the user reads
a pending_bytes value of zero.  So a driver could restart the device
state if the device continues to change (though it's debatable whether
triggering an -errno on the next migration region access might be a
more supportable approach to enforce that userspace has quiesced
external access).
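
As a rough sketch of that model, reusing names from the patch purely for
illustration (not the actual driver code): freeze and serialize lazily on
the first pending_bytes read after _RUNNING was cleared, and report
failures as -errno from the read without touching device_state:

	static ssize_t
	mlx5vf_pending_bytes_read(struct mlx5vf_pci_core_device *mvdev,
				  char __user *buf)
	{
		struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
		u64 pending_bytes;
		int ret;

		/* First read after !_RUNNING: freeze and serialize now */
		if (!(vmig->dev_state & MLX5VF_PCI_FREEZED)) {
			ret = mlx5vf_pci_freeze_device(mvdev);
			if (ret)
				return ret;
		}
		if (!vmig->vhca_state_data.state_size) {
			ret = mlx5vf_pci_save_device_data(mvdev);
			if (ret)
				return ret;
		}

		pending_bytes = vmig->vhca_state_data.state_size -
				vmig->vhca_state_data.win_start_offset;
		if (copy_to_user(buf, &pending_bytes, sizeof(pending_bytes)))
			return -EFAULT;
		return sizeof(pending_bytes);
	}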

> Since freeze is a device command we need to be able to report failure
> and to move the state to error, that feels bad hidden inside reading
> pending_bytes.

That seems like the wrong model.  Reading pending_bytes can return
-errno should an error occur during freeze/serialize, but device_state
is never implicitly modified.  Upon encountering an error reading
pending_bytes, userspace would abort the migration and move the
device_state to a new value.  This is the point at which the driver
would return another -errno to indicate an unrecoverable internal
error, or honor the requested new state.

> > Maybe a clarification of the uAPI spec is sufficient to achieve this,
> > ex. !_RUNNING devices may still update their internal state machine
> > based on external access.  Userspace is expected to quiesce all external
> > access prior to initiating the retrieval of the final device state from
> > the data section of the migration region.  Failure to do so may result
> > in inconsistent device state or optionally the device driver may induce
> > a fault if a quiescent state is not maintained.  
> 
> But on the other hand this seem so subtle and tricky way to design a
> uAPI that devices and users are unlikely to implement it completely
> correctly.

I think it's only tricky if device_state is implicitly modified by
other actions; stopping all devices before collecting the state of any
of them seems like a reasonable requirement.  It's the same requirement
we'd add with an NDMA bit.

> IMHO the point of the state is to control the current behavior of the
> device - yes we can control behavior on other operations, but it feels
> obfuscated.

It is, don't do that.  Fail the migration region operation, fail the
next device state transition if the internal error is irrecoverable.
 
> Especially when the userspace that needs to know about this isn't even
> deployed yet, let's just do it cleanly?

That's an option, but again I'm wondering if we're imposing a
requirement on the !_RUNNING state that doesn't really exist and a
clean solution exists already.  Thanks,

Alex
Cornelia Huck Oct. 21, 2021, 9:34 a.m. UTC | #11
On Wed, Oct 20 2021, Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 20 Oct 2021 15:59:19 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
>
>> On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote:
>> 
>> > I'm wondering if we're imposing extra requirements on the !_RUNNING
>> > state that don't need to be there.  For example, if we can assume that
>> > all devices within a userspace context are !_RUNNING before any of the
>> > devices begin to retrieve final state, then clearing of the _RUNNING
>> > bit becomes the device quiesce point and the beginning of reading
>> > device data is the point at which the device state is frozen and
>> > serialized.  No new states required and essentially works with a slight
>> > rearrangement of the callbacks in this series.  Why can't we do that?  
>> 
>> It sounds worth checking carefully. I didn't come up with a major
>> counter scenario.
>> 
>> We would need to specifically define which user action triggers the
>> device to freeze and serialize. Reading pending_bytes I suppose?
>
> The first read of pending_bytes after clearing the _RUNNING bit would
> be the logical place to do this since that's what we define as the start
> of the cycle for reading the device state.
>
> "Freezing" the device is a valid implementation, but I don't think it's
> strictly required per the uAPI.  For instance there's no requirement
> that pending_bytes is reduced by data_size on each iteration; we
> specifically only define that the state is complete when the user reads
> a pending_bytes value of zero.  So a driver could restart the device
> state if the device continues to change (though it's debatable whether
> triggering an -errno on the next migration region access might be a
> more supportable approach to enforce that userspace has quiesced
> external access).

Hm, not so sure. From my reading of the uAPI, transitioning from
pre-copy to stop-and-copy (i.e. clearing _RUNNING) implies that we
freeze the device (at least, that's how I interpret "On state transition
from pre-copy to stop-and-copy, the driver must stop the device, save
the device state and send it to the user application through the
migration region.")

Maybe the uAPI is simply not yet clear enough.
Yishai Hadas Oct. 21, 2021, 10:46 a.m. UTC | #12
On 10/20/2021 7:25 PM, Jason Gunthorpe wrote:
> On Wed, Oct 20, 2021 at 11:01:01AM +0300, Yishai Hadas wrote:
>> On 10/19/2021 9:43 PM, Alex Williamson wrote:
>>>> +
>>>> +	/* Resuming switches off */
>>>> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
>>>> +	    (old_state & VFIO_DEVICE_STATE_RESUMING)) {
>>>> +		/* deserialize state into the device */
>>>> +		ret = mlx5vf_load_state(mvdev);
>>>> +		if (ret) {
>>>> +			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
>>>> +			return ret;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	/* Resuming switches on */
>>>> +	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
>>>> +	    (state & VFIO_DEVICE_STATE_RESUMING)) {
>>>> +		mlx5vf_reset_mig_state(mvdev);
>>>> +		ret = mlx5vf_pci_new_write_window(mvdev);
>>>> +		if (ret)
>>>> +			return ret;
>>>> +	}
>>> A couple nits here...
>>>
>>> Perhaps:
>>>
>>> 	if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) {
>>> 		/* Resuming bit cleared */
>>> 		if (old_state & VFIO_DEVICE_STATE_RESUMING) {
>>> 			...
>>> 		} else { /* Resuming bit set */
>>> 			...
>>> 		}
>>> 	}
>> I tried to avoid nested 'if's as of some previous notes.
> The layout of the if blocks must follow a logical progression of
> the actions toward the chip:
>
>   start precopy tracking
>   quiesce
>   freeze
>   stop precopy tracking
>   save state to buffer
>   clear state buffer
>   load state from buffer
>   unfreeze
>   unquiesce
>
> The order of the if blocks sets the precendence of the requested
> actions, because the bit-field view means userspace can request
> multiple actions concurrently.
>
> When adding the precopy actions into the above list we can see that
> the current patches ordering is backwards, save/load should be
> swapped.
>
> Idiomatically each action needs a single edge triggred predicate
> listed in the right order.
>
> The predicates we define here will become the true ABI for qemu to
> follow to operate the HW
>
> Also, the ordering of the predicates influences what usable state
> transitions exist.
>
> So, it is really important to get this right.
>
>> I run QEMU with 'x-pre-copy-dirty-page-tracking=off' as current driver
>> doesn't support dirty-pages.
>>
>> As so, it seems that this flow wasn't triggered by QEMU in my save/load
>> test.
>>
>>> It seems like there also needs to be a clause in the case where
>>> _RUNNING switches off to test if _SAVING is already set and has not
>>> toggled.
>>>
>> This can be achieved by adding the below to current code, this assumes that
>> we are fine with nested 'if's coding.
> Like this:
>
> if ((flipped_bits & (RUNNING | SAVING)) &&
>       ((old_state & (RUNNING | SAVING)) == (RUNNING|SAVING))
>      /* enter pre-copy state */
>
> if ((flipped_bits & (RUNNING | SAVING)) &&
>       ((old_state & (RUNNING | SAVING)) != (RUNNING|SAVING))
>      /* exit pre-copy state */
>
> if ((flipped_bits & (RUNNING | SAVING)) &&
>       ((old_state & (RUNNING | SAVING)) == SAVING))
>      mlx5vf_pci_save_device_data()
>
> Jason

OK, V3 will follow that.

Note: in the above, old_state needs to be the new state.
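
For reference, a sketch of those predicates with that correction applied
(evaluating the new state; a sketch only, the actual V3 code may differ):

	if ((flipped_bits & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) &&
	    (state & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) ==
	    (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) {
		/* enter pre-copy state (no-op while dirty tracking is unsupported) */
	}

	if ((flipped_bits & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) &&
	    (state & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) !=
	    (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) {
		/* exit pre-copy state */
	}

	if ((flipped_bits & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) &&
	    (state & (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) ==
	    VFIO_DEVICE_STATE_SAVING) {
		ret = mlx5vf_pci_save_device_data(mvdev);
		if (ret)
			return ret;
	}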

Thanks,
Yishai
Alex Williamson Oct. 21, 2021, 9:47 p.m. UTC | #13
On Thu, 21 Oct 2021 11:34:00 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

> On Wed, Oct 20 2021, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Wed, 20 Oct 2021 15:59:19 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >  
> >> On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote:
> >>   
> >> > I'm wondering if we're imposing extra requirements on the !_RUNNING
> >> > state that don't need to be there.  For example, if we can assume that
> >> > all devices within a userspace context are !_RUNNING before any of the
> >> > devices begin to retrieve final state, then clearing of the _RUNNING
> >> > bit becomes the device quiesce point and the beginning of reading
> >> > device data is the point at which the device state is frozen and
> >> > serialized.  No new states required and essentially works with a slight
> >> > rearrangement of the callbacks in this series.  Why can't we do that?    
> >> 
> >> It sounds worth checking carefully. I didn't come up with a major
> >> counter scenario.
> >> 
> >> We would need to specifically define which user action triggers the
> >> device to freeze and serialize. Reading pending_bytes I suppose?  
> >
> > The first read of pending_bytes after clearing the _RUNNING bit would
> > be the logical place to do this since that's what we define as the start
> > of the cycle for reading the device state.
> >
> > "Freezing" the device is a valid implementation, but I don't think it's
> > strictly required per the uAPI.  For instance there's no requirement
> > that pending_bytes is reduced by data_size on each iteration; we
> > specifically only define that the state is complete when the user reads
> > a pending_bytes value of zero.  So a driver could restart the device
> > state if the device continues to change (though it's debatable whether
> > triggering an -errno on the next migration region access might be a
> > more supportable approach to enforce that userspace has quiesced
> > external access).  
> 
> Hm, not so sure. From my reading of the uAPI, transitioning from
> pre-copy to stop-and-copy (i.e. clearing _RUNNING) implies that we
> freeze the device (at least, that's how I interpret "On state transition
> from pre-copy to stop-and-copy, the driver must stop the device, save
> the device state and send it to the user application through the
> migration region.")

"[S]end it to the user application through the migration region" is
certainly not something that's encompassed just by clearing the _RUNNING
bit.  There's a sequence of operations there.  If the device is
quiesced for outbound DMA and frozen from inbound DMA (or can
reasonably expect no further inbound DMA) before the user reads the
data, I think that meets the description.

We can certainly clarify the spec in the process if we agree that we
can do this without adding another state bit.
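
As a rough driver-side sketch of that interpretation (not the mlx5
driver; the structure and helper names below are placeholders): the
first pending_bytes read after _RUNNING has been cleared is where the
device gets frozen and its state serialized, later reads just report
what is left.

	#include <linux/types.h>
	#include <linux/vfio.h>

	/* Placeholder migration state, not the mlx5 driver's structures. */
	struct my_vf_mig {
		u32 device_state;
		bool frozen;
		u64 remaining_bytes;
	};

	/* Called when userspace reads pending_bytes in the migration region. */
	static int my_vf_read_pending_bytes(struct my_vf_mig *mig, u64 *val)
	{
		if (!(mig->device_state & VFIO_DEVICE_STATE_RUNNING) &&
		    !mig->frozen) {
			/* freeze internal state and snapshot it (placeholder) */
			int ret = my_vf_freeze_and_serialize(mig);

			if (ret)
				return ret;
			mig->frozen = true;
		}
		*val = mig->remaining_bytes;
		return 0;
	}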

I recall that we previously suggested a very strict interpretation of
clearing the _RUNNING bit, but again I'm questioning if that's a real
requirement or simply a nice-to-have feature for some undefined
debugging capability.  In raising the p2p DMA issue, we can see that a
hard stop independent of other devices is not really practical but I
also don't see that introducing a new state bit solves this problem any
more elegantly than proposed here.  Thanks,

Alex
Jason Gunthorpe Oct. 25, 2021, 12:29 p.m. UTC | #14
On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote:
> I recall that we previously suggested a very strict interpretation of
> clearing the _RUNNING bit, but again I'm questioning if that's a real
> requirement or simply a nice-to-have feature for some undefined
> debugging capability.  In raising the p2p DMA issue, we can see that a
> hard stop independent of other devices is not really practical but I
> also don't see that introducing a new state bit solves this problem any
> more elegantly than proposed here.  Thanks,

I still disagree with this - the level of 'frozenness' of a device is
something that belongs in the defined state exposed to userspace, not
as a hidden internal state that userspace can't see.

It makes the state transitions asymmetric between suspend/resume as
resume does have a defined uAPI state for each level of frozenness and
suspend does not.

With the extra bit resume does:
  
  0000, 0100, 1000, 0001

And suspend does:

  0001, 1001, 0010, 0000

However, without the extra bit suspend is only
  
  001,  010, 000

With hidden state inside the 010

Jason
Alex Williamson Oct. 25, 2021, 2:28 p.m. UTC | #15
On Mon, 25 Oct 2021 09:29:38 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote:
> > I recall that we previously suggested a very strict interpretation of
> > clearing the _RUNNING bit, but again I'm questioning if that's a real
> > requirement or simply a nice-to-have feature for some undefined
> > debugging capability.  In raising the p2p DMA issue, we can see that a
> > hard stop independent of other devices is not really practical but I
> > also don't see that introducing a new state bit solves this problem any
> > more elegantly than proposed here.  Thanks,  
> 
> I still disagree with this - the level of 'frozenness' of a device is
> something that belongs in the defined state exposed to userspace, not
> as a hidden internal state that userspace can't see.
> 
> It makes the state transitions asymmetric between suspend/resume as
> resume does have a defined uAPI state for each level of frozeness and
> suspend does not.
> 
> With the extra bit resume does:
>   
>   0000, 0100, 1000, 0001
> 
> And suspend does:
> 
>   0001, 1001, 0010, 0000
> 
> However, without the extra bit suspend is only
>   
>   001,  010, 000
> 
> With hidden state inside the 010

And what is the device supposed to do if it receives a DMA while in
this strictly defined stopped state?  If it generates an unsupported
request, that can trigger a fatal platform error.  If it silently drops
the DMA, then we have data loss.  We're defining a catch-22 scenario
for drivers versus placing the onus on the user to quiesce the set of
devices in order to consider the migration status as valid.  Thanks,

Alex
Jason Gunthorpe Oct. 25, 2021, 2:56 p.m. UTC | #16
On Mon, Oct 25, 2021 at 08:28:57AM -0600, Alex Williamson wrote:
> On Mon, 25 Oct 2021 09:29:38 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote:
> > > I recall that we previously suggested a very strict interpretation of
> > > clearing the _RUNNING bit, but again I'm questioning if that's a real
> > > requirement or simply a nice-to-have feature for some undefined
> > > debugging capability.  In raising the p2p DMA issue, we can see that a
> > > hard stop independent of other devices is not really practical but I
> > > also don't see that introducing a new state bit solves this problem any
> > > more elegantly than proposed here.  Thanks,  
> > 
> > I still disagree with this - the level of 'frozenness' of a device is
> > something that belongs in the defined state exposed to userspace, not
> > as a hidden internal state that userspace can't see.
> > 
> > It makes the state transitions asymmetric between suspend/resume as
> > resume does have a defined uAPI state for each level of frozeness and
> > suspend does not.
> > 
> > With the extra bit resume does:
> >   
> >   0000, 0100, 1000, 0001
> > 
> > And suspend does:
> > 
> >   0001, 1001, 0010, 0000
> > 
> > However, without the extra bit suspend is only
> >   
> >   001,  010, 000
> > 
> > With hidden state inside the 010
> 
> And what is the device supposed to do if it receives a DMA while in
> this strictly defined stopped state?  If it generates an unsupported
> request, that can trigger a fatal platform error.  

I don't see that this question changes anything, we always have a
state where the device is unable to respond to incoming DMA.

In all cases entry to this state is triggered only by user space
action, if userspace does the ioctls in the wrong order then it will
hit it.

> If it silently drops the DMA, then we have data loss.  We're
> defining a catch-22 scenario for drivers versus placing the onus on
> the user to quiesce the set of devices in order to consider the
> migration status as valid.  

The device should error the TLP.

Userspace must globally fence access to the device before it enters
the device into the state where it errors TLPs.

This is also why I don't like it being so transparent as it is
something userspace needs to care about - especially if the HW cannot
support such a thing, if we intend to allow that.

Jason
Dr. David Alan Gilbert Oct. 25, 2021, 4:34 p.m. UTC | #17
* Alex Williamson (alex.williamson@redhat.com) wrote:
> [Cc +dgilbert, +cohuck]
> 
> On Wed, 20 Oct 2021 11:28:04 +0300
> Yishai Hadas <yishaih@nvidia.com> wrote:
> 
> > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:
> > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:  
> > >> I think that gives us this table:
> > >>
> > >> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> > >> +----------+----------+----------+----------+ ---
> > >> |     X    |     0    |     0    |     0    |  ^
> > >> +----------+----------+----------+----------+  |
> > >> |     0    |     0    |     0    |     1    |  |
> > >> +----------+----------+----------+----------+  |
> > >> |     X    |     0    |     1    |     0    |
> > >> +----------+----------+----------+----------+  NDMA value is either compatible
> > >> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> > >> +----------+----------+----------+----------+  care due to redundancy vs
> > >> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> > >> +----------+----------+----------+----------+
> > >> |     X    |     1    |     0    |     1    |  |
> > >> +----------+----------+----------+----------+  |
> > >> |     X    |     1    |     1    |     0    |  |
> > >> +----------+----------+----------+----------+  |
> > >> |     X    |     1    |     1    |     1    |  v
> > >> +----------+----------+----------+----------+ ---
> > >> |     1    |     0    |     0    |     1    |  ^
> > >> +----------+----------+----------+----------+  Desired new useful cases
> > >> |     1    |     0    |     1    |     1    |  v
> > >> +----------+----------+----------+----------+ ---
> > >>
> > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> > >> set which are simply redundant to the NDMA = 0 cases.  
> > > It seems right
> > >  
> > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> > >> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> > >> are error states and cannot be set by userspace.  
> > > I wonder, did Yishai's series capture this row 6 restriction? Yishai?  
> > 
> > 
> > It seems so,  by using the below check which includes the 
> > !VFIO_DEVICE_STATE_VALID clause.
> > 
> > if (old_state == VFIO_DEVICE_STATE_ERROR ||
> >          !VFIO_DEVICE_STATE_VALID(state) ||
> >          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> >          return -EINVAL;
> > 
> > Which is:
> > 
> > #define VFIO_DEVICE_STATE_VALID(state) \
> >      (state & VFIO_DEVICE_STATE_RESUMING ? \
> >      (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
> > 
> > >  
> > >> Like other bits, setting the bit should be effective at the completion
> > >> of writing device state.  Therefore the device would need to flush any
> > >> outbound DMA queues before returning.  
> > > Yes, the device commands are expected to achieve this.
> > >  
> > >> The question I was really trying to get to though is whether we have a
> > >> supportable interface without such an extension.  There's currently
> > >> only an experimental version of vfio migration support for PCI devices
> > >> in QEMU (afaik),  
> > > If I recall this only matters if you have a VM that is causing
> > > migratable devices to interact with each other. So long as the devices
> > > are only interacting with the CPU this extra step is not strictly
> > > needed.
> > >
> > > So, single device cases can be fine as-is
> > >
> > > IMHO the multi-device case the VMM should probably demand this support
> > > from the migration drivers, otherwise it cannot know if it is safe for
> > > sure.
> > >
> > > A config option to override the block if the admin knows there is no
> > > use case to cause devices to interact - eg two NVMe devices without
> > > CMB do not have a useful interaction.
> > >  
> > >> so it seems like we could make use of the bus-master bit to fill
> > >> this gap in QEMU currently, before we claim non-experimental
> > >> support, but this new device agnostic extension would be required
> > >> for non-PCI device support (and PCI support should adopt it as
> > >> available).  Does that sound right?  Thanks,  
> > > I don't think the bus master support is really a substitute, tripping
> > > bus master will stop DMA but it will not do so in a clean way and is
> > > likely to be non-transparent to the VM's driver.
> > >
> > > The single-device-assigned case is a cleaner restriction, IMHO.
> > >
> > > Alternatively we can add the 4th bit and insist that migration drivers
> > > support all the states. I'm just unsure what other HW can do, I get
> > > the feeling people have been designing to the migration description in
> > > the header file for a while and this is a new idea.
> 
> I'm wondering if we're imposing extra requirements on the !_RUNNING
> state that don't need to be there.  For example, if we can assume that
> all devices within a userspace context are !_RUNNING before any of the
> devices begin to retrieve final state, then clearing of the _RUNNING
> bit becomes the device quiesce point and the beginning of reading
> device data is the point at which the device state is frozen and
> serialized.  No new states required and essentially works with a slight
> rearrangement of the callbacks in this series.  Why can't we do that?

So without me actually understanding your bit encodings that closely, I
think the problem is we have to assume that any transition takes time.
From the QEMU point of view I think the requirement is when we stop the
machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in
migration_completion) that at the point that call returns (with no
error) all devices are idle.  That means you need a way to command the
device to go into the stopped state, and probably another to make sure
it's got there.

Now, you could be a *little* more sloppy; you could allow a device to
carry on doing stuff purely with its own internal state up until the point
it needs to serialise; but that would have to be strictly internal state
only - if it can change any other device's state (or issue an interrupt,
change RAM etc) then you get into ordering issues on the serialisation
of multiple devices.

Dave

> Maybe a clarification of the uAPI spec is sufficient to achieve this,
> ex. !_RUNNING devices may still update their internal state machine
> based on external access.  Userspace is expected to quiesce all external
> access prior to initiating the retrieval of the final device state from
> the data section of the migration region.  Failure to do so may result
> in inconsistent device state or optionally the device driver may induce
> a fault if a quiescent state is not maintained.
> 
> > Just to be sure,
> > 
> > We refer here to some future functionality support with this extra 4th 
> > bit but it doesn't enforce any change in the submitted code, right ?
> > 
> > The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause 
> > which fails any usage of a non-supported bit as of this one.
> > 
> > if (old_state == VFIO_DEVICE_STATE_ERROR ||
> >          !VFIO_DEVICE_STATE_VALID(state) ||
> >          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> >          return -EINVAL;
> 
> Correct, userspace shouldn't be setting any extra bits unless we
> advertise support, such as via a capability or flag.  Drivers need to
> continue to sanitize user input to validate yet-to-be-defined bits are
> not accepted from userspace or else we risk not being able to define
> them later without breaking userspace.  Thanks,
> 
> Alex
>
Alex Williamson Oct. 25, 2021, 5:55 p.m. UTC | #18
On Mon, 25 Oct 2021 17:34:01 +0100
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > [Cc +dgilbert, +cohuck]
> > 
> > On Wed, 20 Oct 2021 11:28:04 +0300
> > Yishai Hadas <yishaih@nvidia.com> wrote:
> >   
> > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:  
> > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:    
> > > >> I think that gives us this table:
> > > >>
> > > >> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> > > >> +----------+----------+----------+----------+ ---
> > > >> |     X    |     0    |     0    |     0    |  ^
> > > >> +----------+----------+----------+----------+  |
> > > >> |     0    |     0    |     0    |     1    |  |
> > > >> +----------+----------+----------+----------+  |
> > > >> |     X    |     0    |     1    |     0    |
> > > >> +----------+----------+----------+----------+  NDMA value is either compatible
> > > >> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> > > >> +----------+----------+----------+----------+  care due to redundancy vs
> > > >> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> > > >> +----------+----------+----------+----------+
> > > >> |     X    |     1    |     0    |     1    |  |
> > > >> +----------+----------+----------+----------+  |
> > > >> |     X    |     1    |     1    |     0    |  |
> > > >> +----------+----------+----------+----------+  |
> > > >> |     X    |     1    |     1    |     1    |  v
> > > >> +----------+----------+----------+----------+ ---
> > > >> |     1    |     0    |     0    |     1    |  ^
> > > >> +----------+----------+----------+----------+  Desired new useful cases
> > > >> |     1    |     0    |     1    |     1    |  v
> > > >> +----------+----------+----------+----------+ ---
> > > >>
> > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> > > >> set which are simply redundant to the NDMA = 0 cases.    
> > > > It seems right
> > > >    
> > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> > > >> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> > > >> are error states and cannot be set by userspace.    
> > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai?    
> > > 
> > > 
> > > It seems so,  by using the below check which includes the 
> > > !VFIO_DEVICE_STATE_VALID clause.
> > > 
> > > if (old_state == VFIO_DEVICE_STATE_ERROR ||
> > >          !VFIO_DEVICE_STATE_VALID(state) ||
> > >          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> > >          return -EINVAL;
> > > 
> > > Which is:
> > > 
> > > #define VFIO_DEVICE_STATE_VALID(state) \
> > >      (state & VFIO_DEVICE_STATE_RESUMING ? \
> > >      (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
> > >   
> > > >    
> > > >> Like other bits, setting the bit should be effective at the completion
> > > >> of writing device state.  Therefore the device would need to flush any
> > > >> outbound DMA queues before returning.    
> > > > Yes, the device commands are expected to achieve this.
> > > >    
> > > >> The question I was really trying to get to though is whether we have a
> > > >> supportable interface without such an extension.  There's currently
> > > >> only an experimental version of vfio migration support for PCI devices
> > > >> in QEMU (afaik),    
> > > > If I recall this only matters if you have a VM that is causing
> > > > migratable devices to interact with each other. So long as the devices
> > > > are only interacting with the CPU this extra step is not strictly
> > > > needed.
> > > >
> > > > So, single device cases can be fine as-is
> > > >
> > > > IMHO the multi-device case the VMM should probably demand this support
> > > > from the migration drivers, otherwise it cannot know if it is safe for
> > > > sure.
> > > >
> > > > A config option to override the block if the admin knows there is no
> > > > use case to cause devices to interact - eg two NVMe devices without
> > > > CMB do not have a useful interaction.
> > > >    
> > > >> so it seems like we could make use of the bus-master bit to fill
> > > >> this gap in QEMU currently, before we claim non-experimental
> > > >> support, but this new device agnostic extension would be required
> > > >> for non-PCI device support (and PCI support should adopt it as
> > > >> available).  Does that sound right?  Thanks,    
> > > > I don't think the bus master support is really a substitute, tripping
> > > > bus master will stop DMA but it will not do so in a clean way and is
> > > > likely to be non-transparent to the VM's driver.
> > > >
> > > > The single-device-assigned case is a cleaner restriction, IMHO.
> > > >
> > > > Alternatively we can add the 4th bit and insist that migration drivers
> > > > support all the states. I'm just unsure what other HW can do, I get
> > > > the feeling people have been designing to the migration description in
> > > > the header file for a while and this is a new idea.  
> > 
> > I'm wondering if we're imposing extra requirements on the !_RUNNING
> > state that don't need to be there.  For example, if we can assume that
> > all devices within a userspace context are !_RUNNING before any of the
> > devices begin to retrieve final state, then clearing of the _RUNNING
> > bit becomes the device quiesce point and the beginning of reading
> > device data is the point at which the device state is frozen and
> > serialized.  No new states required and essentially works with a slight
> > rearrangement of the callbacks in this series.  Why can't we do that?  
> 
> So without me actually understanding your bit encodings that closely, I
> think the problem is we have to asusme that any transition takes time.
> From the QEMU point of view I think the requirement is when we stop the
> machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in
> migration_completion) that at the point that call returns (with no
> error) all devices are idle.  That means you need a way to command the
> device to go into the stopped state, and probably another to make sure
> it's got there.

In a way.  We're essentially recognizing that we cannot stop a single
device in isolation of others that might participate in peer-to-peer
DMA with that device, so we need to make a pass to quiesce each device
before we can ask the device to fully stop.  This new device state bit
is meant to be that quiescent point, devices can accept incoming DMA
but should cease to generate any.  Once all devices are quiesced then we
can safely stop them.

> Now, you could be a *little* more sloppy; you could allow a device carry
> on doing stuff purely with it's own internal state up until the point
> it needs to serialise; but that would have to be strictly internal state
> only - if it can change any other devices state (or issue an interrupt,
> change RAM etc) then you get into ordering issues on the serialisation
> of multiple devices.

Yep, that's the proposal that doesn't require a uAPI change, we loosen
the definition of stopped to mean the device can no longer generate DMA
or interrupts, and all internal processing outside of responding to
incoming DMA should halt (essentially the same as the new quiescent
state above).  Once all devices are in this state, there should be no
incoming DMA and we can safely collect per device migration data.  If
state changes occur beyond the point in time where userspace has
initiated the collection of migration data, drivers have options for
generating errors when userspace consumes that data.
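
One illustrative option for that (sketch only; the names and fields are
placeholders, not the mlx5 implementation): stamp a generation when the
state is serialized and fail the data read if the device mutated since.

	#include <linux/fs.h>

	/* mig->snapshot / snapshot_len / snapshot_gen are placeholder fields
	 * filled in when the device state was serialized. */
	static ssize_t my_vf_read_data(struct my_vf_mig *mig, char __user *buf,
				       size_t len, loff_t off)
	{
		/* Device mutated after serialization, e.g. a peer device was
		 * not quiesced first: refuse to hand out stale data. */
		if (my_vf_hw_generation(mig) != mig->snapshot_gen)
			return -EIO;

		return simple_read_from_buffer(buf, len, &off,
					       mig->snapshot, mig->snapshot_len);
	}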

AFAICT, the two approaches are equally valid.  If we modify the uAPI to
include this new quiescent state then userspace needs to make some hard
choices about what configurations they support without such a feature.
The majority of configurations are likely not exercising p2p between
assigned devices, but the hypervisor can't know that.  If we work
within the existing uAPI, well there aren't any open source driver
implementations yet anyway and any non-upstream implementations would
need to be updated for this clarification.  Existing userspace works
better with no change, so long as they already follow the guideline
that all devices in the userspace context must be stopped before the
migration data of any device can be considered valid.  Thanks,

Alex
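
To make that ordering concrete, a userspace sketch against the existing
v1 migration uAPI might look like the following (illustrative only;
region discovery, pre-copy and error handling are omitted, and
mig_off[] is assumed to hold each device's migration region offset):

	#include <stddef.h>
	#include <unistd.h>
	#include <linux/vfio.h>

	static int stop_all_then_save(int *fd, off_t *mig_off, int ndev)
	{
		__u32 state = VFIO_DEVICE_STATE_SAVING;	/* _RUNNING cleared */
		__u64 pending;
		int i;

		/* Pass 1: stop every device so no peer can generate DMA. */
		for (i = 0; i < ndev; i++)
			if (pwrite(fd[i], &state, sizeof(state), mig_off[i] +
				   offsetof(struct vfio_device_migration_info,
					    device_state)) != sizeof(state))
				return -1;

		/* Pass 2: only now read migration data; the first
		 * pending_bytes read is where a driver may freeze/serialize. */
		for (i = 0; i < ndev; i++) {
			if (pread(fd[i], &pending, sizeof(pending), mig_off[i] +
				  offsetof(struct vfio_device_migration_info,
					   pending_bytes)) != sizeof(pending))
				return -1;
			/* ... loop over data_offset/data_size until pending == 0 ... */
		}
		return 0;
	}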
Dr. David Alan Gilbert Oct. 25, 2021, 6:47 p.m. UTC | #19
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Mon, 25 Oct 2021 17:34:01 +0100
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> 
> > * Alex Williamson (alex.williamson@redhat.com) wrote:
> > > [Cc +dgilbert, +cohuck]
> > > 
> > > On Wed, 20 Oct 2021 11:28:04 +0300
> > > Yishai Hadas <yishaih@nvidia.com> wrote:
> > >   
> > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:  
> > > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:    
> > > > >> I think that gives us this table:
> > > > >>
> > > > >> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> > > > >> +----------+----------+----------+----------+ ---
> > > > >> |     X    |     0    |     0    |     0    |  ^
> > > > >> +----------+----------+----------+----------+  |
> > > > >> |     0    |     0    |     0    |     1    |  |
> > > > >> +----------+----------+----------+----------+  |
> > > > >> |     X    |     0    |     1    |     0    |
> > > > >> +----------+----------+----------+----------+  NDMA value is either compatible
> > > > >> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> > > > >> +----------+----------+----------+----------+  care due to redundancy vs
> > > > >> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> > > > >> +----------+----------+----------+----------+
> > > > >> |     X    |     1    |     0    |     1    |  |
> > > > >> +----------+----------+----------+----------+  |
> > > > >> |     X    |     1    |     1    |     0    |  |
> > > > >> +----------+----------+----------+----------+  |
> > > > >> |     X    |     1    |     1    |     1    |  v
> > > > >> +----------+----------+----------+----------+ ---
> > > > >> |     1    |     0    |     0    |     1    |  ^
> > > > >> +----------+----------+----------+----------+  Desired new useful cases
> > > > >> |     1    |     0    |     1    |     1    |  v
> > > > >> +----------+----------+----------+----------+ ---
> > > > >>
> > > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> > > > >> set which are simply redundant to the NDMA = 0 cases.    
> > > > > It seems right
> > > > >    
> > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> > > > >> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> > > > >> are error states and cannot be set by userspace.    
> > > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai?    
> > > > 
> > > > 
> > > > It seems so,  by using the below check which includes the 
> > > > !VFIO_DEVICE_STATE_VALID clause.
> > > > 
> > > > if (old_state == VFIO_DEVICE_STATE_ERROR ||
> > > >          !VFIO_DEVICE_STATE_VALID(state) ||
> > > >          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> > > >          return -EINVAL;
> > > > 
> > > > Which is:
> > > > 
> > > > #define VFIO_DEVICE_STATE_VALID(state) \
> > > >      (state & VFIO_DEVICE_STATE_RESUMING ? \
> > > >      (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
> > > >   
> > > > >    
> > > > >> Like other bits, setting the bit should be effective at the completion
> > > > >> of writing device state.  Therefore the device would need to flush any
> > > > >> outbound DMA queues before returning.    
> > > > > Yes, the device commands are expected to achieve this.
> > > > >    
> > > > >> The question I was really trying to get to though is whether we have a
> > > > >> supportable interface without such an extension.  There's currently
> > > > >> only an experimental version of vfio migration support for PCI devices
> > > > >> in QEMU (afaik),    
> > > > > If I recall this only matters if you have a VM that is causing
> > > > > migratable devices to interact with each other. So long as the devices
> > > > > are only interacting with the CPU this extra step is not strictly
> > > > > needed.
> > > > >
> > > > > So, single device cases can be fine as-is
> > > > >
> > > > > IMHO the multi-device case the VMM should probably demand this support
> > > > > from the migration drivers, otherwise it cannot know if it is safe for
> > > > > sure.
> > > > >
> > > > > A config option to override the block if the admin knows there is no
> > > > > use case to cause devices to interact - eg two NVMe devices without
> > > > > CMB do not have a useful interaction.
> > > > >    
> > > > >> so it seems like we could make use of the bus-master bit to fill
> > > > >> this gap in QEMU currently, before we claim non-experimental
> > > > >> support, but this new device agnostic extension would be required
> > > > >> for non-PCI device support (and PCI support should adopt it as
> > > > >> available).  Does that sound right?  Thanks,    
> > > > > I don't think the bus master support is really a substitute, tripping
> > > > > bus master will stop DMA but it will not do so in a clean way and is
> > > > > likely to be non-transparent to the VM's driver.
> > > > >
> > > > > The single-device-assigned case is a cleaner restriction, IMHO.
> > > > >
> > > > > Alternatively we can add the 4th bit and insist that migration drivers
> > > > > support all the states. I'm just unsure what other HW can do, I get
> > > > > the feeling people have been designing to the migration description in
> > > > > the header file for a while and this is a new idea.  
> > > 
> > > I'm wondering if we're imposing extra requirements on the !_RUNNING
> > > state that don't need to be there.  For example, if we can assume that
> > > all devices within a userspace context are !_RUNNING before any of the
> > > devices begin to retrieve final state, then clearing of the _RUNNING
> > > bit becomes the device quiesce point and the beginning of reading
> > > device data is the point at which the device state is frozen and
> > > serialized.  No new states required and essentially works with a slight
> > > rearrangement of the callbacks in this series.  Why can't we do that?  
> > 
> > So without me actually understanding your bit encodings that closely, I
> > think the problem is we have to asusme that any transition takes time.
> > From the QEMU point of view I think the requirement is when we stop the
> > machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in
> > migration_completion) that at the point that call returns (with no
> > error) all devices are idle.  That means you need a way to command the
> > device to go into the stopped state, and probably another to make sure
> > it's got there.
> 
> In a way.  We're essentially recognizing that we cannot stop a single
> device in isolation of others that might participate in peer-to-peer
> DMA with that device, so we need to make a pass to quiesce each device
> before we can ask the device to fully stop.  This new device state bit
> is meant to be that quiescent point, devices can accept incoming DMA
> but should cease to generate any.  Once all device are quiesced then we
> can safely stop them.

It may need some further refinement; for example, in that quiesced state
do counters still tick? will a NIC still respond to packets that don't
get forwarded to the host?

Note I still think you need a way to know when you have actually reached
these states; setting a bit in a register is asking nicely for a device
to go into a state - has it got there?

> > Now, you could be a *little* more sloppy; you could allow a device carry
> > on doing stuff purely with it's own internal state up until the point
> > it needs to serialise; but that would have to be strictly internal state
> > only - if it can change any other devices state (or issue an interrupt,
> > change RAM etc) then you get into ordering issues on the serialisation
> > of multiple devices.
> 
> Yep, that's the proposal that doesn't require a uAPI change, we loosen
> the definition of stopped to mean the device can no longer generate DMA
> or interrupts and all internal processing outside or responding to
> incoming DMA should halt (essentially the same as the new quiescent
> state above).  Once all devices are in this state, there should be no
> incoming DMA and we can safely collect per device migration data.  If
> state changes occur beyond the point in time where userspace has
> initiated the collection of migration data, drivers have options for
> generating errors when userspace consumes that data.

How do you know that last device has actually gone into that state?
Also be careful; it feels much more delicate where something might
accidentally start a transaction.

> AFAICT, the two approaches are equally valid.  If we modify the uAPI to
> include this new quiescent state then userspace needs to make some hard
> choices about what configurations they support without such a feature.
> The majority of configurations are likely not exercising p2p between
> assigned devices, but the hypervisor can't know that.  If we work
> within the existing uAPI, well there aren't any open source driver
> implementations yet anyway and any non-upstream implementations would
> need to be updated for this clarification.  Existing userspace works
> better with no change, so long as they already follow the guideline
> that all devices in the userspace context must be stopped before the
> migration data of any device can be considered valid.  Thanks,

Dave

> Alex
>
Jason Gunthorpe Oct. 25, 2021, 7:15 p.m. UTC | #20
On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote:

> It may need some further refinement; for example in that quiesed state
> do counters still tick? will a NIC still respond to packets that don't
> get forwarded to the host?

At least for the mlx5 NIC the two states are 'able to issue outbound
DMA' and 'all internal memories and state are frozen and unchanging'.

The later means it should not respond/count/etc to network packets
either.

Roughly it is
 'able to mutate external state' / 'able to mutate internal state'

The invariant should be that successive calls to serialize the
internal state while frozen should return the same serialization.

We may find that migratable PCI functions must have some limited
functionality. Ie anything with a realtime compoment - for instance a
sync-ethernet clock synchronization control loop, may become
challenging to migrate without creating a serious functional
distortion.

Jason
Dr. David Alan Gilbert Oct. 26, 2021, 8:40 a.m. UTC | #21
* Jason Gunthorpe (jgg@nvidia.com) wrote:
> On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote:
> 
> > It may need some further refinement; for example in that quiesed state
> > do counters still tick? will a NIC still respond to packets that don't
> > get forwarded to the host?
> 
> At least for the mlx5 NIC the two states are 'able to issue outbound
> DMA' and 'all internal memories and state are frozen and unchanging'.

Yeh, so my point was just that if you're adding a new state to this
process, you need to define the details like that.

Dave

> The later means it should not respond/count/etc to network packets
> either.
> 
> Roughly it is
>  'able to mutate external state' / 'able to mutate internal state'
> 
> The invariant should be that successive calls to serialize the
> internal state while frozen should return the same serialization.
> 
> We may find that migratable PCI functions must have some limited
> functionality. Ie anything with a realtime compoment - for instance a
> sync-ethernet clock synchronization control loop, may become
> challenging to migrate without creating a serious functional
> distortion.
> 
> Jason
>
Jason Gunthorpe Oct. 26, 2021, 12:13 p.m. UTC | #22
On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote:
> * Jason Gunthorpe (jgg@nvidia.com) wrote:
> > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote:
> > 
> > > It may need some further refinement; for example in that quiesed state
> > > do counters still tick? will a NIC still respond to packets that don't
> > > get forwarded to the host?
> > 
> > At least for the mlx5 NIC the two states are 'able to issue outbound
> > DMA' and 'all internal memories and state are frozen and unchanging'.
> 
> Yeh, so my point was just that if you're adding a new state to this
> process, you need to define the details like that.

We are not planning to propose any patches/uAPI specification for this
problem until after the mlx5 vfio driver is merged.

Jason
Alex Williamson Oct. 26, 2021, 2:29 p.m. UTC | #23
On Mon, 25 Oct 2021 19:47:29 +0100
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > On Mon, 25 Oct 2021 17:34:01 +0100
> > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> >   
> > > * Alex Williamson (alex.williamson@redhat.com) wrote:  
> > > > [Cc +dgilbert, +cohuck]
> > > > 
> > > > On Wed, 20 Oct 2021 11:28:04 +0300
> > > > Yishai Hadas <yishaih@nvidia.com> wrote:
> > > >     
> > > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote:    
> > > > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote:      
> > > > > >> I think that gives us this table:
> > > > > >>
> > > > > >> |   NDMA   | RESUMING |  SAVING  |  RUNNING |
> > > > > >> +----------+----------+----------+----------+ ---
> > > > > >> |     X    |     0    |     0    |     0    |  ^
> > > > > >> +----------+----------+----------+----------+  |
> > > > > >> |     0    |     0    |     0    |     1    |  |
> > > > > >> +----------+----------+----------+----------+  |
> > > > > >> |     X    |     0    |     1    |     0    |
> > > > > >> +----------+----------+----------+----------+  NDMA value is either compatible
> > > > > >> |     0    |     0    |     1    |     1    |  to existing behavior or don't
> > > > > >> +----------+----------+----------+----------+  care due to redundancy vs
> > > > > >> |     X    |     1    |     0    |     0    |  !_RUNNING/INVALID/ERROR
> > > > > >> +----------+----------+----------+----------+
> > > > > >> |     X    |     1    |     0    |     1    |  |
> > > > > >> +----------+----------+----------+----------+  |
> > > > > >> |     X    |     1    |     1    |     0    |  |
> > > > > >> +----------+----------+----------+----------+  |
> > > > > >> |     X    |     1    |     1    |     1    |  v
> > > > > >> +----------+----------+----------+----------+ ---
> > > > > >> |     1    |     0    |     0    |     1    |  ^
> > > > > >> +----------+----------+----------+----------+  Desired new useful cases
> > > > > >> |     1    |     0    |     1    |     1    |  v
> > > > > >> +----------+----------+----------+----------+ ---
> > > > > >>
> > > > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can
> > > > > >> set which are simply redundant to the NDMA = 0 cases.      
> > > > > > It seems right
> > > > > >      
> > > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING
> > > > > >> | _RUNNING) and therefore cannot be set by userspace.  Rows 7 & 8
> > > > > >> are error states and cannot be set by userspace.      
> > > > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai?      
> > > > > 
> > > > > 
> > > > > It seems so,  by using the below check which includes the 
> > > > > !VFIO_DEVICE_STATE_VALID clause.
> > > > > 
> > > > > if (old_state == VFIO_DEVICE_STATE_ERROR ||
> > > > >          !VFIO_DEVICE_STATE_VALID(state) ||
> > > > >          (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
> > > > >          return -EINVAL;
> > > > > 
> > > > > Which is:
> > > > > 
> > > > > #define VFIO_DEVICE_STATE_VALID(state) \
> > > > >      (state & VFIO_DEVICE_STATE_RESUMING ? \
> > > > >      (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
> > > > >     
> > > > > >      
> > > > > >> Like other bits, setting the bit should be effective at the completion
> > > > > >> of writing device state.  Therefore the device would need to flush any
> > > > > >> outbound DMA queues before returning.      
> > > > > > Yes, the device commands are expected to achieve this.
> > > > > >      
> > > > > >> The question I was really trying to get to though is whether we have a
> > > > > >> supportable interface without such an extension.  There's currently
> > > > > >> only an experimental version of vfio migration support for PCI devices
> > > > > >> in QEMU (afaik),      
> > > > > > If I recall this only matters if you have a VM that is causing
> > > > > > migratable devices to interact with each other. So long as the devices
> > > > > > are only interacting with the CPU this extra step is not strictly
> > > > > > needed.
> > > > > >
> > > > > > So, single device cases can be fine as-is
> > > > > >
> > > > > > IMHO the multi-device case the VMM should probably demand this support
> > > > > > from the migration drivers, otherwise it cannot know if it is safe for
> > > > > > sure.
> > > > > >
> > > > > > A config option to override the block if the admin knows there is no
> > > > > > use case to cause devices to interact - eg two NVMe devices without
> > > > > > CMB do not have a useful interaction.
> > > > > >      
> > > > > >> so it seems like we could make use of the bus-master bit to fill
> > > > > >> this gap in QEMU currently, before we claim non-experimental
> > > > > >> support, but this new device agnostic extension would be required
> > > > > >> for non-PCI device support (and PCI support should adopt it as
> > > > > >> available).  Does that sound right?  Thanks,      
> > > > > > I don't think the bus master support is really a substitute, tripping
> > > > > > bus master will stop DMA but it will not do so in a clean way and is
> > > > > > likely to be non-transparent to the VM's driver.
> > > > > >
> > > > > > The single-device-assigned case is a cleaner restriction, IMHO.
> > > > > >
> > > > > > Alternatively we can add the 4th bit and insist that migration drivers
> > > > > > support all the states. I'm just unsure what other HW can do, I get
> > > > > > the feeling people have been designing to the migration description in
> > > > > > the header file for a while and this is a new idea.    
> > > > 
> > > > I'm wondering if we're imposing extra requirements on the !_RUNNING
> > > > state that don't need to be there.  For example, if we can assume that
> > > > all devices within a userspace context are !_RUNNING before any of the
> > > > devices begin to retrieve final state, then clearing of the _RUNNING
> > > > bit becomes the device quiesce point and the beginning of reading
> > > > device data is the point at which the device state is frozen and
> > > > serialized.  No new states required and essentially works with a slight
> > > > rearrangement of the callbacks in this series.  Why can't we do that?    
> > > 
> > > So without me actually understanding your bit encodings that closely, I
> > > think the problem is we have to asusme that any transition takes time.
> > > From the QEMU point of view I think the requirement is when we stop the
> > > machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in
> > > migration_completion) that at the point that call returns (with no
> > > error) all devices are idle.  That means you need a way to command the
> > > device to go into the stopped state, and probably another to make sure
> > > it's got there.  
> > 
> > In a way.  We're essentially recognizing that we cannot stop a single
> > device in isolation of others that might participate in peer-to-peer
> > DMA with that device, so we need to make a pass to quiesce each device
> > before we can ask the device to fully stop.  This new device state bit
> > is meant to be that quiescent point, devices can accept incoming DMA
> > but should cease to generate any.  Once all device are quiesced then we
> > can safely stop them.  
> 
> It may need some further refinement; for example in that quiesed state
> do counters still tick? will a NIC still respond to packets that don't
> get forwarded to the host?

I'd think no, but I imagine it's largely device specific to what extent
a device can be fully halted yet minimally handle incoming DMA.
 
> Note I still think you need a way to know when you have actually reached
> these states; setting a bit in a register is asking nicely for a device
> to go into a state - has it got there?

It's more than asking nicely: we define the device_state bits as
synchronous; the device needs to enter the state before returning from
the write operation, or return an errno.

> > > Now, you could be a *little* more sloppy; you could allow a device carry
> > > on doing stuff purely with it's own internal state up until the point
> > > it needs to serialise; but that would have to be strictly internal state
> > > only - if it can change any other devices state (or issue an interrupt,
> > > change RAM etc) then you get into ordering issues on the serialisation
> > > of multiple devices.  
> > 
> > Yep, that's the proposal that doesn't require a uAPI change, we loosen
> > the definition of stopped to mean the device can no longer generate DMA
> > or interrupts and all internal processing outside or responding to
> > incoming DMA should halt (essentially the same as the new quiescent
> > state above).  Once all devices are in this state, there should be no
> > incoming DMA and we can safely collect per device migration data.  If
> > state changes occur beyond the point in time where userspace has
> > initiated the collection of migration data, drivers have options for
> > generating errors when userspace consumes that data.  
> 
> How do you know that last device has actually gone into that state?

Each device cannot know that on its own; the burden is on the user to make sure all devices
are stopped before proceeding to read migration data.

> Also be careful; it feels much more delicate where something might
> accidentally start a transaction.

This sounds like a discussion of theoretically broken drivers.  Like
the above device_state, drivers still have a synchronization point when
the user reads the pending_bytes field to initiate retrieving the
device state.  If the implementation requires the device to be fully
stopped to snapshot the device state to provide to the user, this is
where that would happen.  Thanks,

Alex
Alex Williamson Oct. 26, 2021, 2:42 p.m. UTC | #24
On Mon, 25 Oct 2021 11:56:46 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Oct 25, 2021 at 08:28:57AM -0600, Alex Williamson wrote:
> > On Mon, 25 Oct 2021 09:29:38 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote:  
> > > > I recall that we previously suggested a very strict interpretation of
> > > > clearing the _RUNNING bit, but again I'm questioning if that's a real
> > > > requirement or simply a nice-to-have feature for some undefined
> > > > debugging capability.  In raising the p2p DMA issue, we can see that a
> > > > hard stop independent of other devices is not really practical but I
> > > > also don't see that introducing a new state bit solves this problem any
> > > > more elegantly than proposed here.  Thanks,    
> > > 
> > > I still disagree with this - the level of 'frozenness' of a device is
> > > something that belongs in the defined state exposed to userspace, not
> > > as a hidden internal state that userspace can't see.
> > > 
> > > It makes the state transitions asymmetric between suspend/resume as
> > > resume does have a defined uAPI state for each level of frozeness and
> > > suspend does not.
> > > 
> > > With the extra bit resume does:
> > >   
> > >   0000, 0100, 1000, 0001
> > > 
> > > And suspend does:
> > > 
> > >   0001, 1001, 0010, 0000
> > > 
> > > However, without the extra bit suspend is only
> > >   
> > >   001,  010, 000
> > > 
> > > With hidden state inside the 010  
> > 
> > And what is the device supposed to do if it receives a DMA while in
> > this strictly defined stopped state?  If it generates an unsupported
> > request, that can trigger a fatal platform error.    
> 
> I don't see that this question changes anything, we always have a
> state where the device is unable to respond to incoming DMA.

I think that depends on the device implementation.  If all devices can
receive incoming DMA, but all devices are also quiesced not to send
DMA, there's not necessarily a need to put the device in a state where
it errors TLPs.  This ventures into conversations about why assigning
VFs can be considered safer than assigning PFs: users cannot disable
memory space of VFs and therefore cannot generate URs on writes to
MMIO, which may generate fatal faults on some platforms.  If we create
a uAPI that requires dropping TLPs, then we provide userspace with a
means to specifically generate those faults.

> In all cases entry to this state is triggered only by user space
> action, if userspace does the ioctls in the wrong order then it will
> hit it.

And if userspace does not quiesce DMA and gets an intermediate device
state, that's a failure to follow the protocol.

> > If it silently drops the DMA, then we have data loss.  We're
> > defining a catch-22 scenario for drivers versus placing the onus on
> > the user to quiesce the set of devices in order to consider the
> > migration status as valid.    
> 
> The device should error the TLP.

That's a bit of a landmine as outlined above.
 
> Userspace must globally fence access to the device before it enters
> the device into the state where it errors TLPs.
> 
> This is also why I don't like it being so transparent as it is
> something userspace needs to care about - especially if the HW cannot
> support such a thing, if we intend to allow that.

Userspace does need to care, but userspace's concern over this should
not be able to compromise the platform and therefore making VF
assignment more susceptible to fatal error conditions to comply with a
migration uAPI is troublesome for me.  Thanks,

Alex
Dr. David Alan Gilbert Oct. 26, 2021, 2:51 p.m. UTC | #25
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Mon, 25 Oct 2021 19:47:29 +0100
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> 
> > * Alex Williamson (alex.williamson@redhat.com) wrote:
> > > On Mon, 25 Oct 2021 17:34:01 +0100
> > > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> > >   
> > > > * Alex Williamson (alex.williamson@redhat.com) wrote:  
> > > > > [Cc +dgilbert, +cohuck]
> > > > > 
> > > > > On Wed, 20 Oct 2021 11:28:04 +0300
> > > > > Yishai Hadas <yishaih@nvidia.com> wrote:
> > > > >     

<snip>

> > > In a way.  We're essentially recognizing that we cannot stop a single
> > > device in isolation of others that might participate in peer-to-peer
> > > DMA with that device, so we need to make a pass to quiesce each device
> > > before we can ask the device to fully stop.  This new device state bit
> > > is meant to be that quiescent point, devices can accept incoming DMA
> > > but should cease to generate any.  Once all device are quiesced then we
> > > can safely stop them.  
> > 
> > It may need some further refinement; for example in that quiesed state
> > do counters still tick? will a NIC still respond to packets that don't
> > get forwarded to the host?
> 
> I'd think no, but I imagine it's largely device specific to what extent
> a device can be fully halted yet minimally handle incoming DMA.

That's what worries me; we're adding a new state here as we understand
more about trying to implement a device; but it seems that we need to
nail something down as to what the state means.

> > Note I still think you need a way to know when you have actually reached
> > these states; setting a bit in a register is asking nicely for a device
> > to go into a state - has it got there?
> 
> It's more than asking nicely, we define the device_state bits as
> synchronous, the device needs to enter the state before returning from
> the write operation or return an errno.

I don't see how it can be synchronous in practice; can it really wait to
complete if it has to take many cycles to finish off an inflight DMA
before it transitions?

> > > > Now, you could be a *little* more sloppy; you could allow a device carry
> > > > on doing stuff purely with it's own internal state up until the point
> > > > it needs to serialise; but that would have to be strictly internal state
> > > > only - if it can change any other devices state (or issue an interrupt,
> > > > change RAM etc) then you get into ordering issues on the serialisation
> > > > of multiple devices.  
> > > 
> > > Yep, that's the proposal that doesn't require a uAPI change, we loosen
> > > the definition of stopped to mean the device can no longer generate DMA
> > > or interrupts and all internal processing outside or responding to
> > > incoming DMA should halt (essentially the same as the new quiescent
> > > state above).  Once all devices are in this state, there should be no
> > > incoming DMA and we can safely collect per device migration data.  If
> > > state changes occur beyond the point in time where userspace has
> > > initiated the collection of migration data, drivers have options for
> > > generating errors when userspace consumes that data.  
> > 
> > How do you know that last device has actually gone into that state?
> 
> Each device cannot, the burden is on the user to make sure all devices
> are stopped before proceeding to read migration data.

Yeh this really ties to the previous question; if it's synchronous
you're OK.

> > Also be careful; it feels much more delicate where something might
> > accidentally start a transaction.
> 
> This sounds like a discussion of theoretically broken drivers.  Like
> the above device_state, drivers still have a synchronization point when
> the user reads the pending_bytes field to initiate retrieving the
> device state.  If the implementation requires the device to be fully
> stopped to snapshot the device state to provide to the user, this is
> where that would happen.  Thanks,

Yes, but I worry that some ways of defining it are harder to get right
in drivers, so less likely to be theoretical.

Dave

> Alex
>
Alex Williamson Oct. 26, 2021, 2:52 p.m. UTC | #26
On Tue, 26 Oct 2021 09:13:53 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote:
> > * Jason Gunthorpe (jgg@nvidia.com) wrote:  
> > > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote:
> > >   
> > > > It may need some further refinement; for example in that quiesed state
> > > > do counters still tick? will a NIC still respond to packets that don't
> > > > get forwarded to the host?  
> > > 
> > > At least for the mlx5 NIC the two states are 'able to issue outbound
> > > DMA' and 'all internal memories and state are frozen and unchanging'.  
> > 
> > Yeh, so my point was just that if you're adding a new state to this
> > process, you need to define the details like that.  
> 
> We are not planning to propose any patches/uAPI specification for this
> problem until after the mlx5 vfio driver is merged..

I'm not super comfortable with that.  If we're expecting to add a new
bit to define a quiescent state prior to clearing the running flag and
this is an optional device feature that userspace migration needs to be
aware of and it's really not clear from a hypervisor when p2p DMA might
be in use, I think that leaves userspace in a pickle as to how and when
they'd impose restrictions on assignment with multiple assigned
devices.  It's likely that the majority of initial use cases wouldn't
need this feature, which would make it difficult to arbitrarily impose
later.

OTOH, if we define !_RUNNING as quiescent and userspace reading
pending_bytes as the point by which the user is responsible for
quiescing all devices and the device state becomes stable (or drivers
can generate errors during collection of device state if that proves
otherwise), then I think existing userspace doesn't care about this
issue.  Thanks,

Alex
Jason Gunthorpe Oct. 26, 2021, 3:18 p.m. UTC | #27
On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:

> > This is also why I don't like it being so transparent as it is
> > something userspace needs to care about - especially if the HW cannot
> > support such a thing, if we intend to allow that.
> 
> Userspace does need to care, but userspace's concern over this should
> not be able to compromise the platform and therefore making VF
> assignment more susceptible to fatal error conditions to comply with a
> migration uAPI is troublesome for me.

It is an interesting scenario.

I think it points to the fact that we are not implementing this fully properly.

The !RUNNING state should be like your reset efforts.

All access to the MMIO memories from userspace should be revoked
during !RUNNING

All VMAs zap'd.

All IOMMU peer mappings invalidated.

The kernel should directly block userspace from causing an MMIO TLP
before the device driver goes to !RUNNING.

Then the question of what the device does at this edge is not
relevant as hostile userspace cannot trigger it.

The logical way to implement this is to key off running and
block/unblock MMIO access when !RUNNING.

To me this strongly suggests that the extra bit is the correct way
forward as the driver is much simpler to implement and understand if
RUNNING directly controls the availability of MMIO instead of having
an irregular case where !RUNNING still allows MMIO but only until a
pending_bytes read.
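
As a rough sketch of what keying off RUNNING could look like (the
my_vf_* names and structure are placeholders, and whether a variant
driver reuses vfio-pci's memory_lock machinery or something like the
below is left open): clearing _RUNNING zaps the BAR mmaps and the fault
handler refuses to re-establish them until _RUNNING is set again.

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/mutex.h>

	struct my_vf_device {		/* placeholder, not a real structure */
		struct mutex state_mutex;
		bool mmio_blocked;
		struct inode *inode;	/* owner of the device's mmap mapping */
		unsigned long bar_pfn;
	};

	static void my_vf_block_mmio(struct my_vf_device *mvdev)
	{
		mutex_lock(&mvdev->state_mutex);
		mvdev->mmio_blocked = true;
		/* tear down existing userspace mappings of the BARs */
		unmap_mapping_range(mvdev->inode->i_mapping, 0, 0, true);
		mutex_unlock(&mvdev->state_mutex);
	}

	static vm_fault_t my_vf_mmap_fault(struct vm_fault *vmf)
	{
		struct my_vf_device *mvdev = vmf->vma->vm_private_data;
		vm_fault_t ret = VM_FAULT_SIGBUS;

		/* serialize against my_vf_block_mmio() above */
		mutex_lock(&mvdev->state_mutex);
		if (!mvdev->mmio_blocked)
			ret = vmf_insert_pfn(vmf->vma, vmf->address,
					     mvdev->bar_pfn + vmf->pgoff);
		mutex_unlock(&mvdev->state_mutex);
		return ret;
	}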

Given the complexity of this, can we move ahead with the current
mlx5_vfio and Yishai&co can come up with some followup proposal to
split the freeze/quiesce and block MMIO?

Jason
Jason Gunthorpe Oct. 26, 2021, 3:25 p.m. UTC | #28
On Tue, Oct 26, 2021 at 03:51:21PM +0100, Dr. David Alan Gilbert wrote:

> > It's more than asking nicely, we define the device_state bits as
> > synchronous, the device needs to enter the state before returning from
> > the write operation or return an errno.
> 
> I don't see how it can be synchronous in practice; can it really wait to
> complete if it has to take many cycles to finish off an inflight DMA
> before it transitions?

The fencing of outbound DMAs in the device must be synchronous; how
could anything work if it isn't?

Jason
Jason Gunthorpe Oct. 26, 2021, 3:56 p.m. UTC | #29
On Tue, Oct 26, 2021 at 08:52:10AM -0600, Alex Williamson wrote:
> On Tue, 26 Oct 2021 09:13:53 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote:
> > > * Jason Gunthorpe (jgg@nvidia.com) wrote:  
> > > > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote:
> > > >   
> > > > > It may need some further refinement; for example in that quiesed state
> > > > > do counters still tick? will a NIC still respond to packets that don't
> > > > > get forwarded to the host?  
> > > > 
> > > > At least for the mlx5 NIC the two states are 'able to issue outbound
> > > > DMA' and 'all internal memories and state are frozen and unchanging'.  
> > > 
> > > Yeh, so my point was just that if you're adding a new state to this
> > > process, you need to define the details like that.  
> > 
> > We are not planning to propose any patches/uAPI specification for this
> > problem until after the mlx5 vfio driver is merged..
> 
> I'm not super comfortable with that.  If we're expecting to add a new
> bit to define a quiescent state prior to clearing the running flag and
> this is an optional device feature that userspace migration needs to be
> aware of and it's really not clear from a hypervisor when p2p DMA might
> be in use, I think that leaves userspace in a pickle how and when
> they'd impose restrictions on assignment with multiple assigned
> devices.  It's likely that the majority of initial use cases wouldn't
> need this feature, which would make it difficult to arbitrarily impose
> later.

Either supporting a freeze/quiesce split is an optional feature, and
current mlx5_vfio is an example of a driver that does not implement
it.

Or, freeze/quiesce is a mandatory HW feature and no VFIO driver will
be merged that doesn't implement it - even if the HW cannot support
it.

Frankly, I think we will see HW that doesn't support this and VFIO
will be required to have it as an optional feature

What does userspace do? I don't know - but it does need to be aware if
the optional HW feature is available and it does need to make choices
if the VM will be migratable or not.

I don't see any path where userspace truly cannot care unless HW
support for this is mandatory.

Jason
Alex Williamson Oct. 26, 2021, 7:50 p.m. UTC | #30
On Tue, 26 Oct 2021 12:18:51 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> 
> > > This is also why I don't like it being so transparent as it is
> > > something userspace needs to care about - especially if the HW cannot
> > > support such a thing, if we intend to allow that.  
> > 
> > Userspace does need to care, but userspace's concern over this should
> > not be able to compromise the platform and therefore making VF
> > assignment more susceptible to fatal error conditions to comply with a
> > migration uAPI is troublesome for me.  
> 
> It is an interesting scenario.
> 
> I think it points that we are not implementing this fully properly.
> 
> The !RUNNING state should be like your reset efforts.
> 
> All access to the MMIO memories from userspace should be revoked
> during !RUNNING
> 
> All VMAs zap'd.
> 
> All IOMMU peer mappings invalidated.
> 
> The kernel should directly block userspace from causing a MMIO TLP
> before the device driver goes to !RUNNING.
> 
> Then the question of what the device does at this edge is not
> relevant as hostile userspace cannot trigger it.
> 
> The logical way to implement this is to key off running and
> block/unblock MMIO access when !RUNNING.
> 
> To me this strongly suggests that the extra bit is the correct way
> forward as the driver is much simpler to implement and understand if
> RUNNING directly controls the availability of MMIO instead of having
> an irregular case where !RUNNING still allows MMIO but only until a
> pending_bytes read.
> 
> Given the complexity of this can we move ahead with the current
> mlx5_vfio and Yishai&co can come with some followup proposal to split
> the freeze/queice and block MMIO?
> the freeze/quiesce and block MMIO?

I know how much we want this driver in, but I'm surprised that you're
advocating to cut-and-run with an implementation where we've identified
a potentially significant gap with some hand waving that we'll resolve
it later.

Deciding at some point in the future to forcefully block device MMIO
access from userspace when the device stops running is clearly a user
visible change and therefore subject to the don't-break-userspace
clause.  It also seems to presume that the device relies on the
vfio-core to block device access, whereas device implementations may
not require such if they're able to snapshot device state.  That might
also indicate that "freeze" is only an implementation specific
requirement.  Thanks,

Alex
Jason Gunthorpe Oct. 26, 2021, 11:43 p.m. UTC | #31
On Tue, Oct 26, 2021 at 01:50:46PM -0600, Alex Williamson wrote:
> On Tue, 26 Oct 2021 12:18:51 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> > 
> > > > This is also why I don't like it being so transparent as it is
> > > > something userspace needs to care about - especially if the HW cannot
> > > > support such a thing, if we intend to allow that.  
> > > 
> > > Userspace does need to care, but userspace's concern over this should
> > > not be able to compromise the platform and therefore making VF
> > > assignment more susceptible to fatal error conditions to comply with a
> > > migration uAPI is troublesome for me.  
> > 
> > It is an interesting scenario.
> > 
> > I think it points that we are not implementing this fully properly.
> > 
> > The !RUNNING state should be like your reset efforts.
> > 
> > All access to the MMIO memories from userspace should be revoked
> > during !RUNNING
> > 
> > All VMAs zap'd.
> > 
> > All IOMMU peer mappings invalidated.
> > 
> > The kernel should directly block userspace from causing a MMIO TLP
> > before the device driver goes to !RUNNING.
> > 
> > Then the question of what the device does at this edge is not
> > relevant as hostile userspace cannot trigger it.
> > 
> > The logical way to implement this is to key off running and
> > block/unblock MMIO access when !RUNNING.
> > 
> > To me this strongly suggests that the extra bit is the correct way
> > forward as the driver is much simpler to implement and understand if
> > RUNNING directly controls the availability of MMIO instead of having
> > an irregular case where !RUNNING still allows MMIO but only until a
> > pending_bytes read.
> > 
> > Given the complexity of this can we move ahead with the current
> > mlx5_vfio and Yishai&co can come with some followup proposal to split
> > the freeze/quiesce and block MMIO?
> 
> I know how much we want this driver in, but I'm surprised that you're
> advocating to cut-and-run with an implementation where we've identified
> a potentially significant gap with some hand waving that we'll resolve
> it later.

I don't view it as cut and run, but as making reasonable quanta of
progress with patch series of reviewable size and scope.

At a certain point we need to get the base level of functionality
that matches the currently defined ABI merged so we can talk about
where the ABI needs to go.

If you are uncomfortable about this from a uABI stability perspective
then mark the driver EXPERIMENTAL and do not merge any other migration
drivers until the two details are fully sorted out.

As far as the actual issue, if you hadn't just discovered it now
nobody would have known we have this gap - much like how the very
similar reset issue was present in VFIO for so many years until you
plugged it.

> Deciding at some point in the future to forcefully block device MMIO
> access from userspace when the device stops running is clearly a user
> visible change and therefore subject to the don't-break-userspace
> clause.  

I don't think so, this was done for reset retroactively after
all. Well behaved qemu should have silenced all MMIO touches as part
of the ABI contract anyhow.
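
As an illustration only (this is not QEMU code, and the vCPU-pausing helper
is a made-up placeholder), "silencing all MMIO touches" before clearing
_RUNNING could be as simple as:

#include <stdint.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>

extern void pause_all_vcpus_somehow(void);	/* hypothetical */

static int quiesce_then_stop(void *bar_map, size_t bar_len,
			     int dev_fd, off_t device_state_off)
{
	uint32_t state = VFIO_DEVICE_STATE_SAVING;	/* _RUNNING cleared */

	pause_all_vcpus_somehow();		/* no more guest MMIO */

	if (bar_map && munmap(bar_map, bar_len))	/* drop direct BAR mappings */
		return -1;

	/* only now clear _RUNNING via the migration region's device_state */
	if (pwrite(dev_fd, &state, sizeof(state), device_state_off) !=
	    sizeof(state))
		return -1;

	return 0;
}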

The "don't-break-userspace" is not an absolute prohibition, Linus has
been very clear this limitation is about direct, ideally demonstrable,
breakage to actually deployed software.

> That might also indicate that "freeze" is only an implementation
> specific requirement.  Thanks,

It doesn't matter if a theoretical device can exist that doesn't need
"freeze" - this device does, and so it is the lowest common
denominator for the uAPI contract and userspace must abide by the
restriction.

Further, I can think of no utility to exposing some non-freeze route
as an optional feature. If someone has a scenario where this is useful
they should come with a uAPI design and feature bit to describe how it
works and what it does. This driver will probably not ever set that
feature.

Jason
Alex Williamson Oct. 27, 2021, 7:05 p.m. UTC | #32
On Tue, 26 Oct 2021 20:43:00 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Oct 26, 2021 at 01:50:46PM -0600, Alex Williamson wrote:
> > On Tue, 26 Oct 2021 12:18:51 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> > >   
> > > > > This is also why I don't like it being so transparent as it is
> > > > > something userspace needs to care about - especially if the HW cannot
> > > > > support such a thing, if we intend to allow that.    
> > > > 
> > > > Userspace does need to care, but userspace's concern over this should
> > > > not be able to compromise the platform and therefore making VF
> > > > assignment more susceptible to fatal error conditions to comply with a
> > > > migration uAPI is troublesome for me.    
> > > 
> > > It is an interesting scenario.
> > > 
> > > I think it points that we are not implementing this fully properly.
> > > 
> > > The !RUNNING state should be like your reset efforts.
> > > 
> > > All access to the MMIO memories from userspace should be revoked
> > > during !RUNNING
> > > 
> > > All VMAs zap'd.
> > > 
> > > All IOMMU peer mappings invalidated.
> > > 
> > > The kernel should directly block userspace from causing a MMIO TLP
> > > before the device driver goes to !RUNNING.
> > > 
> > > Then the question of what the device does at this edge is not
> > > relevant as hostile userspace cannot trigger it.
> > > 
> > > The logical way to implement this is to key off running and
> > > block/unblock MMIO access when !RUNNING.
> > > 
> > > To me this strongly suggests that the extra bit is the correct way
> > > forward as the driver is much simpler to implement and understand if
> > > RUNNING directly controls the availability of MMIO instead of having
> > > an irregular case where !RUNNING still allows MMIO but only until a
> > > pending_bytes read.
> > > 
> > > Given the complexity of this can we move ahead with the current
> > > mlx5_vfio and Yishai&co can come with some followup proposal to split
> > > the freeze/quiesce and block MMIO?
> > 
> > I know how much we want this driver in, but I'm surprised that you're
> > advocating to cut-and-run with an implementation where we've identified
> > a potentially significant gap with some hand waving that we'll resolve
> > it later.  
> 
> I don't view it as cut and run, but making reasonable quanta of
> progress with patch series of reviewable size and scope.
> 
> At a certain point we need to get the base level of functionality,
> that matches the currently defined ABI merged so we can talk about
> where the ABI needs to go.
> 
> If you are uncomfortable about this from a uABI stability perspective
> then mark the driver EXPERIMENTAL and do not merge any other migration
> drivers until the two details are fully sorted out.
> 
> As far as the actual issue, if you hadn't just discovered it now
> nobody would have known we have this gap - much like how the very
> similar reset issue was present in VFIO for so many years until you
> plugged it.

But the fact that we did discover it is hugely important.  We've
identified that the potential use case is significantly limited and
that userspace doesn't have a good mechanism to determine when to
expose that limitation to the user.  We're tossing around solutions
that involve extensions, if not changes to the uAPI.  It's Wednesday of
rc7.

I feel like we've already been burned by making one of these
"reasonable quanta of progress" to accept and mark experimental
decisions with where we stand between defining the uAPI in the kernel
and accepting an experimental implementation in QEMU.  Now we have
multiple closed driver implementations (none of which are contributing
to this discussion), but thankfully we're not committed to supporting
them because we have no open implementations.  I think we could get away
with ripping up the uAPI if we really needed to.  

> > Deciding at some point in the future to forcefully block device MMIO
> > access from userspace when the device stops running is clearly a user
> > visible change and therefore subject to the don't-break-userspace
> > clause.    
> 
> I don't think so, this was done for reset retroactively after
> all. Well behaved qemu should have silenced all MMIO touches as part
> of the ABI contract anyhow.

That's not obvious to me and I think it conflates access to the device
and execution of the device.  If it's QEMU's responsibility to quiesce
access to the device anyway, why does the kernel need to impose this
restriction.  I'd think we'd generally only impose such a restriction
if the alternative allows the user to invoke bad behavior outside the
scope of their use of the device or consistency of the migration data.
It appears that any such behavior would be implementation specific here.

> The "don't-break-userspace" is not an absolute prohibition, Linus has
> been very clear this limitation is about direct, ideally demonstrable,
> breakage to actually deployed software.

And if we introduce an open driver that unblocks QEMU support to become
non-experimental, I think that's where we stand.

> > That might also indicate that "freeze" is only an implementation
> > specific requirement.  Thanks,  
> 
> It doesn't matter if a theoretical device can exist that doesn't need
> "freeze" - this device does, and so it is the lowest common
> denominator for the uAPI contract and userspace must abide by the
> restriction.

Sorry, "to the victor go the spoils" is not really how I strictly want
to define a uAPI contract with userspace.  If we're claiming that
userspace is responsible for quiescing devices and we're providing a
means for that to occur, and userspace is already responsible for
managing MMIO access, then the only reason the kernel would forcefully
impose such a restriction itself would be to protect the host and the
implementation of that would depend on whether this is expected to be a
universal or device specific limitation.  Thanks,

Alex
Jason Gunthorpe Oct. 27, 2021, 7:23 p.m. UTC | #33
On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:

> > As far as the actual issue, if you hadn't just discovered it now
> > nobody would have known we have this gap - much like how the very
> > similar reset issue was present in VFIO for so many years until you
> > plugged it.
> 
> But the fact that we did discover it is hugely important.  We've
> identified that the potential use case is significantly limited and
> that userspace doesn't have a good mechanism to determine when to
> expose that limitation to the user.  

Huh?

We've identified that, depending on device behavior, the kernel may
need to revoke MMIO access to protect itself from hostile userspace
triggering TLP Errors or something.

Well behaved userspace must already stop touching the MMIO on the
device when !RUNNING - I see no compelling argument against that
position.

We've been investigating how the mlx5 HW will behave in corner cases,
and currently it looks like mlx5 vfio will not generate error TLPs, or
corrupt the device itself due to MMIO operations when !RUNNING. So the
driver itself, as written, probably does not currently have a bug
here, or need changes.

> We're tossing around solutions that involve extensions, if not
> changes to the uAPI.  It's Wednesday of rc7.

The P2P issue is separate, and as I keep saying, unless you want to
block support for any HW that does not have freeze&quiesce, userspace
must be aware of this ability and it is logical to design it as an
extension from where we are now.

> I feel like we've already been burned by making one of these
> "reasonable quanta of progress" to accept and mark experimental
> decisions with where we stand between defining the uAPI in the kernel
> and accepting an experimental implementation in QEMU.  

I won't argue there..

> Now we have multiple closed driver implementations (none of which
> are contributing to this discussion), but thankfully we're not
> committed to supporting them because we have no open
> implementations.  I think we could get away with ripping up the uAPI
> if we really needed to.

Do we need to?

> > > Deciding at some point in the future to forcefully block device MMIO
> > > access from userspace when the device stops running is clearly a user
> > > visible change and therefore subject to the don't-break-userspace
> > > clause.    
> > 
> > I don't think so, this was done for reset retroactively after
> > all. Well behaved qemu should have silenced all MMIO touches as part
> > of the ABI contract anyhow.
> 
> That's not obvious to me and I think it conflates access to the device
> and execution of the device.  If it's QEMU's responsibility to quiesce
> access to the device anyway, why does the kernel need to impose this
> restriction.  I'd think we'd generally only impose such a restriction
> if the alternative allows the user to invoke bad behavior outside the
> scope of their use of the device or consistency of the migration data.
> It appears that any such behavior would be implementation specific here.

I think if an implementation has a problem, like error TLPs, then yes,
it must fence. The conservative specification of the uAPI is that
userspace should not allow MMIO when !RUNNING.

If we ever get any implementation that needs this to fence then we
should do it for all implementations just out of consistency.

> > The "don't-break-userspace" is not an absolute prohibition, Linus has
> > been very clear this limitation is about direct, ideally demonstrable,
> > breakage to actually deployed software.
> 
> And if we introduce an open driver that unblocks QEMU support to become
> non-experimental, I think that's where we stand.

Yes, if qemu becomes deployed, but our testing shows qemu support
needs a lot of work before it is deployable, so that doesn't seem to
be an immediate risk.

> > > That might also indicate that "freeze" is only an implementation
> > > specific requirement.  Thanks,  
> > 
> > It doesn't matter if a theoretical device can exist that doesn't need
> > "freeze" - this device does, and so it is the lowest common
> > denominator for the uAPI contract and userspace must abide by the
> > restriction.
> 
> Sorry, "to the victor go the spoils" is not really how I strictly want
> to define a uAPI contract with userspace.  

This is not the "victor go the spoils"; this is meeting the least
common denominator of HW we have today.

If some fictional HW can be more advanced and can snapshot not freeze,
that is great, but it doesn't change one bit that mlx5 cannot and will
not work that way. Since mlx5 must be supported, there is no choice
but to define the uAPI around its limitations.

snapshot devices are strictly a superset of freeze devices, they can
emulate freeze by doing snapshot at the freeze operation.

In all cases userspace should not touch the device when !RUNNING to
preserve generality to all implementations.

> If we're claiming that userspace is responsible for quiescing
> devices and we're providing a means for that to occur, and userspace
> is already responsible for managing MMIO access, then the only
> reason the kernel would forcefully impose such a restriction itself
> would be to protect the host and the implementation of that would
> depend on whether this is expected to be a universal or device
> specific limitation.  

I think the best way forward is to allow for revoke to happen if we
ever need it (by specification), and not implement it right now.

So, I am not left with a clear idea of what is still open that you see as
blocking. Can you summarize?

Jason
Cornelia Huck Oct. 28, 2021, 3:08 p.m. UTC | #34
On Wed, Oct 27 2021, Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:

>> We're tossing around solutions that involve extensions, if not
>> changes to the uAPI.  It's Wednesday of rc7.
>
> The P2P issue is separate, and as I keep saying, unless you want to
> block support for any HW that does not have freeze&quiesce userspace
> must be aware of this ability and it is logical to design it as an
> extension from where we are now.

I think the very fact that we're still discussing whether something
needs to be changed/documented or not already shows that this is nothing
that should go in right now. Actually, I'd already consider it too late
even if we agreed now; I would expect a change like this to get at least
two weeks in linux-next before the merge window.

>> > The "don't-break-userspace" is not an absolute prohibition, Linus has
>> > been very clear this limitation is about direct, ideally demonstrable,
>> > breakage to actually deployed software.
>> 
>> And if we introduce an open driver that unblocks QEMU support to become
>> non-experimental, I think that's where we stand.
>
> Yes, if qemu becomes deployed, but our testing shows qemu support
> needs a lot of work before it is deployable, so that doesn't seem to
> be an immediate risk.

Do you have any patches/problem reports you can share?

If you already identified that there is work to be done in QEMU, I think
that speaks even more for delaying this. What if we notice that uapi
changes are needed while fixing QEMU?
Alex Williamson Oct. 28, 2021, 3:30 p.m. UTC | #35
On Wed, 27 Oct 2021 16:23:45 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
> 
> > > As far as the actual issue, if you hadn't just discovered it now
> > > nobody would have known we have this gap - much like how the very
> > > similar reset issue was present in VFIO for so many years until you
> > > plugged it.  
> > 
> > But the fact that we did discover it is hugely important.  We've
> > identified that the potential use case is significantly limited and
> > that userspace doesn't have a good mechanism to determine when to
> > expose that limitation to the user.    
> 
> Huh?
> 
> We've identified that, depending on device behavior, the kernel may
> need to revoke MMIO access to protect itself from hostile userspace
> triggering TLP Errors or something.
> 
> Well behaved userspace must already stop touching the MMIO on the
> device when !RUNNING - I see no compelling argument against that
> position.

Not touching MMIO is not specified in our uAPI protocol, nor is it an
obvious assumption to me, nor is it sufficient to assume well behaved
userspace in the implementation of a kernel interface.

> We've been investigating how the mlx5 HW will behave in corner cases,
> and currently it looks like mlx5 vfio will not generate error TLPs, or
> corrupt the device itself due to MMIO operations when !RUNNING. So the
> driver itself, as written, probably does not currently have a bug
> here, or need changes.

Is this a system level observation or is it actually looking at the
bus?  An Unsupported Request on MMIO write won't even generate an AER
on some systems, but can trigger a fatal error on others.

That sounds like potentially good news, but either way we're still also
discussing a fundamental gap in the uAPI for quiescing multiple devices
in a coordinated way and how we actually define !_RUNNING.

> > We're tossing around solutions that involve extensions, if not
> > changes to the uAPI.  It's Wednesday of rc7.  
> 
> > The P2P issue is separate, and as I keep saying, unless you want to
> > block support for any HW that does not have freeze&quiesce userspace
> must be aware of this ability and it is logical to design it as an
> extension from where we are now.

Is this essentially suggesting that the uAPI be clarified to state
that the base implementation is only applicable to userspace contexts
with a single migratable vfio device instance?  Does that need to
preemptively include /dev/iommu generically, ie. anything that could
potentially have an IOMMU mapping to the device?

I agree that it would be easier to add a capability to expose
multi-device compatibility than to try to retrofit one to expose a
restriction.

> > I feel like we've already been burned by making one of these
> > "reasonable quanta of progress" to accept and mark experimental
> > decisions with where we stand between defining the uAPI in the kernel
> > and accepting an experimental implementation in QEMU.    
> 
> I won't argue there..
> 
> > Now we have multiple closed driver implementations (none of which
> > are contributing to this discussion), but thankfully we're not
> > committed to supporting them because we have no open
> > implementations.  I think we could get away with ripping up the uAPI
> > if we really needed to.  
> 
> Do we need to?

I'd prefer not.

> > > > Deciding at some point in the future to forcefully block device MMIO
> > > > access from userspace when the device stops running is clearly a user
> > > > visible change and therefore subject to the don't-break-userspace
> > > > clause.      
> > > 
> > > I don't think so, this was done for reset retroactively after
> > > all. Well behaved qemu should have silenced all MMIO touches as part
> > > of the ABI contract anyhow.  
> > 
> > That's not obvious to me and I think it conflates access to the device
> > and execution of the device.  If it's QEMU's responsibility to quiesce
> > access to the device anyway, why does the kernel need to impose this
> > restriction.  I'd think we'd generally only impose such a restriction
> > if the alternative allows the user to invoke bad behavior outside the
> > scope of their use of the device or consistency of the migration data.
> > It appears that any such behavior would be implementation specific here.  
> 
> I think if an implementation has a problem, like error TLPs, then yes,
> it must fence. The conservative specification of the uAPI is that
> userspace should not allow MMIO when !RUNNING.
> 
> If we ever get any implementation that needs this to fence then we
> should do it for all implementations just out of consistency.

Like I've indicated, this is not an obvious corollary of the !_RUNNING
state to me.  I'd tend more towards letting userspace do what they want
and only restrict as necessary to protect the host.  For example the
state of the device when !_RUNNING may be changed by external stimuli,
including MMIO and DMA accesses, but the device does not independently
advance state.

Also, I think we necessarily require config space read-access to
support migration, which begs the question specifically which regions,
if any, are restricted when !_RUNNING?  Could we get away with zapping
mmaps (sigbus on fault) but allowing r/w access?
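
A sketch of that "zap mmaps, SIGBUS on fault, keep r/w" idea, with
hypothetical driver names and fields, purely to illustrate the question:

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/vfio.h>

struct sketch_vf_dev {
	u32 device_state;
	unsigned long bar_pfn;
};

static vm_fault_t sketch_mmio_fault(struct vm_fault *vmf)
{
	struct sketch_vf_dev *sdev = vmf->vma->vm_private_data;

	/* Refuse to (re)establish MMIO mappings while the device is
	 * stopped; read()/write() on the region could still be allowed.
	 */
	if (!(sdev->device_state & VFIO_DEVICE_STATE_RUNNING))
		return VM_FAULT_SIGBUS;

	return vmf_insert_pfn(vmf->vma, vmf->address,
			      sdev->bar_pfn + vmf->pgoff);
}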

> > > The "don't-break-userspace" is not an absolute prohibition, Linus has
> > > been very clear this limitation is about direct, ideally demonstrable,
> > > breakage to actually deployed software.  
> > 
> > And if we introduce an open driver that unblocks QEMU support to become
> > non-experimental, I think that's where we stand.  
> 
> Yes, if qemu becomes deployed, but our testing shows qemu support
> needs a lot of work before it is deployable, so that doesn't seem to
> be an immediate risk.

Good news... I guess...  but do we know what other uAPI changes might
be lurking without completing that effort?

> > > > That might also indicate that "freeze" is only an implementation
> > > > specific requirement.  Thanks,    
> > > 
> > > It doesn't matter if a theoretical device can exist that doesn't need
> > > "freeze" - this device does, and so it is the lowest common
> > > denominator for the uAPI contract and userspace must abide by the
> > > restriction.  
> > 
> > Sorry, "to the victor go the spoils" is not really how I strictly want
> > to define a uAPI contract with userspace.    
> 
> This is not the "victor go the spoils" this is meeting the least
> common denominator of HW we have today.
>
> If some fictional HW can be more advanced and can snapshot not freeze,
> that is great, but it doesn't change one bit that mlx5 cannot and will
> not work that way. Since mlx5 must be supported, there is no choice
> but to define the uAPI around its limitations.

But it seems like you've found that mlx5 is resilient to these things
that you're also deeming necessary to restrict.

> snapshot devices are strictly a superset of freeze devices, they can
> emulate freeze by doing snapshot at the freeze operation.

True.

> In all cases userspace should not touch the device when !RUNNING to
> preserve generality to all implementations.

Not an obvious conclusion to me.

> > If we're claiming that userspace is responsible for quiescing
> > devices and we're providing a means for that to occur, and userspace
> > is already responsible for managing MMIO access, then the only
> > reason the kernel would forcefully impose such a restriction itself
> > would be to protect the host and the implementation of that would
> > depend on whether this is expected to be a universal or device
> > specific limitation.    
> 
> I think the best way forward is to allow for revoke to happen if we
> ever need it (by specification), and not implement it right now.
> 
> So, I am not left with a clear idea what is still open that you see as
> blocking. Can you summarize?

It seems we have numerous uAPI questions floating around, including
whether the base specification is limited to a single physical device
within the user's IOMMU context, what the !_RUNNING state actually
implies about the device state, expectations around userspace access
to device regions while in this state, who is responsible for
limiting such access, and uncertainty about what other uAPI changes are
necessary as QEMU support is stabilized.

Why should we rush a driver in just before the merge window and
potentially increase our experimental driver debt load rather than
continue to co-develop kernel and userspace drivers and maybe also
get input from the owners of the existing out-of-tree drivers?  Thanks,

Alex
Jason Gunthorpe Oct. 28, 2021, 11:47 p.m. UTC | #36
On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote:
> On Wed, 27 Oct 2021 16:23:45 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
> > 
> > > > As far as the actual issue, if you hadn't just discovered it now
> > > > nobody would have known we have this gap - much like how the very
> > > > similar reset issue was present in VFIO for so many years until you
> > > > plugged it.  
> > > 
> > > But the fact that we did discover it is hugely important.  We've
> > > identified that the potential use case is significantly limited and
> > > that userspace doesn't have a good mechanism to determine when to
> > > expose that limitation to the user.    
> > 
> > Huh?
> > 
> > We've identified that, depending on device behavior, the kernel may
> > need to revoke MMIO access to protect itself from hostile userspace
> > triggering TLP Errors or something.
> > 
> > Well behaved userspace must already stop touching the MMIO on the
> > device when !RUNNING - I see no compelling argument against that
> > position.
> 
> Not touching MMIO is not specified in our uAPI protocol,

To be frank, not much is specified in the uAPI comment, certainly not
a detailed meaning of RUNNING.

> nor is it an obvious assumption to me, nor is it sufficient to
> assume well behaved userspace in the implementation of a kernel
> interface.

I view two aspects to !RUNNING:

 1) the kernel must protect itself from hostile userspace. This means
    preventing loss of kernel or device integrity (ie no error TLPs, no
    crashing/corrupting the device/etc)

 2) userspace must follow the rules so that the migration is not
    corrupted. We want to set the rules in a way that gives the
    greatest freedom of HW implementation

Regarding 1, I think we all agree on this, and currently we believe
mlx5 is meeting this goal as-is.

Regarding 2, I think about possible implementations and come to the
conclusion that !RUNNING must mean no MMIO, for several major reasons:

- For whatever reason a poor device may become harmed by MMIO during
  !RUNNING and so we may someday need to revoke MMIO like we do for
  reset. This implies that segfault on MMIO during !RUNNING
  is an option we should keep open.

- A simple DMA queue device, kind of like the HNS driver, could
  implement migration without HW support. Transition to !RUNNING only
  needs to wait for the device to fully drain the DMA queue (see the
  sketch further below).

  An empty DMA queue with no MMIOs means a quiet and migration ready
  device.

  However, if further MMIOs poke at the device it may resume
  operating and issue DMAs, which would corrupt the migration.

- We cannot define what MMIO during !RUNNING should even do. What
  should a write do? What should a read return? The mlx5 version
  roughly discards the write and returns garbage on read. While this
  does not corrupt the migration, it is also not useful behavior to
  define.

In several of these cases I'm happy if broken userspace harms itself
and corrupts the migration. That does not impact the integrity of the
kernel, and is just buggy userspace.
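
A sketch of the simple DMA-queue case above, with all names hypothetical;
stopping amounts to "refuse new work, wait for in-flight DMA to drain":

#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/types.h>

struct sketch_queue_dev {
	bool accepting_work;
	atomic_t inflight;	/* DMAs posted but not yet completed */
};

static int sketch_queue_stop(struct sketch_queue_dev *qdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(1000);

	/* Stop accepting new work, then wait for in-flight DMA to drain. */
	qdev->accepting_work = false;
	while (atomic_read(&qdev->inflight)) {
		if (time_after(jiffies, timeout))
			return -ETIMEDOUT;	/* refuse to report !RUNNING */
		usleep_range(100, 200);
	}

	/*
	 * Queue empty and no further MMIO pokes allowed: the device is
	 * quiet and its state is stable, i.e. migration ready.
	 */
	return 0;
}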

> > We've been investigating how the mlx5 HW will behave in corner cases,
> > and currently it looks like mlx5 vfio will not generate error TLPs, or
> > corrupt the device itself due to MMIO operations when !RUNNING. So the
> > driver itself, as written, probably does not currently have a bug
> > here, or need changes.
> 
> This is a system level observation or is it actually looking at the
> bus?  An Unsupported Request on MMIO write won't even generate an AER
> > on some systems, but can trigger a fatal error on others.

At this point this information is a design analysis from the HW
people.

> > > We're tossing around solutions that involve extensions, if not
> > > changes to the uAPI.  It's Wednesday of rc7.  
> > 
> > > The P2P issue is separate, and as I keep saying, unless you want to
> > > block support for any HW that does not have freeze&quiesce userspace
> > must be aware of this ability and it is logical to design it as an
> > extension from where we are now.
> 
> Is this essentially suggesting that the uAPI be clarified to state
> that the base implementation is only applicable to userspace contexts
> with a single migratable vfio device instance?  

That is one way to look at it, yes. It is not just a uAPI limitation
but a HW limitation as the NDMA state does require direct HW support
to continue accepting MMIO/etc but not issue DMA. A simple DMA queue
device as I imagine above couldn't implement it.

> Does that need to preemptively include /dev/iommu generically,
> ie. anything that could potentially have an IOMMU mapping to the
> device?

Going back to the top, for #1 the kernel must protect its
integrity. So, like reset, if we have a driver where revoke is
required then the revoke must extend to /dev/iommu as well.

For #2 - it is up to userspace. If userspace plugs the device into
/dev/iommu and keeps operating it then the migration can be
corrupted. Buggy userspace.

> I agree that it would be easier to add a capability to expose
> multi-device compatibility than to try to retrofit one to expose a
> restriction.

Yes, me too. What we have here is a realization that the current
interface does not support P2P scenarios. There is a wide universe of
applications that don't need P2P.

The realization is that qemu has a bug in that it allows the VM to
execute P2P operations which are incompatible with my above definition
of !RUNNING. The result is the #2 case: migration corruption.

qemu should protect itself from a VM causing corruption of the
migration. Either by only supporting migration with a single VFIO
device, or directly blocking P2P scenarios using the IOMMU.
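
For example, qemu could refuse to even start a migration when more than one
VFIO device is present; a sketch using the existing migrate_add_blocker()
helper, where the caller and the device count are assumptions made for
illustration only:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "migration/blocker.h"

static Error *multi_vfio_blocker;

static void vfio_maybe_block_migration(int num_vfio_devices)
{
    if (num_vfio_devices > 1 && !multi_vfio_blocker) {
        error_setg(&multi_vfio_blocker,
                   "migration with multiple VFIO devices needs a quiesce "
                   "(NDMA-like) uAPI that does not exist yet");
        migrate_add_blocker(multi_vfio_blocker, &error_abort);
    }
}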

To support P2P will require new kernel support, capability and HW
support. Which, in concrete terms, means we need to write a new uAPI
spec, update the mlx5 vfio driver, implement qemu patches, and test
the full solution.

Right now we are focused on the non-P2P cases, which I think is a
reasonable starting limitation.

> Like I've indicated, this is not an obvious corollary of the !_RUNNING
> state to me.  I'd tend more towards letting userspace do what they want
> and only restrict as necessary to protect the host.  For example the
> state of the device when !_RUNNING may be changed by external stimuli,
> including MMIO and DMA accesses, but the device does not independently
> advance state.

As above this is mixing #1 and #2 - it is fine to allow the device to
do whatever as long as it doesn't harm the host - however that doesn't
define the conditions userspace must follow to have a successful
migration.

> Also, I think we necessarily require config space read-access to
> support migration, which begs the question specifically which regions,
> if any, are restricted when !_RUNNING?  Could we get away with zapping
> mmaps (sigbus on fault) but allowing r/w access?

Ideally we would define exactly what device operations are allowed
during !RUNNING such that the migration will be successful. Operations
outside that list should be considered things that could corrupt the
migration.

This list should be as narrow as possible to allow the broadest range
of HW designs.

> > Yes, if qemu becomes deployed, but our testing shows qemu support
> > needs a lot of work before it is deployable, so that doesn't seem to
> > be an immediate risk.
> 
> Good news... I guess...  but do we know what other uAPI changes might
> be lurking without completing that effort?

Well, I would say this patch series is approximately the mid point of
the project. We are about 60 patches into kernel changes at this
point. What is left is approximately:

 - fix bugs in qemu so single-device operation is robust
 - dirty page tracking using the system iommu (via iommufd I suppose?)
 - dirty page tracking using the device iommu
 - migration with P2P ongoing: uAPI spec, kernel implementation
   and qemu implementation

Then we might have a product..

I also know the mlx5 device was designed with knowledge of other
operating systems and our team believes the device interface meets all
needs.

So, is the uAPI OK? I'd say provisionally yes. It works within its
limitations and several vendors have implemented it, even if only two
are heading toward in-tree.

Is it clearly specified and does it cover all scenarios? No.

> > If some fictional HW can be more advanced and can snapshot not freeze,
> > that is great, but it doesn't change one bit that mlx5 cannot and will
> > not work that way. Since mlx5 must be supported, there is no choice
> > but to define the uAPI around its limitations.
> 
> But it seems like you've found that mlx5 is resilient to these things
> that you're also deeming necessary to restrict.

Here I am talking about freeze/quiesce as a HW design choice, not the
mmio stuff.
 
> > So, I am not left with a clear idea what is still open that you see as
> > blocking. Can you summarize?
> 
> It seems we have numerous uAPI questions floating around, including
> whether the base specification is limited to a single physical device
> within the user's IOMMU context, what the !_RUNNING state actually
> implies about the device state, expectations around userspace access
> to device regions while in this state, and who is responsible for
> limiting such access, and uncertainty what other uAPI changes are
> necessary as QEMU support is stabilized.

I think these questions have straightforward answers. I've tried to
explain my view above.

> Why should we rush a driver in just before the merge window and
> potentially increase our experimental driver debt load rather than
> continue to co-develop kernel and userspace drivers and maybe also
> get input from the owners of the existing out-of-tree drivers?  Thanks,

It is not a big deal to defer things to rc1, though merging a
leaf-driver that has been on-list over a month is certainly not
rushing either.

We are not here doing all this work because we want to co-develop
kernel and user space drivers out of tree for ages.

Why merge it? Because there is still lots of work to do, and making
progress on the next bits requires agreeing on the basic stuff
first!

So, let's have some actionable feedback on what you need to see for an
rc1 merge, please.

Since there are currently no unaddressed comments on the patches, I
assume you want to see more work done; please define it.

Thanks,
Jason
Jason Gunthorpe Oct. 29, 2021, 12:26 a.m. UTC | #37
On Thu, Oct 28, 2021 at 05:08:11PM +0200, Cornelia Huck wrote:

> that should go in right now. Actually, I'd already consider it too late
> even if we agreed now; I would expect a change like this to get at least
> two weeks in linux-next before the merge window.

Usually linux-next is about sorting out integration problems so we
have an orderly merge window. Nobody is going to test this code just
because it is in linux-next; it isn't mm or something with coverage
there.

> > Yes, if qemu becomes deployed, but our testing shows qemu support
> > needs a lot of work before it is deployable, so that doesn't seem to
> > be an immediate risk.
> 
> Do you have any patches/problem reports you can share?

Yishai has some stuff; he was doing failure injection testing and
other interesting things. I think we are hoping to start looking at
it.

> If you already identified that there is work to be done in QEMU, I think
> that speaks even more for delaying this. What if we notice that uapi
> changes are needed while fixing QEMU?

I don't think it is those kinds of bugs.

Jason
Cornelia Huck Oct. 29, 2021, 6:57 a.m. UTC | #38
On Thu, Oct 28 2021, Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote:
>> On Wed, 27 Oct 2021 16:23:45 -0300
>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>> 
>> > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
>> > 
>> > > > As far as the actual issue, if you hadn't just discovered it now
>> > > > nobody would have known we have this gap - much like how the very
>> > > > similar reset issue was present in VFIO for so many years until you
>> > > > plugged it.  
>> > > 
>> > > But the fact that we did discover it is hugely important.  We've
>> > > identified that the potential use case is significantly limited and
>> > > that userspace doesn't have a good mechanism to determine when to
>> > > expose that limitation to the user.    
>> > 
>> > Huh?
>> > 
>> > We've identified that, depending on device behavior, the kernel may
>> > need to revoke MMIO access to protect itself from hostile userspace
>> > triggering TLP Errors or something.
>> > 
>> > Well behaved userspace must already stop touching the MMIO on the
>> > device when !RUNNING - I see no compelling argument against that
>> > position.
>> 
>> Not touching MMIO is not specified in our uAPI protocol,
>
> To be frank, not much is specified in the uAPI comment, certainly not
> a detailed meaning of RUNNING.

Yes! And I think that means we need to improve that comment before the
first in-tree driver to use it is merged, just to make sure we all agree
on the protocol, and future drivers can rely on that understanding as
well.
Yishai Hadas Oct. 29, 2021, 7:35 a.m. UTC | #39
On 10/29/2021 3:26 AM, Jason Gunthorpe wrote:
> On Thu, Oct 28, 2021 at 05:08:11PM +0200, Cornelia Huck wrote:
>
>> that should go in right now. Actually, I'd already consider it too late
>> even if we agreed now; I would expect a change like this to get at least
>> two weeks in linux-next before the merge window.
> Usually linux-next is about sorting out integration problems so we
> have an orderly merge window. Nobody is going to test this code just
> because it is in linux-next, it isn't mm or something with coverage
> there.


Right, in addition, the series has been on-list over a month to let 
people review and comment.

V5 has no specific comments on it; I believe that we are at a good
point to move forward with it.

>>> Yes, if qemu becomes deployed, but our testing shows qemu support
>>> needs a lot of work before it is deployable, so that doesn't seem to
>>> be an immediate risk.
>> Do you have any patches/problem reports you can share?
> Yishai has some stuff, he was doing failure injection testing and
> other interesting things. I think we are hoping to start looking at
> it.


Correct, I encountered some segfaults (SF) of QEMU upon failure injection / error flows.

For example,

- Unbinding the VF then trying to run savevm.

- Moving the mlx5 device to the ERROR state: my expectation was to see some
recovery flow from QEMU, such as calling the RESET ioctl to let it run
again; however, it crashed.

- etc.

Yes, we have some plans to start looking at it.

>> If you already identified that there is work to be done in QEMU, I think
>> that speaks even more for delaying this. What if we notice that uapi
>> changes are needed while fixing QEMU?
> I don't think it is those kinds of bugs.

Right, it doesn't seem as if uAPI changes are required; we need to debug and
fix QEMU.

Yishai
Yishai Hadas Oct. 29, 2021, 7:48 a.m. UTC | #40
On 10/29/2021 9:57 AM, Cornelia Huck wrote:
> On Thu, Oct 28 2021, Jason Gunthorpe <jgg@nvidia.com> wrote:
>
>> On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote:
>>> On Wed, 27 Oct 2021 16:23:45 -0300
>>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>>
>>>> On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
>>>>
>>>>>> As far as the actual issue, if you hadn't just discovered it now
>>>>>> nobody would have known we have this gap - much like how the very
>>>>>> similar reset issue was present in VFIO for so many years until you
>>>>>> plugged it.
>>>>> But the fact that we did discover it is hugely important.  We've
>>>>> identified that the potential use case is significantly limited and
>>>>> that userspace doesn't have a good mechanism to determine when to
>>>>> expose that limitation to the user.
>>>> Huh?
>>>>
>>>> We've identified that, depending on device behavior, the kernel may
>>>> need to revoke MMIO access to protect itself from hostile userspace
>>>> triggering TLP Errors or something.
>>>>
>>>> Well behaved userspace must already stop touching the MMIO on the
>>>> device when !RUNNING - I see no compelling argument against that
>>>> position.
>>> Not touching MMIO is not specified in our uAPI protocol,
>> To be frank, not much is specified in the uAPI comment, certainly not
>> a detailed meaning of RUNNING.
> Yes! And I think that means we need to improve that comment before the
> first in-tree driver to use it is merged, just to make sure we all agree
> on the protocol, and future drivers can rely on that understanding as
> well.
>

This can be done by a follow-up patch as part of the RC cycles, once we
agree on the exact comment.

Alternatively,

We can come with V6 on Sunday if we can agree on the comment here soon.

In any case, we don't expect any code changes in mlx5 for now as a result of that.

Frankly, I don't believe that this should block the series from being 
merged.

Yishai
Shameerali Kolothum Thodi Oct. 29, 2021, 10:32 a.m. UTC | #41
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> Sent: 29 October 2021 00:48
> To: Alex Williamson <alex.williamson@redhat.com>
> Cc: Cornelia Huck <cohuck@redhat.com>; Yishai Hadas <yishaih@nvidia.com>;
> bhelgaas@google.com; saeedm@nvidia.com; linux-pci@vger.kernel.org;
> kvm@vger.kernel.org; netdev@vger.kernel.org; kuba@kernel.org;
> leonro@nvidia.com; kwankhede@nvidia.com; mgurtovoy@nvidia.com;
> maorg@nvidia.com; Dr. David Alan Gilbert <dgilbert@redhat.com>
> Subject: Re: [PATCH V2 mlx5-next 12/14] vfio/mlx5: Implement vfio_pci driver
> for mlx5 devices
> 
> On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote:
> > On Wed, 27 Oct 2021 16:23:45 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
> > >
> > > > > As far as the actual issue, if you hadn't just discovered it now
> > > > > nobody would have known we have this gap - much like how the very
> > > > > similar reset issue was present in VFIO for so many years until you
> > > > > plugged it.
> > > >
> > > > But the fact that we did discover it is hugely important.  We've
> > > > identified that the potential use case is significantly limited and
> > > > that userspace doesn't have a good mechanism to determine when to
> > > > expose that limitation to the user.
> > >
> > > Huh?
> > >
> > > We've identified that, depending on device behavior, the kernel may
> > > need to revoke MMIO access to protect itself from hostile userspace
> > > triggering TLP Errors or something.
> > >
> > > Well behaved userspace must already stop touching the MMIO on the
> > > device when !RUNNING - I see no compelling argument against that
> > > position.
> >
> > Not touching MMIO is not specified in our uAPI protocol,
> 
> To be frank, not much is specified in the uAPI comment, certainly not
> a detailed meaning of RUNNING.
> 
> > nor is it an obvious assumption to me, nor is it sufficient to
> > assume well behaved userspace in the implementation of a kernel
> > interface.
> 
> I view two aspects to !RUNNING:
> 
>  1) the kernel must protect itself from hostile userspace. This means
>     preventing loss of kernel or device integrity (ie no error TLPs, no
>     crashing/corrupting the device/etc)
> 
>  2) userspace must follow the rules so that the migration is not
>     corrupted. We want to set the rules in a way that gives the
>     greatest freedom of HW implementation
> 
> Regarding 1, I think we all agree on this, and currently we believe
> mlx5 is meeting this goal as-is.
> 
> Regarding 2, I think about possible implementations and come to the
> conclusion that !RUNNING must mean no MMIO. For several major reasons
> 
> - For whatever reason a poor device may become harmed by MMIO during
>   !RUNNING and so we may someday need to revoke MMIO like we do for
>   reset. This implies that segfault on MMIO during !RUNNING
>   is an option we should keep open.
> 
> - A simple DMA queue device, kind of like the HNS driver, could
>   implement migration without HW support. Transition to !RUNNING only
>   needs to wait for the device to fully drained the DMA queue.
> 
>   Any empty DMA queue with no MMIOs means a quiet and migration ready
>   device.
> 
>   However, if further MMIOs poke at the device it may resume
>   operating and issue DMAs, which would corrupt the migration.
 
Just trying to follow the thread here. In the above situation where MMIOs can be
further poked in the !RUNNING state, do you mean poking through P2P, or can the
Guest/QEMU still do that? I am just trying to see if the current uAPI fits well
within the use case where we don't have any P2P scenario and the assigned device
is behind the IOMMU.

Thanks,
Shameer 


> 
> - We cannot define what MMIO during !RUNNING should even do. What
>   should a write do? What should a read return? The mlx5 version is
>   roughly discard the write and return garbage on read. While this
>   does not corrupt the migration it is also not useful behavior to
>   define.
> 
> In several of these case I'm happy if broken userspace harms itself
> and corrupts the migration. That does not impact the integrity of the
> kernel, and is just buggy userspace.
> 
> > > We've been investigating how the mlx5 HW will behave in corner cases,
> > > and currently it looks like mlx5 vfio will not generate error TLPs, or
> > > corrupt the device itself due to MMIO operations when !RUNNING. So the
> > > driver itself, as written, probably does not currently have a bug
> > > here, or need changes.
> >
> > This is a system level observation or is it actually looking at the
> > bus?  An Unsupported Request on MMIO write won't even generate an AER
> > on some systems, but others can trigger a fatal error on others.
> 
> At this point this information is a design analysis from the HW
> people.
> 
> > > > We're tossing around solutions that involve extensions, if not
> > > > changes to the uAPI.  It's Wednesday of rc7.
> > >
> > > The P2P issue is seperate, and as I keep saying, unless you want to
> > > block support for any HW that does not have freeze&queice userspace
> > > must be aware of this ability and it is logical to design it as an
> > > extension from where we are now.
> >
> > Is this essentially suggesting that the uAPI be clarified to state
> > that the base implementation is only applicable to userspace contexts
> > with a single migratable vfio device instance?
> 
> That is one way to look at it, yes. It is not just a uAPI limitation
> but a HW limitation as the NDMA state does require direct HW support
> to continue accepting MMIO/etc but not issue DMA. A simple DMA queue
> device as I imagine above couldn't implement it.
> 
> > Does that need to preemptively include /dev/iommu generically,
> > ie. anything that could potentially have an IOMMU mapping to the
> > device?
> 
> Going back to the top, for #1 the kernel must protect its
> integrity. So, like reset, if we have a driver where revoke is
> required then the revoke must extend to /dev/iommu as well.
> 
> For #2 - it is up to userspace. If userspace plugs the device into
> /dev/iommu and keeps operating it then the migration can be
> corrupted. Buggy userspace.
> 
> > I agree that it would be easier to add a capability to expose
> > multi-device compatibility than to try to retrofit one to expose a
> > restriction.
> 
> Yes, me too. What we have here is a realization that the current
> interface does not support P2P scenarios. There is a wide universe of
> applications that don't need P2P.
> 
> The realization is that qemu has a bug in that it allows the VM to
> execute P2P operations which are incompatible with my above definition
> of !RUNNING. The result is the #2 case: migration corruption.
> 
> qemu should protect itself from a VM causing corruption of the
> migration. Either by only supporting migration with a single VFIO
> device, or directly blocking P2P scenarios using the IOMMU.
> 
> To support P2P will require new kernel support, capability and HW
> support. Which, in concrete terms, means we need to write a new uAPI
> spec, update the mlx5 vfio driver, implement qemu patches, and test
> the full solution.
> 
> Right now we are focused on the non-P2P cases, which I think is a
> reasonable starting limitation.
> 
> > Like I've indicated, this is not an obvious corollary of the !_RUNNING
> > state to me.  I'd tend more towards letting userspace do what they want
> > and only restrict as necessary to protect the host.  For example the
> > state of the device when !_RUNNING may be changed by external stimuli,
> > including MMIO and DMA accesses, but the device does not independently
> > advance state.
> 
> As above this is mixing #1 and #2 - it is fine to allow the device to
> do whatever as long as it doesn't harm the host - however that doesn't
> define the conditions userspace must follow to have a successful
> migration.
> 
> > Also, I think we necessarily require config space read-access to
> > support migration, which begs the question specifically which regions,
> > if any, are restricted when !_RUNNING?  Could we get away with zapping
> > mmaps (sigbus on fault) but allowing r/w access?
> 
> Ideally we would define exactly what device operations are allowed
> during !RUNNING such that the migration will be successful. Operations
> outside that list should be considered things that could corrupt the
> migration.
> 
> This list should be as narrow as possible to allow the broadest range
> of HW designs.
> 
> > > Yes, if qemu becomes deployed, but our testing shows qemu support
> > > needs a lot of work before it is deployable, so that doesn't seem to
> > > be an immediate risk.
> >
> > Good news... I guess...  but do we know what other uAPI changes might
> > be lurking without completing that effort?
> 
> Well, I would say this patch series is approximately the mid point of
> the project. We are about 60 patches into kernel changes at this
> point. What is left is approximately:
> 
>  - fix bugs in qemu so single-device operation is robust
>  - dirty page tracking using the system iommu (via iommufd I suppose?)
>  - dirty page tracking using the device iommu
>  - migration with P2P ongoing: uAPI spec, kernel implementation
>    and qemu implementation
> 
> Then we might have a product..
> 
> I also know the mlx5 device was designed with knowledge of other
> operating systems and our team believes the device interface meets all
> needs.
> 
> So, is the uAPI OK? I'd say provisionally yes. It works within its
> limitations and several vendors have implemented it, even if only two
> are heading toward in-tree.
> 
> Is it clearly specified and covers all scenarios? No..
> 
> > > If some fictional HW can be more advanced and can snapshot not freeze,
> > > that is great, but it doesn't change one bit that mlx5 cannot and will
> > > not work that way. Since mlx5 must be supported, there is no choice
> > > but to define the uAPI around its limitations.
> >
> > But it seems like you've found that mlx5 is resilient to these things
> > that you're also deeming necessary to restrict.
> 
> Here I am talking about freeze/quiesce as a HW design choice, not the
> mmio stuff.
> 
> > > So, I am not left with a clear idea what is still open that you see as
> > > blocking. Can you summarize?
> >
> > It seems we have numerous uAPI questions floating around, including
> > whether the base specification is limited to a single physical device
> > within the user's IOMMU context, what the !_RUNNING state actually
> > implies about the device state, expectations around userspace access
> > to device regions while in this state, and who is responsible for
> > limiting such access, and uncertainty what other uAPI changes are
> > necessary as QEMU support is stabilized.
> 
> I think these questions have straightforward answers. I've tried to
> explain my view above.
> 
> > Why should we rush a driver in just before the merge window and
> > potentially increase our experimental driver debt load rather than
> > continue to co-develop kernel and userspace drivers and maybe also
> > get input from the owners of the existing out-of-tree drivers?  Thanks,
> 
> It is not a big deal to defer things to rc1, though merging a
> leaf-driver that has been on-list over a month is certainly not
> rushing either.
> 
> We are not here doing all this work because we want to co-develop
> kernel and user space drivers out of tree for ages.
> 
> Why merge it? Because there is still lots of work to do, and making
> progress on the next bits requires agreeing on the basic stuff
> first!
> 
> So, let's have some actionable feedback on what you need to see for an
> rc1 merge, please.
> 
> Since there are currently no unaddressed comments on the patches, I
> assume you want to see more work done, please define it.
> 
> Thanks,
> Jason
Jason Gunthorpe Oct. 29, 2021, 12:15 p.m. UTC | #42
On Fri, Oct 29, 2021 at 10:32:03AM +0000, Shameerali Kolothum Thodi wrote:
> Just trying to follow the thread here. In the above situation where MMIOs can be
> further poked in the !RUNNING state, do you mean poking through P2P, or that the
> Guest/QEMU can still do that? I am just trying to see if the current uAPI fits well
> within the use case where we don't have any P2P scenario and the assigned dev
> is behind the IOMMU.

The way it is today, any VFIO user can poke MMIO while in the !RUNNING
state; no P2P needed, just write to the mmap.

Jason
Alex Williamson Oct. 29, 2021, 10:06 p.m. UTC | #43
On Thu, 28 Oct 2021 20:47:50 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote:
> > On Wed, 27 Oct 2021 16:23:45 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote:
> > >   
> > > > > As far as the actual issue, if you hadn't just discovered it now
> > > > > nobody would have known we have this gap - much like how the very
> > > > > similar reset issue was present in VFIO for so many years until you
> > > > > plugged it.    
> > > > 
> > > > But the fact that we did discover it is hugely important.  We've
> > > > identified that the potential use case is significantly limited and
> > > > that userspace doesn't have a good mechanism to determine when to
> > > > expose that limitation to the user.      
> > > 
> > > Huh?
> > > 
> > > We've identified that, depending on device behavior, the kernel may
> > > need to revoke MMIO access to protect itself from hostile userspace
> > > triggering TLP Errors or something.
> > > 
> > > Well behaved userspace must already stop touching the MMIO on the
> > > device when !RUNNING - I see no compelling argument against that
> > > position.  
> > 
> > Not touching MMIO is not specified in our uAPI protocol,  
> 
> To be frank, not much is specified in the uAPI comment, certainly not
> a detailed meaning of RUNNING.
> 
> > nor is it an obvious assumption to me, nor is it sufficient to
> > assume well behaved userspace in the implementation of a kernel
> > interface.  
> 
> I view two aspects to !RUNNING:
> 
>  1) the kernel must protect itself from hostile userspace. This means
>     preventing loss of kernel or device integrity (ie no error TLPs, no
>     crashing/corrupting the device/etc)
> 
>  2) userspace must follow the rules so that the migration is not
>     corrupted. We want to set the rules in a way that gives the
>     greatest freedom of HW implementation
> 
> Regarding 1, I think we all agree on this, and currently we believe
> mlx5 is meeting this goal as-is.
> 
> Regarding 2, I think about possible implementations and come to the
> conclusion that !RUNNING must mean no MMIO. For several major reasons
> 
> - For whatever reason a poor device may become harmed by MMIO during
>   !RUNNING and so we may someday need to revoke MMIO like we do for
>   reset. This implies that segfault on MMIO during !RUNNING
>   is an option we should keep open.
> 
> - A simple DMA queue device, kind of like the HNS driver, could
>   implement migration without HW support. Transition to !RUNNING only
>   needs to wait for the device to fully drain the DMA queue.
> 
>   An empty DMA queue with no MMIOs means a quiet and migration-ready
>   device.
> 
>   However, if further MMIOs poke at the device it may resume
>   operating and issue DMAs, which would corrupt the migration.
> 
> - We cannot define what MMIO during !RUNNING should even do. What
>   should a write do? What should a read return? The mlx5 version is
>   roughly discard the write and return garbage on read. While this
>   does not corrupt the migration it is also not useful behavior to
>   define.
> 
> In several of these case I'm happy if broken userspace harms itself
> and corrupts the migration. That does not impact the integrity of the
> kernel, and is just buggy userspace.
> 
> > > We've been investigating how the mlx5 HW will behave in corner cases,
> > > and currently it looks like mlx5 vfio will not generate error TLPs, or
> > > corrupt the device itself due to MMIO operations when !RUNNING. So the
> > > driver itself, as written, probably does not currently have a bug
> > > here, or need changes.  
> > 
> > Is this a system-level observation or is it actually looking at the
> > bus?  An Unsupported Request on MMIO write won't even generate an AER
> > on some systems, but can trigger a fatal error on others.
> 
> At this point this information is a design analysis from the HW
> people.
> 
> > > > We're tossing around solutions that involve extensions, if not
> > > > changes to the uAPI.  It's Wednesday of rc7.    
> > > 
> > > The P2P issue is separate, and as I keep saying, unless you want to
> > > block support for any HW that does not have freeze&quiesce, userspace
> > > must be aware of this ability and it is logical to design it as an
> > > extension from where we are now.  
> > 
> > Is this essentially suggesting that the uAPI be clarified to state
> > that the base implementation is only applicable to userspace contexts
> > with a single migratable vfio device instance?    
> 
> That is one way to look at it, yes. It is not just a uAPI limitation
> but a HW limitation as the NDMA state does require direct HW support
> to continue accepting MMIO/etc but not issue DMA. A simple DMA queue
> device as I imagine above couldn't implement it.
> 
> > Does that need to preemptively include /dev/iommu generically,
> > ie. anything that could potentially have an IOMMU mapping to the
> > device?  
> 
> Going back to the top, for #1 the kernel must protect its
> integrity. So, like reset, if we have a driver where revoke is
> required then the revoke must extend to /dev/iommu as well.
> 
> For #2 - it is up to userspace. If userspace plugs the device into
> /dev/iommu and keeps operating it then the migration can be
> corrupted. Buggy userspace.
> 
> > I agree that it would be easier to add a capability to expose
> > multi-device compatibility than to try to retrofit one to expose a
> > restriction.  
> 
> Yes, me too. What we have here is a realization that the current
> interface does not support P2P scenarios. There is a wide universe of
> applications that don't need P2P.
> 
> The realization is that qemu has a bug in that it allows the VM to
> execute P2P operations which are incompatible with my above definition
> of !RUNNING. The result is the #2 case: migration corruption.
> 
> qemu should protect itself from a VM causing corruption of the
> migration. Either by only supporting migration with a single VFIO
> device, or directly blocking P2P scenarios using the IOMMU.
>
> To support P2P will require new kernel support, capability and HW
> support. Which, in concrete terms, means we need to write a new uAPI
> spec, update the mlx5 vfio driver, implement qemu patches, and test
> the full solution.
> 
> Right now we are focused on the non-P2P cases, which I think is a
> reasonable starting limitation.

It's a reasonable starting point iff we know that we need to support
devices that cannot themselves support a quiescent state.  Otherwise it
would make sense to go back to work on the uAPI because I suspect the
implications to userspace are not going to be as simple as "oops, can't
migrate, there are two devices."  As you say, there's a universe of
devices that run together that don't care about p2p and QEMU will be
pressured to support migration of those configurations.

QEMU currently already supports p2p between assigned devices, which
means the user or management tool is going to need to choose whether
they prefer migration or legacy p2p compatibility.  The DMA mapping
aspects of this get complicated.  Ideally we could skip p2p DMA
mappings for any individual device that doesn't support this future
quiescent state, but we don't do mappings based on individual devices,
we do them based on the container.  There's a least common denominator
among the devices in a container, but we also support hotplug and we
can't suddenly decide to tear down p2p mappings because a new device is
added.  That probably means that we can't automatically enable both p2p
and migration, even if we initially only have devices that support this
new quiescent migration state.  We'd need to design the QEMU options
so we can't have a subset of devices that want p2p and another set that
want migration.  If we ever want both migration and p2p, QEMU would
need to reject any device that can't comply.

If we're moving forward without an independent quiescent state, the
uAPI should include clarification specifying that it's the user's
responsibility to independently prevent DMA to devices while the device
is !_RUNNING.  Whether that's because DMA to the device is always
blocked at the IOMMU or a configuration restriction to prevent
additional devices that could generate DMA is left to the user.

Support would need to be added in the kernel to tear down these
mappings if there were a scenario where the user failing to prevent DMA
to the device could cause misbehavior classified under your #1 above,
harm to host.

> > Like I've indicated, this is not an obvious corollary of the !_RUNNING
> > state to me.  I'd tend more towards letting userspace do what they want
> > and only restrict as necessary to protect the host.  For example the
> > state of the device when !_RUNNING may be changed by external stimuli,
> > including MMIO and DMA accesses, but the device does not independently
> > advance state.  
> 
> As above this is mixing #1 and #2 - it is fine to allow the device to
> do whatever as long as it doesn't harm the host - however that doesn't
> define the conditions userspace must follow to have a successful
> migration.
> 
> > Also, I think we necessarily require config space read-access to
> > support migration, which begs the question specifically which regions,
> > if any, are restricted when !_RUNNING?  Could we get away with zapping
> > mmaps (sigbus on fault) but allowing r/w access?  
> 
> Ideally we would define exactly what device operations are allowed
> during !RUNNING such that the migration will be successful. Operations
> outside that list should be considered things that could corrupt the
> migration.
> 
> This list should be as narrow as possible to allow the broadest range
> of HW designs.

So we need a proposal of that list.

> > > Yes, if qemu becomes deployed, but our testing shows qemu support
> > > needs a lot of work before it is deployable, so that doesn't seem to
> > > be an immediate risk.  
> > 
> > Good news... I guess...  but do we know what other uAPI changes might
> > be lurking without completing that effort?  
> 
> Well, I would say this patch series is approximately the mid point of
> the project. We are about 60 patches into kernel changes at this
> point. What is left is approximately:
> 
>  - fix bugs in qemu so single-device operation is robust
>  - dirty page tracking using the system iommu (via iommufd I suppose?)
>  - dirty page tracking using the device iommu
>  - migration with P2P ongoing: uAPI spec, kernel implementation
>    and qemu implementation
> 
> Then we might have a product..
> 
> I also know the mlx5 device was designed with knowledge of other
> operating systems and our team believes the device interface meets all
> needs.
> 
> So, is the uAPI OK? I'd say provisionally yes. It works within its
> limitations and several vendors have implemented it, even if only two
> are heading toward in-tree.
> 
> Is it clearly specified and covers all scenarios? No..
> 
> > > If some fictional HW can be more advanced and can snapshot not freeze,
> > > that is great, but it doesn't change one bit that mlx5 cannot and will
> > > not work that way. Since mlx5 must be supported, there is no choice
> > > but to define the uAPI around its limitations.  
> > 
> > But it seems like you've found that mlx5 is resilient to these things
> > that you're also deeming necessary to restrict.  
> 
> Here I am talking about freeze/quiesce as a HW design choice, not the
> mmio stuff.
>  
> > > So, I am not left with a clear idea what is still open that you see as
> > > blocking. Can you summarize?  
> > 
> > It seems we have numerous uAPI questions floating around, including
> > whether the base specification is limited to a single physical device
> > within the user's IOMMU context, what the !_RUNNING state actually
> > implies about the device state, expectations around userspace access
> > to device regions while in this state, and who is responsible for
> > limiting such access, and uncertainty what other uAPI changes are
> > necessary as QEMU support is stabilized.  
> 
> I think these questions have straightforward answers. I've tried to
> explain my view above.
> 
> > Why should we rush a driver in just before the merge window and
> > potentially increase our experimental driver debt load rather than
> > continue to co-develop kernel and userspace drivers and maybe also
> > get input from the owners of the existing out-of-tree drivers?  Thanks,  
> 
> It is not a big deal to defer things to rc1, though merging a
> leaf-driver that has been on-list over a month is certainly not
> rushing either.

If "on-list over a month" is meant to imply that it's well vetted, it
does not.  That's a pretty quick time frame given the uAPI viability
discussions that it's generated.
 
> We are not here doing all this work because we want to co-develop
> kernel and user space drivers out of tree for ages.
> 
> Why merge it? Because there is still lots of work to do, and making
> progress on the next bits requires agreeing on the basic stuff
> first!
> 
> So, let's have some actionable feedback on what you need to see for an
> rc1 merge, please.
> 
> Since there are currently no unaddressed comments on the patches, I
> assume you want to see more work done, please define it.

I'm tending to agree that there's value in moving forward, but there's
a lot we're defining here that's not in the uAPI, so I'd like to see
those things become formalized.

I think this version is defining that it's the user's responsibility to
prevent external DMA to devices while in the !_RUNNING state.  This
resolves the condition that we have no means to coordinate quiescing
multiple devices.  We shouldn't necessarily prescribe a single device
solution in the uAPI if the same can be equally achieved through
configuration of DMA mapping.

I was almost on board with blocking MMIO, especially as p2p is just DMA
mapping of MMIO, but what about MSI-X?  During _RESUME we must access
the MSI-X vector table via the SET_IRQS ioctl to configure interrupts.
Is this exempt because the access occurs in the host?  In any case, it
requires that the device cannot be absolutely static while !_RUNNING.
Does (_RESUMING) have different rules than (_SAVING)?

So I'm still unclear how the uAPI needs to be updated relative to
region access.  We need that list of what the user is allowed to
access, which seems like minimally config space and MSI-X table space,
but are these implicitly known for vfio-pci devices or do we need
region flags or capabilities to describe?  We can't generally know the
disposition of device specific regions relative to this access.  Thanks,

Alex
Jason Gunthorpe Nov. 1, 2021, 5:25 p.m. UTC | #44
On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote:

> > Right now we are focused on the non-P2P cases, which I think is a
> > reasonable starting limitation.
> 
> It's a reasonable starting point iff we know that we need to support
> devices that cannot themselves support a quiescent state.  Otherwise it
> would make sense to go back to work on the uAPI because I suspect the
> implications to userspace are not going to be as simple as "oops, can't
> migrate, there are two devices."  As you say, there's a universe of
> devices that run together that don't care about p2p and QEMU will be
> pressured to support migration of those configurations.

I agree with this, but I also think what I saw in the proposed hns
driver suggests its HW cannot do quiescent; if so, this is the first
counter-example to the notion that it is a universal ability.

hns people: Can you put your device in a state where it is operating,
able to accept and respond to MMIO, and yet guarentees it generates no
DMA transactions?

> want migration.  If we ever want both migration and p2p, QEMU would
> need to reject any device that can't comply.

Yes, it looks like a complicated task on the qemu side to get this
resolved

> > It is not a big deal to defer things to rc1, though merging a
> > leaf-driver that has been on-list over a month is certainly not
> > rushing either.
> 
> If "on-list over a month" is meant to imply that it's well vetted, it
> does not.  That's a pretty quick time frame given the uAPI viability
> discussions that it's generated.

I only said rushed :)
  
> I'm tending to agree that there's value in moving forward, but there's
> a lot we're defining here that's not in the uAPI, so I'd like to see
> those things become formalized.

OK, let's come up with a documentation patch then to define !RUNNING as
I outlined and start to come up with the allowed list of actions.

I think I would like to have a proper rst file for documenting the
uapi as well.

> I think this version is defining that it's the user's responsibility to
> prevent external DMA to devices while in the !_RUNNING state.  This
> resolves the condition that we have no means to coordinate quiescing
> multiple devices.  We shouldn't necessarily prescribe a single device
> solution in the uAPI if the same can be equally achieved through
> configuration of DMA mapping.

I'm not sure what this means?
 
> I was almost on board with blocking MMIO, especially as p2p is just DMA
> mapping of MMIO, but what about MSI-X?  During _RESUME we must access
> the MSI-X vector table via the SET_IRQS ioctl to configure interrupts.
> Is this exempt because the access occurs in the host?  

s/in the host/in the kernel/ SET_IRQS is a kernel ioctl that uses the
core MSIX code to do the mmio, so it would not be impacted by MMIO
zap.
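
To make that concrete, a minimal sketch of the userspace side, using
only the existing VFIO_DEVICE_SET_IRQS uAPI (device_fd and the eventfds
are assumed to come from the caller, error handling omitted):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hand nvec MSI-X vectors to the kernel; the kernel's MSI-X core then
 * programs the vector table, so zapping the BAR mmaps does not break
 * this path. */
static int enable_msix(int device_fd, const int *eventfds, unsigned int nvec)
{
        struct vfio_irq_set *set;
        size_t sz = sizeof(*set) + nvec * sizeof(int);
        int ret;

        set = calloc(1, sz);
        if (!set)
                return -1;
        set->argsz = sz;
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        set->start = 0;
        set->count = nvec;
        memcpy(set->data, eventfds, nvec * sizeof(int));

        ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
        free(set);
        return ret;
}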

Looks like you've already marked these points with the
vfio_pci_memory_lock_and_enable(), so a zap for migration would have
to be a little different than a zap for reset.

Still, this is something that needs clear definition, I would expect
the SET_IRQS to happen after resuming clears but before running sets
to give maximum HW flexibility and symmetry with saving.

And we should really define clearly what a device is supposed to do
with the interrupt vectors during migration. Obviously there are races
here.

> In any case, it requires that the device cannot be absolutely static
> while !_RUNNING.  Does (_RESUMING) have different rules than
> (_SAVING)?

I'd prefer to avoid all device touches during both resuming and
saving, and do them during !RUNNING

> So I'm still unclear how the uAPI needs to be updated relative to
> region access.  We need that list of what the user is allowed to
> access, which seems like minimally config space and MSI-X table space,
> but are these implicitly known for vfio-pci devices or do we need
> region flags or capabilities to describe?  We can't generally know the
> disposition of device specific regions relative to this access.  Thanks,

I'd prefer to be general and have the spec forbid
everything. Specifying things like VFIO_DEVICE_SET_IRQS covers all the
bus types.

Other bus types should get spec updates before any other bus type
driver is merged.

Thanks,
Jason
Shameerali Kolothum Thodi Nov. 2, 2021, 11:19 a.m. UTC | #45
> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@nvidia.com]
> Sent: 01 November 2021 17:25
> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: Cornelia Huck <cohuck@redhat.com>; Yishai Hadas <yishaih@nvidia.com>;
> bhelgaas@google.com; saeedm@nvidia.com; linux-pci@vger.kernel.org;
> kvm@vger.kernel.org; netdev@vger.kernel.org; kuba@kernel.org;
> leonro@nvidia.com; kwankhede@nvidia.com; mgurtovoy@nvidia.com;
> maorg@nvidia.com; Dr. David Alan Gilbert <dgilbert@redhat.com>
> Subject: Re: [PATCH V2 mlx5-next 12/14] vfio/mlx5: Implement vfio_pci driver
> for mlx5 devices
> 
> On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote:
> 
> > > Right now we are focused on the non-P2P cases, which I think is a
> > > reasonable starting limitation.
> >
> > It's a reasonable starting point iff we know that we need to support
> > devices that cannot themselves support a quiescent state.  Otherwise it
> > would make sense to go back to work on the uAPI because I suspect the
> > implications to userspace are not going to be as simple as "oops, can't
> > migrate, there are two devices."  As you say, there's a universe of
> > devices that run together that don't care about p2p and QEMU will be
> > pressured to support migration of those configurations.
> 
> I agree with this, but I also think what I saw in the proposed hns
> driver suggests its HW cannot do quiescent; if so, this is the first
> counter-example to the notion that it is a universal ability.
> 
> hns people: Can you put your device in a state where it is operating,
> able to accept and respond to MMIO, and yet guarantees it generates no
> DMA transactions?

AFAIK, I am afraid we cannot guarantee that with our current implementation.
At present, in the !RUNNING state we put the device into a PAUSE state, so it
will complete the current request and keep the remaining ones in the queue. But it can
still receive a new request, which will trigger a PAUSE state exit and resume
operation.

So I guess, it is possible to corrupt the migration if user space misbehaves.

Thanks,
Shameer
Alex Williamson Nov. 2, 2021, 2:56 p.m. UTC | #46
On Mon, 1 Nov 2021 14:25:06 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote:
> 
> > > Right now we are focused on the non-P2P cases, which I think is a
> > > reasonable starting limitation.  
> > 
> > It's a reasonable starting point iff we know that we need to support
> > devices that cannot themselves support a quiescent state.  Otherwise it
> > would make sense to go back to work on the uAPI because I suspect the
> > implications to userspace are not going to be as simple as "oops, can't
> > migrate, there are two devices."  As you say, there's a universe of
> > devices that run together that don't care about p2p and QEMU will be
> > pressured to support migration of those configurations.  
> 
> I agree with this, but I also think what I saw in the proposed hns
> driver suggests its HW cannot do quiescent; if so, this is the first
> counter-example to the notion that it is a universal ability.
> 
> hns people: Can you put your device in a state where it is operating,
> able to accept and respond to MMIO, and yet guarantees it generates no
> DMA transactions?
> 
> > want migration.  If we ever want both migration and p2p, QEMU would
> > need to reject any device that can't comply.  
> 
> Yes, it looks like a complicated task on the qemu side to get this
> resolved
> 
> > > It is not a big deal to defer things to rc1, though merging a
> > > leaf-driver that has been on-list over a month is certainly not
> > > rushing either.  
> > 
> > If "on-list over a month" is meant to imply that it's well vetted, it
> > does not.  That's a pretty quick time frame given the uAPI viability
> > discussions that it's generated.  
> 
> I only said rushed :)

To push forward despite unresolved questions is rushing, regardless of
how long it's been on-list.

> > I'm tending to agree that there's value in moving forward, but there's
> > a lot we're defining here that's not in the uAPI, so I'd like to see
> > those things become formalized.  
> 
> Ok, lets come up with a documentation patch then to define !RUNNING as
> I outlined and start to come up with the allowed list of actions..
> 
> I think I would like to have a proper rst file for documenting the
> uapi as well.
> 
> > I think this version is defining that it's the user's responsibility to
> > prevent external DMA to devices while in the !_RUNNING state.  This
> > resolves the condition that we have no means to coordinate quiescing
> > multiple devices.  We shouldn't necessarily prescribe a single device
> > solution in the uAPI if the same can be equally achieved through
> > configuration of DMA mapping.  
> 
> I'm not sure what this means?

I'm just trying to avoid the uAPI calling out a single-device
restriction if there are other ways that userspace can quiesce external
DMA outside of the uAPI, such as by limiting p2p DMA mappings at the
IOMMU, ie. define the userspace requirements but don't dictate a
specific solution.
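
As a rough sketch of that option, using only the existing type1 IOMMU
uAPI; "is_guest_ram" is a hypothetical predicate standing in for
whatever logic (QEMU's memory listener, say) classifies the region:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Only guest RAM gets an IOMMU mapping; BAR mmaps of other devices are
 * skipped, so no p2p DMA can reach the migratable device. */
static int map_for_device_dma(int container_fd, void *vaddr, uint64_t iova,
                              uint64_t size, int is_guest_ram)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (uintptr_t)vaddr,
                .iova = iova,
                .size = size,
        };

        if (!is_guest_ram)
                return 0;

        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}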

> > I was almost on board with blocking MMIO, especially as p2p is just DMA
> > mapping of MMIO, but what about MSI-X?  During _RESUME we must access
> > the MSI-X vector table via the SET_IRQS ioctl to configure interrupts.
> > Is this exempt because the access occurs in the host?    
> 
> s/in the host/in the kernel/ SET_IRQS is a kernel ioctl that uses the
> core MSIX code to do the mmio, so it would not be impacted by MMIO
> zap.

AIUI, "zap" is just the proposed userspace manifestation that the
device cannot accept MMIO writes while !_RUNNING, but these writes must
occur in that state.

> Looks like you've already marked these points with the
> vfio_pci_memory_lock_and_enable(), so a zap for migration would have
> to be a little different than a zap for reset.
> 
> Still, this is something that needs clear definition, I would expect
> the SET_IRQS to happen after resuming clears but before running sets
> to give maximum HW flexibility and symmetry with saving.

There's no requirement that the device enters a null state (!_RESUMING
| !_SAVING | !_RUNNING); the uAPI even specifies the flows as _RESUMING
transitioning to _RUNNING.  There's no point at which we can do
SET_IRQS other than in the _RESUMING state.  Generally SET_IRQS
ioctls are coordinated with the guest driver based on actions to the
device, we can't be mucking with IRQs while the device is presumed
running and already generating interrupt conditions.

> And we should really define clearly what a device is supposed to do
> with the interrupt vectors during migration. Obviously there are races
> here.

The device should not be generating interrupts while !_RUNNING, pending
interrupts should be held until the device is _RUNNING.  To me this
means the sequence must be that INTx/MSI/MSI-X are restored while in
the !_RUNNING state.

> > In any case, it requires that the device cannot be absolutely static
> > while !_RUNNING.  Does (_RESUMING) have different rules than
> > (_SAVING)?  
> 
> I'd prefer to avoid all device touches during both resuming and
> saving, and do them during !RUNNING

There's no such state required by the uAPI.

> > So I'm still unclear how the uAPI needs to be updated relative to
> > region access.  We need that list of what the user is allowed to
> > access, which seems like minimally config space and MSI-X table space,
> > but are these implicitly known for vfio-pci devices or do we need
> > region flags or capabilities to describe?  We can't generally know the
> > disposition of device specific regions relative to this access.  Thanks,  
> 
> I'd prefer to be general and have the spec forbid
> everything. Specifying things like VFIO_DEVICE_SET_IRQS covers all the
> bus types.

AFAICT, SET_IRQS while _RESUMING is a requirement, as is some degree of
access to config space.  It seems you're proposing a new required null
state which is contradictory to the existing uAPI.  Thanks,

Alex
Jason Gunthorpe Nov. 2, 2021, 3:54 p.m. UTC | #47
On Tue, Nov 02, 2021 at 08:56:51AM -0600, Alex Williamson wrote:

> > Still, this is something that needs clear definition, I would expect
> > the SET_IRQS to happen after resuming clears but before running sets
> > to give maximum HW flexibility and symmetry with saving.
> 
> There's no requirement that the device enters a null state (!_RESUMING
> | !_SAVING | !_RUNNING); the uAPI even specifies the flows as _RESUMING
> transitioning to _RUNNING.  

If the device saves the MSI-X state inside its migration data (as
apparently would be convenient for other OSs) then when RESUMING
clears and the migration data is de-serialized the device will
overwrite the MSI-X data.

Since Linux as an OS wants to control the MSI-X it needs to load it
after RESUMING, but before RUNNING.

> There's no point at which we can do SET_IRQS other than in the
> _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> guest driver based on actions to the device, we can't be mucking
> with IRQs while the device is presumed running and already
> generating interrupt conditions.

We need to do it in state 000

ie resume should go 

  000 -> 100 -> 000 -> 001

With SET_IRQS and any other fixing done during the 2nd 000, after the
migration data has been loaded into the device.
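
A sketch of that proposed sequence against the v1 migration region
layout, assuming mig_offset is the region's file offset from
VFIO_DEVICE_GET_REGION_INFO and that the data load and SET_IRQS steps
are filled in by the caller:

#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Write device_state at the head of the migration region. */
static void set_state(int device_fd, off_t mig_offset, uint32_t state)
{
        pwrite(device_fd, &state, sizeof(state), mig_offset +
               offsetof(struct vfio_device_migration_info, device_state));
}

static void resume_sequence(int device_fd, off_t mig_offset)
{
        set_state(device_fd, mig_offset, VFIO_DEVICE_STATE_RESUMING); /* 100 */
        /* ... write the saved migration stream into the data window ... */
        set_state(device_fd, mig_offset, 0);                          /* 000 */
        /* ... SET_IRQS and other external fixups happen here ... */
        set_state(device_fd, mig_offset, VFIO_DEVICE_STATE_RUNNING);  /* 001 */
}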

> > And we should really define clearly what a device is supposed to do
> > with the interrupt vectors during migration. Obviously there are races
> > here.
> 
> The device should not be generating interrupts while !_RUNNING, pending
> interrupts should be held until the device is _RUNNING.  To me this
> means the sequence must be that INTx/MSI/MSI-X are restored while in
> the !_RUNNING state.

Yes

> > > In any case, it requires that the device cannot be absolutely static
> > > while !_RUNNING.  Does (_RESUMING) have different rules than
> > > (_SAVING)?  
> > 
> > I'd prefer to avoid all device touches during both resuming and
> > saving, and do them during !RUNNING
> 
> There's no such state required by the uAPI.

The uAPI comment does not define when to do the SET_IRQS, it seems
this has been missed.

We really should fix it, unless you feel strongly that the
experimental API in qemu shouldn't be changed.

Jason
Alex Williamson Nov. 2, 2021, 4:22 p.m. UTC | #48
On Tue, 2 Nov 2021 12:54:20 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 02, 2021 at 08:56:51AM -0600, Alex Williamson wrote:
> 
> > > Still, this is something that needs clear definition, I would expect
> > > the SET_IRQS to happen after resuming clears but before running sets
> > > to give maximum HW flexibility and symmetry with saving.  
> > 
> > There's no requirement that the device enters a null state (!_RESUMING
> > | !_SAVING | !_RUNNING); the uAPI even specifies the flows as _RESUMING
> > transitioning to _RUNNING.    
> 
> If the device saves the MSI-X state inside its migration data (as
> apparently would be convenient for other OSs) then when RESUMING
> clears and the migration data is de-serialized the device will
> overwrite the MSI-X data.
> 
> Since Linux as an OS wants to control the MSI-X it needs to load it
> after RESUMING, but before RUNNING.

This is not how it works today, QEMU enables MSI/X based on the config
space information, which is also outside of the device migration stream.

> > There's no point at which we can do SET_IRQS other than in the
> > _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> > guest driver based on actions to the device, we can't be mucking
> > with IRQs while the device is presumed running and already
> > generating interrupt conditions.  
> 
> We need to do it in state 000
> 
> ie resume should go 
> 
>   000 -> 100 -> 000 -> 001
> 
> With SET_IRQS and any other fixing done during the 2nd 000, after the
> migration data has been loaded into the device.

Again, this is not how QEMU works today.

> > > And we should really define clearly what a device is supposed to do
> > > with the interrupt vectors during migration. Obviously there are races
> > > here.  
> > 
> > The device should not be generating interrupts while !_RUNNING, pending
> > interrupts should be held until the device is _RUNNING.  To me this
> > means the sequence must be that INTx/MSI/MSI-X are restored while in
> > the !_RUNNING state.  
> 
> Yes

Except I expect them to be restored while _RESUMING is set.

> > > > In any case, it requires that the device cannot be absolutely static
> > > > while !_RUNNING.  Does (_RESUMING) have different rules than
> > > > (_SAVING)?    
> > > 
> > > I'd prefer to avoid all device touches during both resuming and
> > > saving, and do them during !RUNNING  
> > 
> > There's no such state required by the uAPI.  
> 
> The uAPI comment does not define when to do the SET_IRQS, it seems
> this has been missed.
> 
> We really should fix it, unless you feel strongly that the
> experimental API in qemu shouldn't be changed.

I think the QEMU implementation fills in some details of how the uAPI
is expected to work.  MSI/X is expected to be restored while _RESUMING
based on the config space of the device, there is no intermediate step
between _RESUMING and _RUNNING.  Introducing such a requirement
precludes the option of a post-copy implementation of (_RESUMING |
_RUNNING).  Thanks,

Alex
Jason Gunthorpe Nov. 2, 2021, 4:36 p.m. UTC | #49
On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote:

> > > There's no point at which we can do SET_IRQS other than in the
> > > _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> > > guest driver based on actions to the device, we can't be mucking
> > > with IRQs while the device is presumed running and already
> > > generating interrupt conditions.  
> > 
> > We need to do it in state 000
> > 
> > ie resume should go 
> > 
> >   000 -> 100 -> 000 -> 001
> > 
> > With SET_IRQS and any other fixing done during the 2nd 000, after the
> > migration data has been loaded into the device.
> 
> Again, this is not how QEMU works today.

I know, I think it is a poor choice to carve out certain changes to
the device that must be preserved across loading the migration state.

> > The uAPI comment does not define when to do the SET_IRQS, it seems
> > this has been missed.
> > 
> > We really should fix it, unless you feel strongly that the
> > experimental API in qemu shouldn't be changed.
> 
> I think the QEMU implementation fills in some details of how the uAPI
> is expected to work.

Well, we already know QEMU has problems, like the P2P thing. Is this a
bug, or a preferred limitation as designed?

> MSI/X is expected to be restored while _RESUMING based on the
> config space of the device, there is no intermediate step between
> _RESUMING and _RUNNING.  Introducing such a requirement precludes
> the option of a post-copy implementation of (_RESUMING | _RUNNING).

Not precluded, a new state bit would be required to implement some
future post-copy.

0000 -> 1100 -> 1000 -> 1001 -> 0001

Instead of overloading the meaning of RUNNING.

I think this is cleaner anyhow.

(though I don't know how we'd structure the save side to get two
bitstreams)

Thanks,
Jason
Alex Williamson Nov. 2, 2021, 8:15 p.m. UTC | #50
On Tue, 2 Nov 2021 13:36:10 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote:
> 
> > > > There's no point at which we can do SET_IRQS other than in the
> > > > _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> > > > guest driver based on actions to the device, we can't be mucking
> > > > with IRQs while the device is presumed running and already
> > > > generating interrupt conditions.    
> > > 
> > > We need to do it in state 000
> > > 
> > > ie resume should go 
> > > 
> > >   000 -> 100 -> 000 -> 001
> > > 
> > > With SET_IRQS and any other fixing done during the 2nd 000, after the
> > > migration data has been loaded into the device.  
> > 
> > Again, this is not how QEMU works today.  
> 
> I know, I think it is a poor choice to carve out certain changes to
> the device that must be preserved across loading the migration state.
> 
> > > The uAPI comment does not define when to do the SET_IRQS, it seems
> > > this has been missed.
> > > 
> > > We really should fix it, unless you feel strongly that the
> > > experimental API in qemu shouldn't be changed.  
> > 
> > I think the QEMU implementation fills in some details of how the uAPI
> > is expected to work.  
> 
> Well, we already know QEMU has problems, like the P2P thing. Is this a
> bug, or a preferred limitation as designed?
> 
> > MSI/X is expected to be restored while _RESUMING based on the
> > config space of the device, there is no intermediate step between
> > _RESUMING and _RUNNING.  Introducing such a requirement precludes
> > the option of a post-copy implementation of (_RESUMING | _RUNNING).  
> 
> Not precluded, a new state bit would be required to implement some
> future post-copy.
> 
> 0000 -> 1100 -> 1000 -> 1001 -> 0001
> 
> Instead of overloading the meaning of RUNNING.
> 
> I think this is cleaner anyhow.
> 
> (though I don't know how we'd structure the save side to get two
> bitstreams)

The way this is supposed to work is that the device migration stream
contains the device internal state.  QEMU is then responsible for
restoring the external state of the device, including the DMA mappings,
interrupts, and config space.  It's not possible for the migration
driver to reestablish these things.  So there is a necessary division
of device state between QEMU and the migration driver.

If we don't think the uAPI includes the necessary states, doesn't
sufficiently define the states, and we're not following the existing
QEMU implementation as the guide for the intentions of the uAPI spec,
then what exactly is the proposed mlx5 migration driver implementing
and why would we even consider including it at this point?  Thanks,

Alex
Jason Gunthorpe Nov. 3, 2021, 12:09 p.m. UTC | #51
On Tue, Nov 02, 2021 at 02:15:47PM -0600, Alex Williamson wrote:
> On Tue, 2 Nov 2021 13:36:10 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote:
> > 
> > > > > There's no point at which we can do SET_IRQS other than in the
> > > > > _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> > > > > guest driver based on actions to the device, we can't be mucking
> > > > > with IRQs while the device is presumed running and already
> > > > > generating interrupt conditions.    
> > > > 
> > > > We need to do it in state 000
> > > > 
> > > > ie resume should go 
> > > > 
> > > >   000 -> 100 -> 000 -> 001
> > > > 
> > > > With SET_IRQS and any other fixing done during the 2nd 000, after the
> > > > migration data has been loaded into the device.  
> > > 
> > > Again, this is not how QEMU works today.  
> > 
> > I know, I think it is a poor choice to carve out certain changes to
> > the device that must be preserved across loading the migration state.
> > 
> > > > The uAPI comment does not define when to do the SET_IRQS, it seems
> > > > this has been missed.
> > > > 
> > > > We really should fix it, unless you feel strongly that the
> > > > experimental API in qemu shouldn't be changed.  
> > > 
> > > I think the QEMU implementation fills in some details of how the uAPI
> > > is expected to work.  
> > 
> > Well, we already know QEMU has problems, like the P2P thing. Is this a
> > bug, or a preferred limitation as designed?
> > 
> > > MSI/X is expected to be restored while _RESUMING based on the
> > > config space of the device, there is no intermediate step between
> > > _RESUMING and _RUNNING.  Introducing such a requirement precludes
> > > the option of a post-copy implementation of (_RESUMING | _RUNNING).  
> > 
> > Not precluded, a new state bit would be required to implement some
> > future post-copy.
> > 
> > 0000 -> 1100 -> 1000 -> 1001 -> 0001
> > 
> > Instead of overloading the meaning of RUNNING.
> > 
> > I think this is cleaner anyhow.
> > 
> > (though I don't know how we'd structure the save side to get two
> > bitstreams)
> 
> The way this is supposed to work is that the device migration stream
> contains the device internal state.  QEMU is then responsible for
> restoring the external state of the device, including the DMA mappings,
> interrupts, and config space.  It's not possible for the migration
> driver to reestablish these things.  So there is a necessary division
> of device state between QEMU and the migration driver.
> 
> If we don't think the uAPI includes the necessary states, doesn't
> sufficiently define the states, and we're not following the existing
> QEMU implementation as the guide for the intentions of the uAPI spec,
> then what exactly is the proposed mlx5 migration driver implementing
> > and why would we even consider including it at this point?  Thanks,

The driver posting follows the undocumented behaviors of QEMU

You asked that these all be documented, evaluated and formalized as a
precondition to merging it.

So, what do you want? A critical review of the uAPI design or
documenting whatever behavior is coded in qemu?

A critical review suggests SET_IRQS should not happen during RESUMING,
but mlx5 today doesn't care either way.

Jason
Alex Williamson Nov. 3, 2021, 3:44 p.m. UTC | #52
On Wed, 3 Nov 2021 09:09:55 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 02, 2021 at 02:15:47PM -0600, Alex Williamson wrote:
> > On Tue, 2 Nov 2021 13:36:10 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote:
> > >   
> > > > > > There's no point at which we can do SET_IRQS other than in the
> > > > > > _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> > > > > > guest driver based on actions to the device, we can't be mucking
> > > > > > with IRQs while the device is presumed running and already
> > > > > > generating interrupt conditions.      
> > > > > 
> > > > > We need to do it in state 000
> > > > > 
> > > > > ie resume should go 
> > > > > 
> > > > >   000 -> 100 -> 000 -> 001
> > > > > 
> > > > > With SET_IRQS and any other fixing done during the 2nd 000, after the
> > > > > migration data has been loaded into the device.    
> > > > 
> > > > Again, this is not how QEMU works today.    
> > > 
> > > I know, I think it is a poor choice to carve out certain changes to
> > > the device that must be preserved across loading the migration state.
> > >   
> > > > > The uAPI comment does not define when to do the SET_IRQS, it seems
> > > > > this has been missed.
> > > > > 
> > > > > We really should fix it, unless you feel strongly that the
> > > > > experimental API in qemu shouldn't be changed.    
> > > > 
> > > > I think the QEMU implementation fills in some details of how the uAPI
> > > > is expected to work.    
> > > 
> > > Well, we already know QEMU has problems, like the P2P thing. Is this a
> > > bug, or a preferred limitation as designed?
> > >   
> > > > MSI/X is expected to be restored while _RESUMING based on the
> > > > config space of the device, there is no intermediate step between
> > > > _RESUMING and _RUNNING.  Introducing such a requirement precludes
> > > > the option of a post-copy implementation of (_RESUMING | _RUNNING).    
> > > 
> > > Not precluded, a new state bit would be required to implement some
> > > future post-copy.
> > > 
> > > 0000 -> 1100 -> 1000 -> 1001 -> 0001
> > > 
> > > Instead of overloading the meaning of RUNNING.
> > > 
> > > I think this is cleaner anyhow.
> > > 
> > > (though I don't know how we'd structure the save side to get two
> > > bitstreams)  
> > 
> > The way this is supposed to work is that the device migration stream
> > contains the device internal state.  QEMU is then responsible for
> > restoring the external state of the device, including the DMA mappings,
> > interrupts, and config space.  It's not possible for the migration
> > driver to reestablish these things.  So there is a necessary division
> > of device state between QEMU and the migration driver.
> > 
> > If we don't think the uAPI includes the necessary states, doesn't
> > sufficiently define the states, and we're not following the existing
> > QEMU implementation as the guide for the intentions of the uAPI spec,
> > then what exactly is the proposed mlx5 migration driver implementing
> > and why would we even consider including it at this point?  Thanks,
> 
> The driver posting follows the undocumented behaviors of QEMU

In one email I read that QEMU clearly should not be performing SET_IRQS
while the device is _RESUMING (which it does) and we need to require an
interim state before the device becomes _RUNNING to poke at the device
(which QEMU doesn't do and the uAPI doesn't require), and the next I
read that we should proceed with some useful quanta of work despite
that we clearly don't intend to retain much of the protocol of the
current uAPI long term...

> You asked that these all be documented, evaluated and formalized as a
> precondition to merging it.
> 
> So, what do you want? A critical review of the uAPI design or
> documenting whatever behavior is coded in qemu?

Too much is in flux and we're only getting breadcrumbs of the changes
to come.  It's becoming more evident that we're likely to sufficiently
modify the uAPI to the point where I'd probably suggest a new "v2"
subtype for the region.

> A critical review suggests SET_IRQS should not happen during RESUMING,
> but mlx5 today doesn't care either way.

But if it can't happen during _RESUMING and once the device is
_RUNNING it's too late, then we're demanding an interim state that is
not required by the existing protocol.  We're redefining that existing
operations on the device while in _RESUMING cannot occur in that device
state.  That's more than uAPI clarification.  Thanks,

Alex
Jason Gunthorpe Nov. 3, 2021, 4:10 p.m. UTC | #53
On Wed, Nov 03, 2021 at 09:44:09AM -0600, Alex Williamson wrote:

> In one email I read that QEMU clearly should not be performing SET_IRQS
> while the device is _RESUMING (which it does) and we need to require an
> interim state before the device becomes _RUNNING to poke at the device
> (which QEMU doesn't do and the uAPI doesn't require), and the next I
> read that we should proceed with some useful quanta of work despite
> that we clearly don't intend to retain much of the protocol of the
> current uAPI long term...

mlx5 implements the protocol as is today, in a way that is compatible
with today's qemu. Qemu has various problems like the P2P issue we
talked about, but it is something working.

If you want to do a full re-review of the protocol and make changes,
then fine, let's do that, but everything should be on the table, and
changing qemu shouldn't be a blocker.

In one email you are saying we need to document and decide things
as a pre-condition to move the driver forward, and then in the next
email you say whatever qemu does is the specification, and can't
change it.

Part of this messy discussion is my fault as I've been a little
unclear in mixing my "community view" of how the protocol should be
designed to maximize future HW support and then switching to topics
that have direct relevance to mlx5 itself.

I want to see devices like hns be supportable and, from experience,
I'm very skeptical about placing HW design restrictions into a
uAPI. So I don't like those things.

However, mlx5's HW is robust and more functional than hns, and doesn't
care which way things are decided.

> Too much is in flux and we're only getting breadcrumbs of the
> changes to come.

We have no intention to go in and change the uapi after merging beyond
solving the P2P issue.

Since we now have confirmation that hns cannot do P2P, I see no issue
with keeping the current design as the non-p2p baseline that hns will
implement, with the P2P upgrade designed separately.

> It's becoming more evident that we're likely to sufficiently modify
> the uAPI to the point where I'd probably suggest a new "v2" subtype
> for the region.

I don't think this is evident. It is really your/the community's choice
what to do in VFIO.

If vfio sticks with the uAPI "as is" then it places additional
requirements on future HW designs.

If you want to relax these requirements before stabilizing the uAPI,
then we need to make those changes now.

It is your decision. I don't know of any upcoming HW designs that have
a problem with any of the choices.

Thanks,
Jason
Alex Williamson Nov. 3, 2021, 6:04 p.m. UTC | #54
On Wed, 3 Nov 2021 13:10:19 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Nov 03, 2021 at 09:44:09AM -0600, Alex Williamson wrote:
> 
> > In one email I read that QEMU clearly should not be performing SET_IRQS
> > while the device is _RESUMING (which it does) and we need to require an
> > interim state before the device becomes _RUNNING to poke at the device
> > (which QEMU doesn't do and the uAPI doesn't require), and the next I
> > read that we should proceed with some useful quanta of work despite
> > that we clearly don't intend to retain much of the protocol of the
> > current uAPI long term...  
> 
> mlx5 implements the protocol as is today, in a way that is compatible
> with today's qemu. Qemu has various problems like the P2P issue we
> talked about, but it is something working.
> 
> If you want to do a full re-review of the protocol and make changes,
> then fine, let's do that, but everything should be on the table, and
> changing qemu shouldn't be a blocker.

I don't think changing QEMU is a blocker, but QEMU should be seen as
the closest thing we currently have to a reference user implementation
against the uAPI and therefore may define de facto behaviors that are
not sufficiently clear in the uAPI.  So if we see issues with the QEMU
implementation, that's a reflection on gaps and disagreements in the
uAPI itself.  If we think we need new device states and protocols to
handle the issues being raised, we need plans to incrementally add
those to the uAPI, otherwise we should halt and reevaluate the existing
uAPI for a full overhaul.

We agreed that it's easier to add a feature than a restriction in a
uAPI, so how do we resolve that some future device may require a new
state in order to apply the SET_IRQS configuration?  Existing userspace
would fail with such a device.

> In one email you are are saying we need to document and decide things
> as a pre-condition to move the driver forward, and then in the next
> email you say whatever qemu does is the specification, and can't
> change it.

I don't think I ever said we can't change it.  I'm being presented with
new information, new requirements, new protocols that existing QEMU
code does not follow.  We can change QEMU, but as I noted before we're
getting dangerously close to having a formal, non-experimental user
while we're poking holes in the uAPI and we need to consider how the
uAPI extends to fill those holes and remains backwards compatible to
the current implementation.

> Part of this messy discussion is my fault as I've been a little
> unclear in mixing my "community view" of how the protocol should be
> designed to maximize future HW support and then switching to topics
> that have direct relevance to mlx5 itself.

Better sooner than later to evaluate the limitations and compatibility
issues against what we think is reasonable hardware behavior with
respect to migration states and transitions.

> I want to see devices like hns be supportable and, from experience,
> I'm very skeptical about placing HW design restrictions into a
> uAPI. So I don't like those things.
> 
> However, mlx5's HW is robust and more functional than hns, and doesn't
> care which way things are decided.

Regardless, the issues are already out on the table.  We want migration
for mlx5, but we also want it to be as reasonably close to what we
think can support any device designed for this use case.  You seem to
have far more visibility into that than I do.

> > Too much is in flux and we're only getting breadcrumbs of the
> > changes to come.  
> 
> We have no intention to go in and change the uapi after merging beyond
> solving the P2P issue.

Then I'm confused where we're at with the notion that we shouldn't be
calling SET_IRQS while in the _RESUMING state.

> Since we now have confirmation that hns cannot do P2P, I see no issue
> with keeping the current design as the non-p2p baseline that hns will
> implement, with the P2P upgrade designed separately.
> 
> > It's becoming more evident that we're likely to sufficiently modify
> > the uAPI to the point where I'd probably suggest a new "v2" subtype
> > for the region.  
> 
> I don't think this is evident. It is really your/community choice what
> to do in VFIO.
> 
> If vfio sticks with the uAPI "as is" then it places additional
> requirements on future HW designs.
> 
> If you want to relax these requirements before stabilizing the uAPI,
> then we need to make those changes now.
> 
> It is your decision. I don't know of any upcoming HW designs that have
> a problem with any of the choices.

If we're going to move forward with the existing uAPI, then we're going
to need to start factoring compatibility into our discussions of
missing states and protocols.  For example, requiring that the device
is "quiesced" when the _RUNNING bit is cleared and "frozen" when
pending_bytes is read has certain compatibility advantages versus
defining a new state bit.  Likewise, it might be fair to define that
userspace should not touch device MMIO during _RESUMING until after the
last bit of the device migration stream has been written, and then it's
free to touch MMIO before transitioning directly to the _RUNNING state.

IOW, we at least need to entertain methods to achieve the
clarifications we're trying for within the existing uAPI rather than
toss out new device states and protocols at every turn for the sake of
API purity.  The rate at which we're proposing new states and required
transitions without a plan for the uAPI is not where I want to be for
adding the driver that could lock us in to a supported uAPI.  Thanks,

Alex
Cornelia Huck Nov. 4, 2021, 11:19 a.m. UTC | #55
So, I doubt that I'm the only person trying to follow this discussion
who has lost the overview about issues and possible solutions here. I
think it would be a good idea to summarize what has been brought up so
far outside of this thread.

To that effect, I've created an etherpad at
https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started
filling it with some points. It would be great if others could fill in
the blanks so that everyone has a chance to see what is on the table so
far, so that we can hopefully discuss this on-list and come up with
something that works.
Jason Gunthorpe Nov. 5, 2021, 1:24 p.m. UTC | #56
On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote:

> We agreed that it's easier to add a feature than a restriction in a
> uAPI, so how do we resolve that some future device may require a new
> state in order to apply the SET_IRQS configuration?

I would say don't support those devices. If there is even a hint that
they could maybe exist then we should fix it now. Once the uapi is set
and documented we should expect device makers to consider it when
building their devices.

As for SET_IRQs, I have been looking at making documentation and I
don't like the way the documentation has to be written because of
this.

What I see as understandable, clear documentation is:

 - SAVING set - no device touches allowed beyond migration operations
   and reset via XX
   Must be set with !RUNNING
 - RESUMING set - same as SAVING
 - RUNNING cleared - limited device touches in this list: SET_IRQs, XX
   config, XX.
   Device may assume no touches outside the above. (ie no MMIO)
   Implies NDMA
 - NDMA set - full device touches
   Device may not issue DMA or interrupts (??)
   Device may not dirty pages
 - RUNNING set - full functionality
 * In no state may a device generate an error TLP, device
   hang/integrity failure or kernel integrity failure, no matter
   what userspace does.
   The device is permitted to corrupt the migration/VM or SEGV
   userspace if userspace doesn't follow the rules.

(we are trying to figure out what the XX's are right now, would
appreciate any help)

This is something I think we could expect a HW engineering team to
follow and implement in devices. It doesn't complicate things.
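
(For reference, the device_state bits this maps onto as they exist in
uapi/vfio.h today, plus a purely illustrative value for the NDMA bit
being discussed here; nothing beyond the first four is merged:)

  /* existing, from include/uapi/linux/vfio.h */
  #define VFIO_DEVICE_STATE_STOP      (0)
  #define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
  #define VFIO_DEVICE_STATE_SAVING    (1 << 1)
  #define VFIO_DEVICE_STATE_RESUMING  (1 << 2)

  /* hypothetical, no value has been allocated anywhere yet */
  #define VFIO_DEVICE_STATE_NDMA      (1 << 3)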

Overall, at this moment, I would prioritize documentation clarity over
strict compatibility with qemu, because people have to follow this
documentation and make their devices long into the future. If the
documentation is convoluted for compatibility reasons HW people are
more likely to get it wrong. When HW people get it wrong they are more
likely to ask for "quirks" in the uAPI to fix their mistakes.

The pending_bytes P2P idea is also quite complicated to document as
now we have to describe an HW state not in terms of a NDMA control
bit, but in terms of a bunch of implicit operations in a protocol. Not
so nice.

So, here is what I propose. Let us work on some documentation and come
up with the sort of HW centric docs like above and we can then decide
if we want to make the qemu changes it will imply, or not. We'll
include the P2P stuff, as we see it, so it shows a whole picture.

I think that will help everyone participate fully in the discussion.

> If we're going to move forward with the existing uAPI, then we're going
> to need to start factoring compatibility into our discussions of
> missing states and protocols.  For example, requiring that the device
> is "quiesced" when the _RUNNING bit is cleared and "frozen" when
> pending_bytes is read has certain compatibility advantages versus
> defining a new state bit. 

Not entirely, to support P2P going from RESUMING directly to RUNNING
is not possible. There must be an in between state that all devices
reach before they go to RUNNING. It seems P2P cannot be bolted into
the existing qemu flow with a kernel only change?

> clarifications we're trying for within the existing uAPI rather than
> toss out new device states and protocols at every turn for the sake of
> API purity.  The rate at which we're proposing new states and required
> transitions without a plan for the uAPI is not where I want to be for
> adding the driver that could lock us in to a supported uAPI.  Thanks,

Well, to be fair, the other cases where I suggested new states were when you
asked about features we don't have at all today (like post-copy). I
think adding new states is a very reasonable way to approach adding
new features. As long as new features can be supported with new states
we have a forward compatibility story.

Thanks,
Jason
Alex Williamson Nov. 5, 2021, 3:31 p.m. UTC | #57
On Fri, 5 Nov 2021 10:24:04 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote:
> 
> > We agreed that it's easier to add a feature than a restriction in a
> > uAPI, so how do we resolve that some future device may require a new
> > state in order to apply the SET_IRQS configuration?  
> 
> I would say don't support those devices. If there is even a hint that
> they could maybe exist then we should fix it now. Once the uapi is set
> and documented we should expect device makers to consider it when
> building their devices.
> 
> As for SET_IRQs, I have been looking at making documentation and I
> don't like the way the documentation has to be written because of
> this.
> 
> What I see as an understandable, clear, documentation is:
> 
>  - SAVING set - no device touches allowed beyond migration operations
>    and reset via XX

I'd suggest defining reset via ioctl only.

>    Must be set with !RUNNING

Not sure what this means.  Pre-copy requires SAVING and RUNNING
together, is this only suggesting that to get the final device state we
need to do so in a !RUNNING state?

>  - RESUMING set - same as SAVING

I take it then that we're defining a new protocol if we can't do
SET_IRQS here.

>  - RUNNING cleared - limited device touches in this list: SET_IRQs, XX
>    config, XX.
>    Device may assume no touches outside the above. (ie no MMIO)
>    Implies NDMA

SET_IRQS is MMIO, is the distinction userspace vs kernel?

>  - NDMA set - full device touches
>    Device may not issue DMA or interrupts (??)
>    Device may not dirty pages

Is this achievable?  We can't bound the time where incoming DMA is
possible, devices don't have infinite buffers.

>  - RUNNING set - full functionality
>  * In no state may a device generate an error TLP, device
>    hang/integrity failure or kernel integrity failure, no matter
>    what userspace does.
>    The device is permitted to corrupt the migration/VM or SEGV
>    userspace if userspace doesn't follow the rules.
> 
> (we are trying to figure out what the XX's are right now, would
> appreciate any help)
> 
> This is something I think we could expect a HW engineering team to
> follow and implement in devices. It doesn't complicate things.
> 
> Overall, at this moment, I would prioritize documentation clarity over
> strict compatibility with qemu, because people have to follow this
> documentation and make their devices long into the future. If the
> documentation is convoluted for compatibility reasons HW people are
> more likely to get it wrong. When HW people get it wrong they are more
> likely to ask for "quirks" in the uAPI to fix their mistakes.

I might still suggest a v2 migration sub-type, we'll just immediately
deprecate the original as we have no users and QEMU would modify all
support to find only the new sub-type as code is updated.  "v1" never
really materialized, but we can avoid future confusion if it's never
produced by in-tree drivers and never consumed by mainstream userspace.

> The pending_bytes P2P idea is also quite complicated to document as
> now we have to describe an HW state not in terms of a NDMA control
> bit, but in terms of a bunch of implicit operations in a protocol. Not
> so nice.
> 
> So, here is what I propose. Let us work on some documentation and come
> up with the sort of HW centric docs like above and we can then decide
> if we want to make the qemu changes it will imply, or not. We'll
> include the P2P stuff, as we see it, so it shows a whole picture.
> 
> I think that will help everyone participate fully in the discussion.

Good plan.

> > If we're going to move forward with the existing uAPI, then we're going
> > to need to start factoring compatibility into our discussions of
> > missing states and protocols.  For example, requiring that the device
> > is "quiesced" when the _RUNNING bit is cleared and "frozen" when
> > pending_bytes is read has certain compatibility advantages versus
> > defining a new state bit.   
> 
> Not entirely, to support P2P going from RESUMING directly to RUNNING
> is not possible. There must be an in between state that all devices
> reach before they go to RUNNING. It seems P2P cannot be bolted into
> the existing qemu flow with a kernel only change?

Perhaps, yes.

> > clarifications we're trying for within the existing uAPI rather than
> > toss out new device states and protocols at every turn for the sake of
> > API purity.  The rate at which we're proposing new states and required
> > transitions without a plan for the uAPI is not where I want to be for
> > adding the driver that could lock us in to a supported uAPI.  Thanks,  
> 
> Well, to be fair, the other cases where I suggested new states were when you
> asked about features we don't have at all today (like post-copy). I
> think adding new states is a very reasonable way to approach adding
> new features. As long as new features can be supported with new states
> we have a forward compatibility story.

That has a viable upgrade path, I'm onboard with that.  A device
imposing that it can't do SET_IRQS while RESUMING, when we have no required
state in between RESUMING and RUNNING, is the sort of issue that I'm
going to get hung up on.  I take it from the above that you're building
that state transition requirement into the uAPI now.  Thanks,

Alex
Cornelia Huck Nov. 5, 2021, 4:53 p.m. UTC | #58
On Thu, Nov 04 2021, Cornelia Huck <cohuck@redhat.com> wrote:

> So, I doubt that I'm the only person trying to follow this discussion
> who has lost the overview about issues and possible solutions here. I
> think it would be a good idea to summarize what has been brought up so
> far outside of this thread.
>
> To that effect, I've created an etherpad at
> https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started
> filling it with some points. It would be great if others could fill in
> the blanks so that everyone has a chance to see what is on the table so
> far, so that we can hopefully discuss this on-list and come up with
> something that works.

...just to clarify, my idea was that we could have a writeup of the
various issues and proposed solutions on the etherpad, and then post the
contents on-list next week as a starting point for a discussion that is
not hidden deeply inside the discussion on a patch set.

So, please continue adding points :)
Tian, Kevin Nov. 8, 2021, 8:53 a.m. UTC | #59
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, October 26, 2021 11:19 PM
> 
> On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> 
> > > This is also why I don't like it being so transparent as it is
> > > something userspace needs to care about - especially if the HW cannot
> > > support such a thing, if we intend to allow that.
> >
> > Userspace does need to care, but userspace's concern over this should
> > not be able to compromise the platform and therefore making VF
> > assignment more susceptible to fatal error conditions to comply with a
> > migration uAPI is troublesome for me.
> 
> It is an interesting scenario.
> 
> I think it points that we are not implementing this fully properly.
> 
> The !RUNNING state should be like your reset efforts.
> 
> All access to the MMIO memories from userspace should be revoked
> during !RUNNING

This assumes that vCPUs must be stopped before !RUNNING is entered 
in the virtualization case, and it is true today.

But it may not hold when talking about guest SVA and I/O page fault [1].
The problem is that the pending requests may trigger I/O page faults
on guest page tables. W/o running vCPUs to handle those faults, the
quiesce command cannot complete draining the pending requests
if the device doesn't support preempt-on-fault (at least it's the case for
some Intel and Huawei devices, possibly true for most initial SVA
implementations). 

Of course migrating guest SVA requires more changes as discussed in [1]. 
Here I just want to point out this forward-looking requirement so any
definition change in this thread won't break that usage.

[1] https://lore.kernel.org/qemu-devel/06cb5bfd-f6f8-b61b-1a7e-60a9ae2f8fac@nvidia.com/T/
(p.s. 'stop device' in [1] means 'quiesce device' in this thread)

Thanks,
Kevin

> 
> All VMAs zap'd.
> 
> All IOMMU peer mappings invalidated.
> 
> The kernel should directly block userspace from causing a MMIO TLP
> before the device driver goes to !RUNNING.
> 
> Then the question of what the device does at this edge is not
> relevant as hostile userspace cannot trigger it.
> 
> The logical way to implement this is to key off running and
> block/unblock MMIO access when !RUNNING.
> 
> To me this strongly suggests that the extra bit is the correct way
> forward as the driver is much simpler to implement and understand if
> RUNNING directly controls the availability of MMIO instead of having
> an irregular case where !RUNNING still allows MMIO but only until a
> pending_bytes read.
> 
> Given the complexity of this can we move ahead with the current
> mlx5_vfio and Yishai&co can come with some followup proposal to split
> the freeze/queice and block MMIO?
> 
> Jason
Jason Gunthorpe Nov. 8, 2021, 12:35 p.m. UTC | #60
On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, October 26, 2021 11:19 PM
> > 
> > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> > 
> > > > This is also why I don't like it being so transparent as it is
> > > > something userspace needs to care about - especially if the HW cannot
> > > > support such a thing, if we intend to allow that.
> > >
> > > Userspace does need to care, but userspace's concern over this should
> > > not be able to compromise the platform and therefore making VF
> > > assignment more susceptible to fatal error conditions to comply with a
> > > migration uAPI is troublesome for me.
> > 
> > It is an interesting scenario.
> > 
> > I think it points that we are not implementing this fully properly.
> > 
> > The !RUNNING state should be like your reset efforts.
> > 
> > All access to the MMIO memories from userspace should be revoked
> > during !RUNNING
> 
> This assumes that vCPUs must be stopped before !RUNNING is entered 
> in virtualization case. and it is true today.
> 
> But it may not hold when talking about guest SVA and I/O page fault [1].
> The problem is that the pending requests may trigger I/O page faults
> on guest page tables. W/o running vCPUs to handle those faults, the
> quiesce command cannot complete draining the pending requests
> if the device doesn't support preempt-on-fault (at least it's the case for
> some Intel and Huawei devices, possibly true for most initial SVA
> implementations). 

It cannot be ordered any other way.

vCPUs must be stopped first, then the PCI devices must be stopped
after, otherwise the vCPU can touch a stopped device while handling
a fault which is unreasonable.

However, migrating a pending IOMMU fault does seem unreasonable as well.

The NDMA state can potentially solve this:

  RUNNING | VCPU RUNNING - Normal
  NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all faults
  NDMA | RUNNING - Halt all MMIO access
  0 - Halted everything

Though this may be more disruptive to the vCPUs as they could spin on
DMA/interrupts that will not come.

Jason
Tian, Kevin Nov. 9, 2021, 12:58 a.m. UTC | #61
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, November 8, 2021 8:36 PM
> 
> On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, October 26, 2021 11:19 PM
> > >
> > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> > >
> > > > > This is also why I don't like it being so transparent as it is
> > > > > something userspace needs to care about - especially if the HW
> cannot
> > > > > support such a thing, if we intend to allow that.
> > > >
> > > > Userspace does need to care, but userspace's concern over this should
> > > > not be able to compromise the platform and therefore making VF
> > > > assignment more susceptible to fatal error conditions to comply with a
> > > > migration uAPI is troublesome for me.
> > >
> > > It is an interesting scenario.
> > >
> > > I think it points that we are not implementing this fully properly.
> > >
> > > The !RUNNING state should be like your reset efforts.
> > >
> > > All access to the MMIO memories from userspace should be revoked
> > > during !RUNNING
> >
> > This assumes that vCPUs must be stopped before !RUNNING is entered
> > in virtualization case. and it is true today.
> >
> > But it may not hold when talking about guest SVA and I/O page fault [1].
> > The problem is that the pending requests may trigger I/O page faults
> > on guest page tables. W/o running vCPUs to handle those faults, the
> > quiesce command cannot complete draining the pending requests
> > if the device doesn't support preempt-on-fault (at least it's the case for
> > some Intel and Huawei devices, possibly true for most initial SVA
> > implementations).
> 
> It cannot be ordered any other way.
> 
> vCPUs must be stopped first, then the PCI devices must be stopped
> after, otherwise the vCPU can touch a stopped device while handling
> a fault which is unreasonable.
> 
> However, migrating a pending IOMMU fault does seem unreasonable as well.
> 
> The NDMA state can potentially solve this:
> 
>   RUNNING | VCPU RUNNING - Normal
>   NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all
> faults
>   NDMA | RUNNING - Halt all MMIO access

should be two steps?

NDMA | RUNNING - vCPU stops access to the device
NDMA - halt all MMIO access by revoking mapping

>   0 - Halted everything

yes, adding a new state sounds better than reordering the vcpu/device
stop sequence.

> 
> Though this may be more disruptive to the vCPUs as they could spin on
> DMA/interrupts that will not come.

it's inevitable regardless of how we define the migration states. the
actual impact depends on how long 'Halt and flush DMA' will take.

Thanks
Kevin
Jason Gunthorpe Nov. 9, 2021, 12:45 p.m. UTC | #62
On Tue, Nov 09, 2021 at 12:58:26AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, November 8, 2021 8:36 PM
> > 
> > On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Tuesday, October 26, 2021 11:19 PM
> > > >
> > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote:
> > > >
> > > > > > This is also why I don't like it being so transparent as it is
> > > > > > something userspace needs to care about - especially if the HW
> > cannot
> > > > > > support such a thing, if we intend to allow that.
> > > > >
> > > > > Userspace does need to care, but userspace's concern over this should
> > > > > not be able to compromise the platform and therefore making VF
> > > > > assignment more susceptible to fatal error conditions to comply with a
> > > > > migration uAPI is troublesome for me.
> > > >
> > > > It is an interesting scenario.
> > > >
> > > > I think it points that we are not implementing this fully properly.
> > > >
> > > > The !RUNNING state should be like your reset efforts.
> > > >
> > > > All access to the MMIO memories from userspace should be revoked
> > > > during !RUNNING
> > >
> > > This assumes that vCPUs must be stopped before !RUNNING is entered
> > > in virtualization case. and it is true today.
> > >
> > > But it may not hold when talking about guest SVA and I/O page fault [1].
> > > The problem is that the pending requests may trigger I/O page faults
> > > on guest page tables. W/o running vCPUs to handle those faults, the
> > > quiesce command cannot complete draining the pending requests
> > > if the device doesn't support preempt-on-fault (at least it's the case for
> > > some Intel and Huawei devices, possibly true for most initial SVA
> > > implementations).
> > 
> > It cannot be ordered any other way.
> > 
> > vCPUs must be stopped first, then the PCI devices must be stopped
> > after, otherwise the vCPU can touch a stopped device while handling
> > a fault which is unreasonable.
> > 
> > However, migrating a pending IOMMU fault does seem unreasonable as well.
> > 
> > The NDMA state can potentially solve this:
> > 
> >   RUNNING | VCPU RUNNING - Normal
> >   NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all
> > faults
> >   NDMA | RUNNING - Halt all MMIO access
> 
> should be two steps?
> 
> NDMA | RUNNING - vCPU stops access to the device
> NDMA - halt all MMIO access by revoking mapping

No, NDMA without running is equivalent to 0, which is the next step:

> >   0 - Halted everything

Jason
Jason Gunthorpe Nov. 15, 2021, 11:29 p.m. UTC | #63
On Fri, Nov 05, 2021 at 09:31:45AM -0600, Alex Williamson wrote:
> On Fri, 5 Nov 2021 10:24:04 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote:
> > 
> > > We agreed that it's easier to add a feature than a restriction in a
> > > uAPI, so how do we resolve that some future device may require a new
> > > state in order to apply the SET_IRQS configuration?  
> > 
> > I would say don't support those devices. If there is even a hint that
> > they could maybe exist then we should fix it now. Once the uapi is set
> > and documented we should expect device makers to consider it when
> > building their devices.
> > 
> > As for SET_IRQs, I have been looking at making documentation and I
> > don't like the way the documentation has to be written because of
> > this.
> > 
> > What I see as an understandable, clear, documentation is:
> > 
> >  - SAVING set - no device touches allowed beyond migration operations
> >    and reset via XX
> 
> I'd suggest defining reset via ioctl only.
> 
> >    Must be set with !RUNNING
> 
> Not sure what this means.  Pre-copy requires SAVING and RUNNING
> together, is this only suggesting that to get the final device state we
> need to do so in a !RUNNING state?

Sorry, I did not think about pre-copy here, mlx5 doesn't do it so I'm
not as familiar

> >  - RESUMING set - same as SAVING
> 
> I take it then that we're defining a new protocol if we can't do
> SET_IRQS here.

We've been working on some documentation and one of the challenges
turns out to be that all the PCI device state owned by other subsystems (eg
the PCI core, the interrupt code, power management, etc) must be kept
in sync. No matter what, RESUMING cannot just asynchronously change device state
that the kernel assumes it is controlling.

So, in practice, this necessarily requires forbidding the device from
touching the MSI table, and other stuff, during RESUMING.

Further, since we can't just halt all the other kernel subsystems
during SAVING/RESUMING the device must be able to accept touches in
those areas, for completely unrelated reasons (eg a MSI addr/data
being changed), safely.

Seems like no need to change SET_IRQs.


> >  - NDMA set - full device touches
> >    Device may not issue DMA or interrupts (??)
> >    Device may not dirty pages
> 
> Is this achievable?  We can't bound the time where incoming DMA is
> possible, devices don't have infinite buffers.

It is a necessary evil for migration. 

The device cannot know how long it will be suspended for and must
cope. With networking, discarded packets can be resent, but the reality
is that real deployments need a QOS that the device will not be paused
for too long, otherwise the peers may declare the node dead.

> > Not entirely, to support P2P going from RESUMING directly to RUNNING
> > is not possible. There must be an in between state that all devices
> > reach before they go to RUNNING. It seems P2P cannot be bolted into
> > the existing qemu flow with a kernel only change?
> 
> Perhaps, yes.

We have also been looking at dirty tracking and we are wondering how
that should work. (Dirty tracking will be another followup)

If we look at mlx5, it will have built in dirty tracking, and when
used with newer IOMMUs there is also system dirty tracking
available.

I think userspace should decide if it wants to use mlx5 built in or
the system IOMMU to do dirty tracking.

Presumably the system IOMMU is turned on via
VFIO_IOMMU_DIRTY_PAGES_FLAG_START, but what controls if the mlx5
mechanism should be used or not?

mlx5 also has no way to return the dirty log. If the system IOMMU is
not used then VFIO_IOMMU_DIRTY_PAGES_FLAG_START should not be done,
however that is what controls all the logic under the two GET_BITMAP
APIs. (even if fixed I don't really like the idea of the IOMMU
extracting this data from the migration driver in the context of
iommufd)
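
(For context, a sketch of how userspace drives that type1 flow today;
container_fd, the iova/size window and bitmap_buf are placeholders,
the 4K page size is assumed, and error handling is omitted:)

  /* needs <sys/ioctl.h>, <stdlib.h>, <linux/vfio.h> */
  struct vfio_iommu_type1_dirty_bitmap start = {
          .argsz = sizeof(start),
          .flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
  };
  ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &start);

  /* later, for each mapped range */
  size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                 sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
  struct vfio_iommu_type1_dirty_bitmap *range = malloc(argsz);
  struct vfio_iommu_type1_dirty_bitmap_get *get = (void *)range->data;

  range->argsz = argsz;
  range->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
  get->iova = map_iova;
  get->size = map_size;
  get->bitmap.pgsize = 4096;
  /* one bit per 4K page, rounded up to a multiple of 8 bytes */
  get->bitmap.size = ((map_size / 4096 + 63) / 64) * 8;
  get->bitmap.data = bitmap_buf;
  ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, range);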

Further how does mlx5 even report that it has dirty tracking?

Was there some plan here we are missing?

In light of all this I'm wondering if device dirty tracking should
exist as new ioctls on the device FD and reserve the type1 code to
only work the IOMMU dirty tracking.

Jason
Cornelia Huck Nov. 16, 2021, 4:59 p.m. UTC | #64
On Fri, Nov 05 2021, Cornelia Huck <cohuck@redhat.com> wrote:

> On Thu, Nov 04 2021, Cornelia Huck <cohuck@redhat.com> wrote:
>
>> So, I doubt that I'm the only person trying to follow this discussion
>> who has lost the overview about issues and possible solutions here. I
>> think it would be a good idea to summarize what has been brought up so
>> far outside of this thread.
>>
>> To that effect, I've created an etherpad at
>> https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started
>> filling it with some points. It would be great if others could fill in
>> the blanks so that everyone has a chance to see what is on the table so
>> far, so that we can hopefully discuss this on-list and come up with
>> something that works.
>
> ...just to clarify, my idea was that we could have a writeup of the
> various issues and proposed solutions on the etherpad, and then post the
> contents on-list next week as a starting point for a discussion that is
> not hidden deeply inside the discussion on a patch set.
>
> So, please continue adding points :)

Last call for anything you want to add to the etherpad; I'll post the
contents tomorrow.

[Yes, I wanted to post it last week, but got sidetracked.]
Alex Williamson Nov. 16, 2021, 5:57 p.m. UTC | #65
On Mon, 15 Nov 2021 19:29:21 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Nov 05, 2021 at 09:31:45AM -0600, Alex Williamson wrote:
> > On Fri, 5 Nov 2021 10:24:04 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote:
> > >   
> > > > We agreed that it's easier to add a feature than a restriction in a
> > > > uAPI, so how do we resolve that some future device may require a new
> > > > state in order to apply the SET_IRQS configuration?    
> > > 
> > > I would say don't support those devices. If there is even a hint that
> > > they could maybe exist then we should fix it now. Once the uapi is set
> > > and documented we should expect device makers to consider it when
> > > building their devices.
> > > 
> > > As for SET_IRQs, I have been looking at making documentation and I
> > > don't like the way the documentation has to be wrriten because of
> > > this.
> > > this.
> > > 
> > > What I see as an understandable, clear, documentation is:
> > > 
> > >  - SAVING set - no device touches allowed beyond migration operations
> > >    and reset via XX  
> > 
> > I'd suggest defining reset via ioctl only.
> >   
> > >    Must be set with !RUNNING  
> > 
> > Not sure what this means.  Pre-copy requires SAVING and RUNNING
> > together, is this only suggesting that to get the final device state we
> > need to do so in a !RUNNING state?  
> 
> Sorry, I did not think about pre-copy here, mlx5 doesn't do it so I'm
> not as familiar
> 
> > >  - RESUMING set - same as SAVING  
> > 
> > I take it then that we're defining a new protocol if we can't do
> > SET_IRQS here.  
> 
> We've been working on some documentation and one of the challenges
> turns out that all the PCI device state owned by other subsystems (eg
> the PCI core, the interrupt code, power management, etc) must be kept
> in sync. No matter what RESUMING cannot just async change device state
> that the kernel assumes it is controlling.
> 
> So, in practice, this necessarily requires forbidding the device from
> touching the MSI table, and other stuff, during RESUMING.
> 
> Further, since we can't just halt all the other kernel subsystems
> during SAVING/RESUMING the device must be able to accept touches in
> those areas, for completely unrelated reasons, (eg a MSI addr/data
> being changed) safely.
> 
> Seems like no need to change SET_IRQs.
> 
> 
> > >  - NDMA set - full device touches
> > >    Device may not issue DMA or interrupts (??)
> > >    Device may not dirty pages  
> > 
> > Is this achievable?  We can't bound the time where incoming DMA is
> > possible, devices don't have infinite buffers.  
> 
> It is a necessary evil for migration. 
> 
> The device cannot know how long it will be suspended for and must
> cope. With networking discarded packets can be resent, but the reality
> is that real deployments need a QOS that the device will not be paused
> for too long otherwise the peers may declare the node dead.
> 
> > > Not entirely, to support P2P going from RESUMING directly to RUNNING
> > > is not possible. There must be an in between state that all devices
> > > reach before they go to RUNNING. It seems P2P cannot be bolted into
> > > the existing qemu flow with a kernel only change?
> > 
> > Perhaps, yes.  
> 
> We have also been looking at dirty tracking and we are wondering how
> that should work. (Dirty tracking will be another followup)
> 
> If we look at mlx5, it will have built in dirty tracking, and when
> used with a newer IOMMUs there is also system dirty tracking
> available.
> 
> I think userspace should decide if it wants to use mlx5 built in or
> the system IOMMU to do dirty tracking.

What information does userspace use to inform such a decision?
Ultimately userspace just wants the finest granularity of tracking,
shouldn't that guide our decisions which to provide?

> Presumably the system IOMMU is turned on via
> VFIO_IOMMU_DIRTY_PAGES_FLAG_START, but what controls if the mlx5
> mechanism should be used or not?
> 
> mlx5 also has no way to return the dirty log. If the system IOMMU is
> not used then VFIO_IOMMU_DIRTY_PAGES_FLAG_START should not be done,
> however that is what controls all the logic under the two GET_BITMAP
> APIs. (even if fixed I don't really like the idea of the IOMMU
> extracting this data from the migration driver in the context of
> iommufd)
> 
> Further how does mlx5 even report that it has dirty tracking?
> 
> Was there some plan here we are missing?

I believe the intended progression of dirty tracking is that by default
all mapped ranges are dirty.  If the device supports page pinning, then
we reduce the set of dirty pages to those pages which are pinned.  A
device that doesn't otherwise need page pinning, such as a fully IOMMU
backed device, would use gratuitous page pinning triggered by the
_SAVING state activation on the device.  It sounds like mlx5 could use
this existing support today.

We had also discussed variants to page pinning that might be more
useful as device dirty page support improves.  For example calls to
mark pages dirty once rather than the perpetual dirtying of pinned
pages, calls to pin pages for read vs write, etc.  We didn't dive much
into system IOMMU dirtying, but presumably we'd have a fault handler
triggered if a page is written by the device and go from there.

> In light of all this I'm wondering if device dirty tracking should
> exist as new ioctls on the device FD and reserve the type1 code to
> only work the IOMMU dirty tracking.

Our existing model is working towards the IOMMU, ie. container,
interface aggregating dirty page context.  For example when page
pinning is used, it's only when all devices within the container are
using page pinning that we can report the pinned subset as dirty.
Otherwise userspace needs to poll each device, which I suppose enables
your idea that userspace decides which source to use, but why?  Does
the IOMMU dirty page tracking exclude devices if the user queries the
device separately?  How would it know?  What's the advantage?  It seems
like this creates too many support paths that all need to converge on
the same answer.  Consolidating DMA dirty page tracking to the DMA
mapping interface for all devices within a DMA context makes more sense
to me.  Thanks,

Alex
Jason Gunthorpe Nov. 16, 2021, 7:25 p.m. UTC | #66
On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:

> > I think userspace should decide if it wants to use mlx5 built in or
> > the system IOMMU to do dirty tracking.
> 
> What information does userspace use to inform such a decision?

Kernel can't know which approach performs better. Operators should
benchmark and make a choice for their deployment HW. Maybe device
tracking severely impacts device performance or vice versa.

Kernel doesn't easily know what userspace has done, maybe one device
supports migration driver dirty tracking and one device does not.

Is user space going to use a system IOMMU for both devices? 

Is it going to put the simple device in NDMA early and continue to
dirty track to shutdown the other devices?

> Ultimately userspace just wants the finest granularity of tracking,
> shouldn't that guide our decisions which to provide?

At least for mlx5 there is going to be some trade-off curve of device
performance, dirty tracking page size, and working set.

Even "lower is better" is not necessarily true. After overheads on a
400GB RDMA NIC there is not such a big difference between doing a 4k
and 16k scatter transfer. The CPU work to process all the extra bitmap
data may not be a net win compared to block transfer times.

Conversely someone doing 1G TCP transfers probably cares a lot to
minimize block size.

Overall, I think there is far too much up in the air and unmeasured to
firmly commit the kernel to a fixed policy.

So, I would like to see userspace control most of the policy aspects,
including the dirty track provider.

> I believe the intended progression of dirty tracking is that by default
> all mapped ranges are dirty.  If the device supports page pinning, then
> we reduce the set of dirty pages to those pages which are pinned.  A
> device that doesn't otherwise need page pinning, such as a fully IOMMU

How does userspace know if dirty tracking works or not? All I see is
VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocating some bitmaps.

I'm surprised it doesn't check that only NO_IOMMU's devices are
attached to the container and refuse to dirty track otherwise - since
it doesn't work..

> backed device, would use gratuitous page pinning triggered by the
> _SAVING state activation on the device.  It sounds like mlx5 could use
> this existing support today.

How does mlx5 know if it should turn on its dirty page tracking on
SAVING or if the system IOMMU covers it? Or for some reason userspace
doesn't want dirty tracking but is doing pre-copy?

When we mix dirty track with pre-copy, the progression seems to be:

  DIRTY TRACKING | RUNNING
     Copy every page to the remote
  DT | SAVING | RUNNING
     Copy pre-copy migration data to the remote
  SAVING | NDMA | RUNNING
     Read and clear dirty track device bitmap
  DT | SAVING | RUNNING
     Copy new dirtied data
     (maybe loop back to NDMA a few times?)
  SAVING | NDMA | RUNNING
     P2P grace state
  0
    Read the dirty track and copy data
    Read and send the migration state

Can we do something so complex using only SAVING?

.. and along the lines of the above how do we mix in NDMA to the iommu
container, and how does it work if only some devices support NDMA?

> We had also discussed variants to page pinning that might be more
> useful as device dirty page support improves.  For example calls to
> mark pages dirty once rather than the perpetual dirtying of pinned
> pages, calls to pin pages for read vs write, etc.  We didn't dive much
> into system IOMMU dirtying, but presumably we'd have a fault handler
> triggered if a page is written by the device and go from there.

Would be interesting to know for sure what current IOMMU HW has
done. I'm supposing the easiest implementation is to write a dirty bit
to the IO PTE the same as the CPU writes a dirty bit to the normal PTE.

> > In light of all this I'm wondering if device dirty tracking should
> > exist as new ioctls on the device FD and reserve the type1 code to
> > only work the IOMMU dirty tracking.
> 
> Our existing model is working towards the IOMMU, ie. container,
> interface aggregating dirty page context.  

This creates inefficiencies in the kernel, we copy from the
mlx5-formed data structure to new memory in the iommu through a very
inefficient API and then again we do an ioctl to copy it once more and
throw all the extra work away. It does not seem good for something
where we want performance.

> For example when page pinning is used, it's only when all devices
> within the container are using page pinning that we can report the
> pinned subset as dirty.  Otherwise userspace needs to poll each
> device, which I suppose enables your idea that userspace decides
> which source to use, but why?

Efficiency, and user selectable policy.

Userspace can just allocate an all zeros bitmap and feed it to each of
the providers in the kernel using a 'or in your dirty' semantic.

No redundant kernel data marshaling, userspace gets to decide which
tracking provider to use, and it is simple to implement in the kernel.
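
A sketch of what that would look like from userspace, with
VFIO_DEVICE_GET_DIRTY_BITMAP standing in as a purely hypothetical
per-provider "OR your dirty pages into this bitmap" ioctl (it does not
exist today); map_iova, map_size, bitmap_longs, num_trackers and
tracker_fd[] are likewise placeholders:

  /* hypothetical ioctl and argument struct, for illustration only */
  struct dirty_query {
          __u64 iova;
          __u64 size;
          __u64 pgsize;
          __u64 *data;            /* shared bitmap, one bit per page */
  };

  __u64 *bitmap = calloc(bitmap_longs, sizeof(__u64));   /* all zero */
  struct dirty_query q = {
          .iova = map_iova,
          .size = map_size,
          .pgsize = 4096,
          .data = bitmap,
  };

  for (int i = 0; i < num_trackers; i++)
          ioctl(tracker_fd[i], VFIO_DEVICE_GET_DIRTY_BITMAP, &q);

  /* 'bitmap' now holds the union of every provider's dirty pages */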

Userspace has to do this anyhow if it has configurations with multiple
containers. For instance because it was forced to split the containers
due to one device not supporting NDMA.

> Does the IOMMU dirty page tracking exclude devices if the user
> queries the device separately?  

What makes sense to me is multiple tracking providers. Each can be
turned on and off.

If the container tracking provider says it supports tracking then it
means it can track DMA from every device it is connected to (unlike
today?). eg by using IOMMU HW that naturally does this, or by
having only NO_IOMMU devices.

If the migration driver says it supports tracking, then it only tracks
DMA from that device.

> How would it know?  What's the advantage?  It seems like this
> creates too many support paths that all need to converge on the same
> answer.  Consolidating DMA dirty page tracking to the DMA mapping
> interface for all devices within a DMA context makes more sense to
> me.

What I see is a lot of questions and limitations with this
approach. If we stick to funneling everything through the iommu then
answering the questions seems to create a large amount of kernel
work. Enough to ask if it is worthwhile..

.. and then we have to ask how does this all work in IOMMUFD where it
is not so reasonable to tightly couple the migration driver and the
IOAS and I get more questions :)

Jason
Alex Williamson Nov. 16, 2021, 9:10 p.m. UTC | #67
On Tue, 16 Nov 2021 15:25:05 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:
> 
> > > I think userspace should decide if it wants to use mlx5 built in or
> > > the system IOMMU to do dirty tracking.  
> > 
> > What information does userspace use to inform such a decision?  
> 
> Kernel can't know which approach performs better. Operators should
> benchmark and make a choice for their deployment HW. Maybe device
> tracking severely impacts device performance or vice versa.

I'm all for keeping policy decisions out of the kernel, but it's pretty
absurd to expect a userspace operator to benchmark various combination
and wire various knobs through the user interface for this.  It seems
to me that the kernel, ie. the vfio variant driver, *can* know the best
default.  We can design in interfaces so that the driver may, for
example, know whether to pin pages or defer to the system IOMMU dirty
tracking.  The driver provider can provide quirks for IOMMU
implementations that perform poorly versus device provided
alternatives.  The driver can know if a device iotlb cache provides the
better result.  The driver can provide module options or devlink tweaks
to change the behavior.  This seems like something userspace doesn't
want to care about in the common path.

> Kernel doesn't easily know what userspace has done, maybe one device
> supports migration driver dirty tracking and one device does not.

And that's exactly why the current type1 implementation exposes the
least common denominator to userspace, ie. pinned pages only if all
devices in the container have enabled this degree of granularity.

> Is user space going to use a system IOMMU for both devices? 

If the system IOMMU supports it and none of the drivers have opted to
report via other means, yes.

> Is it going to put the simple device in NDMA early and continue to
> dirty track to shutdown the other devices?

Yes, the current model could account for this, the device entering NDMA
mode effectively becomes enlightened because it knows that it is no
longer dirtying pages.  It'd be the same as a driver turning on page
pinning with _SAVING, we'd just need a way to do that without actually
pinning a page.

> > Ultimately userspace just wants the finest granularity of tracking,
> > shouldn't that guide our decisions which to provide?  
> 
> At least for mlx5 there is going to some trade off curve of device
> performance, dirty tracking page size, and working set.
> 
> Even lower is better is not necessarily true. After overheads on a
> 400GB RDMA NIC there is not such a big difference between doing a 4k
> and 16k scatter transfer. The CPU work to process all the extra bitmap
> data may not be a net win compared to block transfer times.
> 
> Conversely someone doing 1G TCP transfers probably cares a lot to
> minimize block size.
> 
> Overall, I think there is far too much up in the air and unmeasured to
> firmly commit the kernel to a fixed policy.
> 
> So, I would like to see userspace control most of the policy aspects,
> including the dirty track provider.

This sounds like device specific migration parameter tuning via a
devlink interface to me, tbh.  How would you propose a generic
vfio/iommufd interface to tune this sort of thing?

> > I believe the intended progression of dirty tracking is that by default
> > all mapped ranges are dirty.  If the device supports page pinning, then
> > we reduce the set of dirty pages to those pages which are pinned.  A
> > device that doesn't otherwise need page pinning, such as a fully IOMMU  
> 
> How does userspace know if dirty tracking works or not? All I see
> VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps.

IIRC, it's always supported by type1.  In the worst case we always
report all mapped pages as dirty.

> I'm surprised it doesn't check that only NO_IOMMU's devices are
> attached to the container and refuse to dirty track otherwise - since
> it doesn't work..

No-IOMMU doesn't use type1, the ioctl returns errno.

> > backed device, would use gratuitous page pinning triggered by the
> > _SAVING state activation on the device.  It sounds like mlx5 could use
> > this existing support today.  
> 
> How does mlx5 know if it should turn on its dirty page tracking on
> SAVING or if the system IOMMU covers it? Or for some reason userspace
> doesn't want dirty tracking but is doing pre-copy?

Likely there'd be some sort of IOMMU property the driver could check,
type1 would need to figure out the same.  The type1/iommufd interfaces
to the driver could evolve so that the driver can know if DMA dirty
tracking is enabled by the user.

> When we mix dirty track with pre-copy, the progression seems to be:
> 
>   DIRTY TRACKING | RUNNING
>      Copy every page to the remote
>   DT | SAVING | RUNNING
>      Copy pre-copy migration data to the remote
>   SAVING | NDMA | RUNNING
>      Read and clear dirty track device bitmap
>   DT | SAVING | RUNNING
>      Copy new dirtied data
>      (maybe loop back to NDMA a few times?)
>   SAVING | NDMA | RUNNING
>      P2P grace state
>   0
>     Read the dirty track and copy data
>     Read and send the migration state
> 
> Can we do something so complex using only SAVING?

I'm not demanding that triggering device dirty tracking on saving is
how this must be done, I'm only stating that's an idea that was
discussed.  If we need more complicated triggers between the IOMMU and
device, let's define those, but I don't see that doing so negates the
benefits of aggregated dirty bitmaps in the IOMMU context.

> .. and along the lines of the above how do we mix in NDMA to the iommu
> container, and how does it work if only some devices support NDMA?

As above, turning on NDMA effectively enlightens the device, we'd need
a counter interface to de-enlighten, the IOMMU dirty context
dynamically works at the least common denominator at the time.

> > We had also discussed variants to page pinning that might be more
> > useful as device dirty page support improves.  For example calls to
> > mark pages dirty once rather than the perpetual dirtying of pinned
> > pages, calls to pin pages for read vs write, etc.  We didn't dive much
> > into system IOMMU dirtying, but presumably we'd have a fault handler
> > triggered if a page is written by the device and go from there.  
> 
> Would be interesting to know for sure what current IOMMU HW has
> done. I'm supposing the easiest implementation is to write a dirty bit
> to the IO PTE the same as the CPU writes a dirty bit the normal PTE.
> 
> > > In light of all this I'm wondering if device dirty tracking should
> > > exist as new ioctls on the device FD and reserve the type1 code to
> > > only work the IOMMU dirty tracking.  
> > 
> > Our existing model is working towards the IOMMU, ie. container,
> > interface aggregating dirty page context.    
> 
> This creates inefficiencies in the kernel, we copy from the mlx5
> formed data structure to new memory in the iommu through a very
> inefficient API and then again we do an ioctl to copy it once more and
> throw all the extra work away. It does not seem good for something
> where we want performance.

So maybe the dirty bitmaps for the IOMMU context need to be exposed to
and directly modifiable by the drivers using atomic bitmap ops.  Maybe
those same bitmaps can be mmap'd to userspace.  These bitmaps are not
insignificant, do we want every driver managing their own copies?

> > For example when page pinning is used, it's only when all devices
> > within the container are using page pinning that we can report the
> > pinned subset as dirty.  Otherwise userspace needs to poll each
> > device, which I suppose enables your idea that userspace decides
> > which source to use, but why?  
> 
> Efficiency, and user selectable policy.
> 
> Userspace can just allocate an all zeros bitmap and feed it to each of
> the providers in the kernel using a 'or in your dirty' semantic.
> 
> No redundant kernel data marshaling, userspace gets to decide which
> tracking provider to use, and it is simple to implement in the kernel.
> 
> Userspace has to do this anyhow if it has configurations with multiple
> containers. For instance because it was forced to split the containers
> due to one device not supporting NDMA.

Huh?  When did that become a requirement?  I feel like there are a lot
of excuses listed here, but nothing that really demands a per device
interface, or at least a per device interface that we have any hope of
specifying.  Shared infrastructure in the IOMMU allows both kernel-side
and userspace-side consolidation of these bitmaps.  It's pretty clear
that our current interfaces are rudimentary, but we've got to start
somewhere.
 
> > Does the IOMMU dirty page tracking exclude devices if the user
> > queries the device separately?    
> 
> What makes sense to me is multiple tracking providers. Each can be
> turned on and off.
> 
> If the container tracking provider says it supports tracking then it
> means it can track DMA from every device it is connected to (unlike
> today?). eg by using IOMMU HW that naturally does this, or by only
> having only NO_IOMMU devices.

Let's kick No-IOMMU out of this conversation, it doesn't claim to have
any of these features, it never will.  Type1 can always provide dirty
tracking with the default being to mark all mapped pages perpetually
dirty.  It doesn't make much sense for userspace to get a concise dirty
bitmap from one device and a full dirty bitmap from another.  We've
optimized that userspace gets the least common denominator for the
entire IOMMU context.

> If the migration driver says it supports tracking, then it only tracks
> DMA from that device.

I don't see what this buys us.  Userspace is only going to do a
migration if all devices support the per device migration region.  At
that point we need the best representation of the dirty bitmap we can
provide per IOMMU context.  It makes sense to me to aggregate per
device dirtying into that one context.

> > How would it know?  What's the advantage?  It seems like this
> > creates too many support paths that all need to converge on the same
> > answer.  Consolidating DMA dirty page tracking to the DMA mapping
> > interface for all devices within a DMA context makes more sense to
> > me.  
> 
> What I see is a lot of questions and limitations with this
> approach. If we stick to funneling everything through the iommu then
> answering the questions seem to create a large amount of kernel
> work. Enough to ask if it is worthwhile..

If we need a common userspace IOMMU subsystem like IOMMUfd that can
handle driver page pinning, IOMMU faults, and dirty tracking, why does
it suddenly become an unbearable burden to allow other means besides
page pinning for a driver to relay DMA page writes?  OTOH, aggregating
these features in the IOMMU reduces both overhead of per device bitmaps
and user operations to create their own consolidated view.

> .. and then we have to ask how does this all work in IOMMUFD where it
> is not so reasonable to tightly couple the migration driver and the
> IOAS and I get more questions :)

To me, the per device option seems pretty ad-hoc, cumbersome and
complicated for userspace, and invents problems with the aggregated
bitmap that either don't really exist (imo) or where interfaces could be
refined.  Maybe start with what uAPI visible knobs really make sense
and provide a benefit for per-device dirty bitmap tuning and how a
device agnostic userspace like QEMU is going to make intelligent
decisions about those knobs.  Otherwise I see the knobs as out-of-band
and most of the other arguments tending towards NIH.  Thanks,

Alex
Jason Gunthorpe Nov. 17, 2021, 1:48 a.m. UTC | #68
On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote:
> On Tue, 16 Nov 2021 15:25:05 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:
> > 
> > > > I think userspace should decide if it wants to use mlx5 built in or
> > > > the system IOMMU to do dirty tracking.  
> > > 
> > > What information does userspace use to inform such a decision?  
> > 
> > Kernel can't know which approach performs better. Operators should
> > benchmark and make a choice for their deployment HW. Maybe device
> > tracking severely impacts device performance or vice versa.
> 
> I'm all for keeping policy decisions out of the kernel, but it's pretty
> absurd to expect a userspace operator to benchmark various combination
> and wire various knobs through the user interface for this. 

Huh? This is the standard operating procedure in netdev land, mm and
others. Max performance requires tunables. And here we really do care
a lot about migration time. I've seen the mm complexity supporting the
normal vCPU migration to know this is a high priority for many people.

> It seems to me that the kernel, ie. the vfio variant driver, *can*
> know the best default.  

If the kernel can have an algorithm to find the best default then qemu
can implement the same algorithm too. I'm also skeptical about this
claim that the best is knowable.

> > Kernel doesn't easily know what userspace has done, maybe one device
> > supports migration driver dirty tracking and one device does not.
> 
> And that's exactly why the current type1 implementation exposes the
> least common denominator to userspace, ie. pinned pages only if all
> devices in the container have enabled this degree of granularity.

The current implementation forces no dirty tracking. That is a policy
choice that denies userspace the option to run one device with dirty
track and simply suspend the other.

> > So, I would like to see userspace control most of the policy aspects,
> > including the dirty track provider.
> 
> This sounds like device specific migration parameter tuning via a
> devlink interface to me, tbh.  How would you propose a generic
> vfio/iommufd interface to tune this sort of thing?

As I said, if each page track provider has its own interface it is
straightforward to make policy in userspace. The only tunable beyond
that is the dirty page tracking granularity. That is already
provided by userspace, but not as a parameter during START.

I don't see why we'd need something as complicated as devlink just
yet.

> > How does userspace know if dirty tracking works or not? All I see
> > VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps.
> 
> IIRC, it's always supported by type1.  In the worst case we always
> report all mapped pages as dirty.

Again this denies userspace a policy choice. Why do dirty tracking
gyrations if they don't work? Just directly suspend the devices and
then copy.

> > I'm surprised it doesn't check that only NO_IOMMU's devices are
> > attached to the container and refuse to dirty track otherwise - since
> > it doesn't work..
> 
> No-IOMMU doesn't use type1, the ioctl returns errno.

Sorry, I mistyped that, I meant the emulated iommu, as HCH has called it:

vfio_register_emulated_iommu_dev()

> > When we mix dirty track with pre-copy, the progression seems to be:
> > 
> >   DIRTY TRACKING | RUNNING
> >      Copy every page to the remote
> >   DT | SAVING | RUNNING
> >      Copy pre-copy migration data to the remote
> >   SAVING | NDMA | RUNNING
> >      Read and clear dirty track device bitmap
> >   DT | SAVING | RUNNING
> >      Copy new dirtied data
> >      (maybe loop back to NDMA a few times?)
> >   SAVING | NDMA | RUNNING
> >      P2P grace state
> >   0
> >     Read the dirty track and copy data
> >     Read and send the migration state
> > 
> > Can we do something so complex using only SAVING?
> 
> I'm not demanding that triggering device dirty tracking on saving is
> how this must be done, I'm only stating that's an idea that was
> discussed.  If we need more complicated triggers between the IOMMU and
> device, let's define those, but I don't see that doing so negates the
> benefits of aggregated dirty bitmaps in the IOMMU context.

Okay. As far as your request to document things as we see them
upcoming, I believe we should have some idea how dirty tracking control
fits in. I agree that is not related to how the bitmap is reported. We
will continue to think about dirty tracking as not connected to
SAVING.

> > This creates inefficiencies in the kernel, we copy from the mlx5
> > formed data structure to new memory in the iommu through a very
> > inefficient API and then again we do an ioctl to copy it once more and
> > throw all the extra work away. It does not seem good for something
> > where we want performance.
> 
> So maybe the dirty bitmaps for the IOMMU context need to be exposed to
> and directly modifiable by the drivers using atomic bitmap ops.  Maybe
> those same bitmaps can be mmap'd to userspace.  These bitmaps are not
> insignificant, do we want every driver managing their own copies?

If we look at mlx5 for example there is no choice. The dirty log is in
some device specific format and is not sharable. We must allocate
memory to work with it.

What I don't need is the bitmap memory in the iommu container, that is
all useless for mlx5.

So, down this path we need some API for the iommu context to not
allocate its memory at all and refer to storage from the tracking
provider for cases where that makes sense.

> > Userspace has to do this anyhow if it has configurations with multiple
> > containers. For instance because it was forced to split the containers
> > due to one device not supporting NDMA.
> 
> Huh?  When did that become a requirement?  

It is not a requirement, it is something userspace can do, if it
wants. And we talked about this, if NDMA isn't supported the P2P can't
work, and a way to enforce that is to not include P2P in the IOMMU
mapping. Except if you have an asymmetric NDMA you may want an
asymmetric IOMMU mapping too where the NDMA devices can do P2P and the
others don't. That is two containers and two dirty bitmaps.

Who knows, it isn't the job of the kernel to make these choices, the
kernel just provides tools.

> I feel like there are a lot of excuses listed here, but nothing that
> really demands a per device interface, 

"excuses" and "NIH" is a bit rude Alex.

From my side, you are asking for a lot of work from us (who else?) to
define and implement a whack of missing kernel functionality.

I think it is very reasonable to ask what the return to the community
is for this work. "makes more sense to me" is not something that
is really compelling.

So, I'd rather you tell me concretely why doing this work, in this
way, is a good idea.

> > If the migration driver says it supports tracking, then it only tracks
> > DMA from that device.
> 
> I don't see what this buys us.  Userspace is only going to do a
> migration if all devices support the per device migration region.  At
> that point we need the best representation of the dirty bitmap we can
> provide per IOMMU context.  It makes sense to me to aggregate per
> device dirtying into that one context.

Again, there are policy choices userspace can make, like just
suspending the device that doesn't do dirty tracking and continuing to
dirty track the ones that do.

This might be very logical if the non-dirty-tracking device is something
like IDXD that will just cause higher request latency, while the dirty
tracking one is mlx5 that will cause externally visible artifacts.

My point is *we don't know* what people will want.

I also think you are too focused on a one-size solution that fits into
a qemu sort of enterprise product. While I can agree with your position
relative to an enterprise style customer, NVIDIA has different
customers, many of which have their own customized VMMs that are tuned for
their HW choices. For these customers I would like to see that the kernel
allows options, and I don't think it is appropriate to be so
dismissive of this perspective.

> > What I see is a lot of questions and limitations with this
> > approach. If we stick to funneling everything through the iommu then
> > answering the questions seem to create a large amount of kernel
> > work. Enough to ask if it is worthwhile..
> 
> If we need a common userspace IOMMU subsystem like IOMMUfd that can
> handle driver page pinning, IOMMU faults, and dirty tracking, why does
> it suddenly become an unbearable burden to allow other means besides
> page pinning for a driver to relay DMA page writes?

I can see some concrete reasons for iommufd, like it allows this code
to be shared between multiple subsystems that need it. Its design
supports more functionality than the vfio container can.

Here, I don't quite see it. If userspace does

  for (i = 0; i != num_trackers; i++)
     ioctl(merge dirty bitmap, i, &bitmap)

Or
   ioctl(read dirty bitmap, &bitmap)

Hardly seems decisive. What bothers me is the overhead and kernel
complexity.

If we split them it looks quite simple:
 - new ioctl on vfio device to read & clear dirty bitmap
    & extension flag to show this is supported
 - DIRTY_TRACK migration state bit to start/stop
   Simple logic that read is only possible in NDMA/!RUNNING
 - new ioctl on vfio device to report dirty tracking capability flags:
    - internal dirty track supported
    - real dirty track through attached container supported
      (only mdev can set this today)
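
To make that concrete - purely illustrative, none of these names exist
today, they just show the shape of the split:

  /* Hypothetical per-device dirty tracking uAPI - illustration only */
  #define VFIO_DEVICE_DIRTY_CAP_INTERNAL   (1 << 0) /* device has its own tracker */
  #define VFIO_DEVICE_DIRTY_CAP_CONTAINER  (1 << 1) /* real tracking via container */

  struct vfio_device_dirty_caps {       /* the "report capability flags" ioctl */
          __u32 argsz;
          __u32 flags;                  /* VFIO_DEVICE_DIRTY_CAP_* */
  };

  struct vfio_device_dirty_bitmap {     /* the "read & clear" ioctl */
          __u32 argsz;
          __u32 flags;
          __u64 iova;
          __u64 size;
          __u64 pgsize;                 /* tracking granularity */
          __u64 bitmap_size;            /* bytes at bitmap */
          __u64 *bitmap;                /* filled, device log cleared atomically */
  };

  /* start/stop rides on device_state via a new DIRTY_TRACK bit; the
   * read & clear ioctl is only accepted while NDMA or !RUNNING */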

Scenarios:

a. If userspace has a mlx5 and a mdev then it does a for loop as above to
   start and read the dirty bitmaps.

b. If userspace has only mlx5 it reads one bitmap

c. If userspace has mlx5 and some other PCI device it can activate
   mlx5 and leave the container alone. Suspend the PCI device early.
   Or directly give up on dirty track

For funneling through the container ioctls.. Humm:
 - Still track per device/container connection if internal/external
   dirty track is supported. Add an new ioctl so userspace can have
   this info for (c)
 - For external dirty track have some ops callbacks for start/stop,
   read bitmap and clear. (So, no migration state flag?)
 - On start make the policy choice if the external/internal will be
   used, then negotiate some uniform tracking size and iterate over
   all externals to call start
 - On read.. to avoid overheads iterate over the internal bitmap
   and read ops on all external bitmaps and OR them together
   then copy to user. Just ignore NDMA and rely on userspace to
   do it right?
 - Clear iterates and zeros bitmaps
 - Change logic to not allocate tracking bitmaps if no mdevs

a. works the same, kernel turns on both trackers
b. works the same, kernel turns on only mlx5
c. Hum. We have to disable the 'no tracker report everything as
   dirty' feature somehow so we can access only the mlx5 tracker
   without forcing every page to be seen as dirty. Needs details.

And figure out how this relates to the ongoing iommufd project (which,
BTW, we have now invested a lot in too).

I'm not as convinced as you are that the 2nd is obviously better, and
on principle I don't like avoidable policy choices baked in the
kernel.

And I don't see that it makes userspace really much simpler anyhow. On
the other hand it looks like a lot more kernel work..

> OTOH, aggregating these features in the IOMMU reduces both overhead
> of per device bitmaps and user operations to create their own
> consolidated view.

I don't understand this, funneling will not reduce overhead, at best
with some work we can almost break even by not allocating the SW
bitmaps.

Jason
Alex Williamson Nov. 18, 2021, 6:15 p.m. UTC | #69
On Tue, 16 Nov 2021 21:48:31 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote:
> > On Tue, 16 Nov 2021 15:25:05 -0400
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:
> > >   
> > > > > I think userspace should decide if it wants to use mlx5 built in or
> > > > > the system IOMMU to do dirty tracking.    
> > > > 
> > > > What information does userspace use to inform such a decision?    
> > > 
> > > Kernel can't know which approach performs better. Operators should
> > > benchmark and make a choice for their deployment HW. Maybe device
> > > tracking severely impacts device performance or vice versa.  
> > 
> > I'm all for keeping policy decisions out of the kernel, but it's pretty
> > absurd to expect a userspace operator to benchmark various combination
> > and wire various knobs through the user interface for this.   
> 
> Huh? This is the standard operating procedure in netdev land, mm and
> others. Max performance requires tunables. And here we really do care
> alot about migration time. I've seen the mm complexity supporting the
> normal vCPU migration to know this is a high priority for many people.

Per previous reply:

"Maybe start with what uAPI visible knobs really make sense
and provide a benefit for per-device dirty bitmap tuning and how a
device agnostic userspace like QEMU is going to make intelligent
decisions about those knobs."

> > It seems to me that the kernel, ie. the vfio variant driver, *can*
> > know the best default.    
> 
> If the kernel can have an algorithm to find the best default then qemu
> can implement the same algorithm too. I'm also skeptical about this
> claim the best is knowable.

QEMU is device agnostic and has no visibility to the system IOMMU
capabilities.
 
> > > Kernel doesn't easily know what userspace has done, maybe one device
> > > supports migration driver dirty tracking and one device does not.  
> > 
> > And that's exactly why the current type1 implementation exposes the
> > least common denominator to userspace, ie. pinned pages only if all
> > devices in the container have enabled this degree of granularity.  
> 
> The current implementation forces no dirty tracking. That is a policy
> choice that denies userspace the option to run one device with dirty
> track and simply suspend the other.

"Forces no dirty tracking"?  I think you're suggesting that if a device
within an IOMMU context doesn't support dirty tracking that we're
forced to always report all mappings within that context as dirty,
because for some reason we're not allowed to evolve how devices can
interact with a shared dirty context, but we are allowed to invent a new
per-device uAPI?

> > > So, I would like to see userspace control most of the policy aspects,
> > > including the dirty track provider.  
> > 
> > This sounds like device specific migration parameter tuning via a
> > devlink interface to me, tbh.  How would you propose a generic
> > vfio/iommufd interface to tune this sort of thing?  
> 
> As I said, if each page track provider has its own interface it is
> straightforward to make policy in userspace. The only tunable beyond
> that is the dirty page tracking granularity. That is already
> provided by userspace, but not as an parameter during START.
> 
> I don't see why we'd need something as complicated as devlink just
> yet.

The approaches aren't that different.  It seems like you want userspace
to be able to select which devices to get dirty page info from while
I'm suggesting that we can develop ways that the user can manage how
the device interacts with a shared dirty page state.  Per-device dirty
page state doesn't preclude a shared dirty state, we still expect that
any IOMMU interface is the source of truth when we have multiple
devices sharing an IOMMU context.

What if we have a scenario where devices optionally have per device
dirty page tracking.

A) mlx5 w/ device level dirty page tracking, vGPU w/ page pinning

vs

B) mlx5 w/ device level dirty page tracking, NIC with system IOMMU
   dirty page tracking

Compare and contrast in which case the shared IOMMU context dirty page
tracking reports both device's versus only the devices without
per-device tracking.  Is this obvious to both userspace and the shared
dirty page tracking if it's the same or different in both cases?

> > > How does userspace know if dirty tracking works or not? All I see
> > > VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps.  
> > 
> > IIRC, it's always supported by type1.  In the worst case we always
> > report all mapped pages as dirty.  
> 
> Again this denies userspace a policy choice. Why do dirty tracking
> gyrations if they don't work? Just directly suspend the devices and
> then copy.

As above, it seems like you're freezing one interface and allowing the
other to evolve.  We can create ways that a device without dirty
tracking can be marked as not generating DMA within a shared dirty page
context.

If what you're really trying to get at all along is visibility into
the per-device dirty page contribution, that does sound interesting.
But I also don't know how that interacts with system IOMMU based
reporting.  Does the IOMMU report to the driver that reports to
userspace the per-device dirty pages?

> > > I'm surprised it doesn't check that only NO_IOMMU's devices are
> > > attached to the container and refuse to dirty track otherwise - since
> > > it doesn't work..  
> > 
> > No-IOMMU doesn't use type1, the ioctl returns errno.  
> 
> Sorry, I mistyped that, I ment emulated iommu, as HCH has called it:
> 
> vfio_register_emulated_iommu_dev()

Ok, so that's essentially the vfio IOMMU driver just keeping track of
userspace mappings in order to provide page pinning, where a device
supporting page pinning is our first step in more fine grained dirty
tracking.  Type1 keeps track of pages pinned across all devices sharing
the IOMMU container, thus we have container based dirty page reporting.
 
> > > When we mix dirty track with pre-copy, the progression seems to be:
> > > 
> > >   DITRY TRACKING | RUNNING
> > >      Copy every page to the remote
> > >   DT | SAVING | RUNNING
> > >      Copy pre-copy migration data to the remote
> > >   SAVING | NDMA | RUNNING
> > >      Read and clear dirty track device bitmap
> > >   DT | SAVING | RUNNING
> > >      Copy new dirtied data
> > >      (maybe loop back to NDMA a few times?)
> > >   SAVING | NDMA | RUNNING
> > >      P2P grace state
> > >   0
> > >     Read the dirty track and copy data
> > >     Read and send the migration state
> > > 
> > > Can we do something so complex using only SAVING?  
> > 
> > I'm not demanding that triggering device dirty tracking on saving is
> > how this must be done, I'm only stating that's an idea that was
> > discussed.  If we need more complicated triggers between the IOMMU and
> > device, let's define those, but I don't see that doing so negates the
> > benefits of aggregated dirty bitmaps in the IOMMU context.  
> 
> Okay. As far as your request to document things as we seem them
> upcoming I belive we should have some idea how dirty tracking control
> fits in. I agree that is not related to how the bitmap is reported. We
> will continue to think about dirty tracking as not connected to
> SAVING.

Ok.
 
> > > This creates inefficiencies in the kernel, we copy from the mlx5
> > > formed data structure to new memory in the iommu through a very
> > > ineffficent API and then again we do an ioctl to copy it once more and
> > > throw all the extra work away. It does not seem good for something
> > > where we want performance.  
> > 
> > So maybe the dirty bitmaps for the IOMMU context need to be exposed to
> > and directly modifiable by the drivers using atomic bitmap ops.  Maybe
> > those same bitmaps can be mmap'd to userspace.  These bitmaps are not
> > insignificant, do we want every driver managing their own copies?  
> 
> If we look at mlx5 for example there is no choice. The dirty log is in
> some device specific format and is not sharable. We must allocate
> memory to work with it.
> 
> What I don't need is the bitmap memory in the iommu container, that is
> all useless for mlx5.
> 
> So, down this path we need some API for the iommu context to not
> allocate its memory at all and refer to storage from the tracking
> provider for cases where that makes sense.

That depends on whether there are other devices in the context and if the
container dirty context is meant to include all devices or if the
driver is opting out of the shared tracking... if that's a thing.
Alternatively, drivers could register callbacks to report their dirty
pages into the shared tracking for ranges requested by the user.  We
need to figure out how per-device tracking and system IOMMU tracking get
along if that's where we're headed.
 
> > > Userspace has to do this anyhow if it has configurations with multiple
> > > containers. For instance because it was forced to split the containers
> > > due to one device not supporting NDMA.  
> > 
> > Huh?  When did that become a requirement?    
> 
> It is not a requirement, it is something userspace can do, if it
> wants. And we talked about this, if NDMA isn't supported the P2P can't
> work, and a way to enforce that is to not include P2P in the IOMMU
> mapping. Except if you have an asymmetric NDMA you may want an
> asymmetric IOMMU mapping too where the NDMA devices can do P2P and the
> others don't. That is two containers and two dirty bitmaps.

I'm not sure how I was supposed to infer that use case from "...forced
to split the containers due to one device not supporting NDMA".  That's
certainly an option, but not a requirement.  In fact, if QEMU were to
do something like that, then we have some devices that can do p2p and
some devices that cannot... all or none seems like a much more
deterministic choice for QEMU.  How do guest drivers currently test p2p?
 
> Who knows, it isn't the job of the kernel to make these choices, the
> kernel just provides tools.

Agreed, but I don't see that userspace choosing to use separate
contexts either negates the value of the kernel aggregating dirty pages
within a container or clearly makes the case for per-device dirty pages.

> > I feel like there are a lot of excuses listed here, but nothing that
> > really demands a per device interface,   
> 
> "excuses" and "NIH" is a bit rude Alex.
> 
> From my side, you are asking for a lot work from us (who else?) to
> define and implement a wack of missing kernel functionality.
> 
> I think it is very reasonable to ask what the return to the community
> is for this work. "makes more sense to me" is not something that
> is really compelling.
> 
> So, I'd rather you tell me concretely why doing this work, in this
> way, is a good idea.

On my end, we have a defined dirty page tracking interface at the IOMMU
context with some, admittedly rough, plans on how we intend to evolve
that interface for both device level and IOMMU level tracking feeding
into a shared dirty page interface.  The example scenarios presented
mostly seem to have solutions within that design framework if we put a
little effort into it.  There are unanswered questions down either
path, but one of those paths is the path we previously chose.  The
greater burden is to switch paths, so rather I need to understand why
this is the better path, with due diligence to explore what the same
looks like with the current design.

It's only finally here in the thread that we're seeing some of the mlx5
implementation details that might favor a per-device solution, hints
that per device page granularity might be useful, and maybe that
exposing per-device dirty page footprints to userspace is underlying
this change of course.

> > > If the migration driver says it supports tracking, then it only tracks
> > > DMA from that device.  
> > 
> > I don't see what this buys us.  Userspace is only going to do a
> > migration if all devices support the per device migration region.  At
> > that point we need the best representation of the dirty bitmap we can
> > provide per IOMMU context.  It makes sense to me to aggregate per
> > device dirtying into that one context.  
> 
> Again, there are policy choices userspace can make, like just
> suspending the device that doesn't do dirty tracking and continuing to
> dirty track the ones that do.
> 
> This might be very logical if a non-dirty tracking device is something
> like IDXD that will just cause higher request latency and the dirty
> tracking is mlx5 that will cause externally visable artifacts.
> 
> My point is *we don't know* what people will want.
> 
> I also think you are too focused on a one-size solution that fits into
> a qemu sort of enteprise product. While I can agree with your position
> relative to an enterprise style customer, NVIDIA has different
> customers, many that have their own customized VMMs that are tuned for
> their HW choices. For these customers I do like to see that the kernel
> allows options, and I don't think it is appropriate to be so
> dismissive of this perspective.

So provide the justification I asked for previously and quoted above,
what are the things we want to be able to tune that cannot be done
through reasonable extensions of the current design?  I'm not trying to
be dismissive, I'm lacking facts and evidence of due diligence that the
current design is incapable of meeting our goals.
 
> > > What I see is a lot of questions and limitations with this
> > > approach. If we stick to funneling everything through the iommu then
> > > answering the questions seem to create a large amount of kernel
> > > work. Enough to ask if it is worthwhile..  
> > 
> > If we need a common userspace IOMMU subsystem like IOMMUfd that can
> > handle driver page pinning, IOMMU faults, and dirty tracking, why does
> > it suddenly become an unbearable burden to allow other means besides
> > page pinning for a driver to relay DMA page writes?  
> 
> I can see some concrete reasons for iommufd, like it allows this code
> to be shared between mutliple subsystems that need it. Its design
> supports more functionality than the vfio container can.
> 
> Here, I don't quite see it. If userspace does
> 
>   for (i = 0; i != num_trackers; i++)
>      ioctl(merge dirty bitmap, i, &bitmap)
> 
> Or
>    ioctl(read diry bitmap, &bitmap)
> 
> Hardly seems decisive.

On the same order as "makes more sense to me" ;)

> What bothers me is the overhead and kernel
> complexity.
> 
> If we split them it looks quite simple:
>  - new ioctl on vfio device to read & clear dirty bitmap
>     & extension flag to show this is supported
>  - DIRTY_TRACK migration state bit to start/stop

Is this another device_state bit?

>    Simple logic that read is only possible in NDMA/!RUNNING
>  - new ioctl on vfio device to report dirty tracking capability flags:
>     - internal dirty track supported
>     - real dirty track through attached container supported
>       (only mdev can set this today)

How does system IOMMU dirty tracking work?
 
> Scenarios:
> 
> a. If userspace has a mlx5 and a mdev then it does a for loop as above to
>    start and read the dirty bitmaps.
> 
> b. If userspace has only mlx5 it reads one bitmap
> 
> c. If userspace has mlx5 and some other PCI device it can activate
>    mlx5 and leave the container alone. Suspend the PCI device early.
>    Or directly give up on dirty track
> 
> For funneling through the container ioctls.. Humm:
>  - Still track per device/container connection if internal/external
>    dirty track is supported. Add an new ioctl so userspace can have
>    this info for (c)
>  - For external dirty track have some ops callbacks for start/stop,
>    read bitmap and clear. (So, no migration state flag?)
>  - On start make the policy choice if the external/internal will be
>    used, then negotiate some uniform tracking size and iterate over
>    all externals to call start
>  - On read.. to avoid overheads iterate over the internal bitmap
>    and read ops on all external bitmaps and or them together
>    then copy to user. Just ignore NDMA and rely on userspace to
>    do it right?
>  - Clear iterates and zeros bitmaps
>  - Change logic to not allocate tracking bitmaps if no mdevs

I don't understand what's being described here, I'm glad an attempt is
being made to see what this might look like with the current interface,
but at the same time the outline seems biased towards a complicated
portrayal.

It's a relatively simple progression: with no outside information, the
container exposes all mapped pages as dirty.  Individual devices in the
container can expose themselves as enlightened in various ways,
initially we mark devices that pin pages as enlightened and when all
devices in the container are enlightened we can reduce the reported
bitmap.  As devices add their own dirty tracking we can add interfaces
to allow devices to register their status, mark dirty pages, and
potentially poll devices to update their dirty pages against ranges
requested by the user.  A device that's suspended, for example in PCI
D3 state, might mark itself as becoming enlightened and report no page
dirtying, a device ioctl might allow a more explicit instruction for
the same.  As IOMMU dirty page tracking comes online, the IOMMU can
essentially enlighten all the devices attached to the context.
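
Roughly - hypothetical, nothing below exists today - the sort of
registration that progression implies, so that devices beyond
page-pinners can participate in the shared tracking:

  /* Hypothetical vfio-core interface, sketch only */
  struct vfio_dirty_ops {
          /* device becomes "enlightened"; container stops assuming all-dirty */
          int (*start)(struct vfio_device *vdev, u64 pgsize);
          void (*stop)(struct vfio_device *vdev);
          /* on user GET_BITMAP: OR this device's dirty pages for
           * [iova, iova + size) into the container's shared bitmap */
          int (*sync)(struct vfio_device *vdev, u64 iova, u64 size,
                      unsigned long *shared_bitmap);
  };

  int vfio_register_dirty_ops(struct vfio_device *vdev,
                              const struct vfio_dirty_ops *ops);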
 
> a. works the same, kernel turns on both trackers
> b. works the same, kernel turns on only mlx5
> c. Hum. We have to disable the 'no tracker report everything as
>    dirty' feature somehow so we can access only the mlx5 tracker
>    without forcing evey page seen as dirty. Needs a details

Doesn't seem as complicated in my rendition.
 
> And figure out how this relates to the ongoing iommufd project (which,
> BTW, we have now invested a lot in too).
> 
> I'm not as convinced as you are that the 2nd is obviously better, and
> on principle I don't like avoidable policy choices baked in the
> kernel.

Userspace has the same degree of policy decision in my version; what's
missing from my version is userspace visibility into the per-device dirty
footprint and the newly mentioned desire for per-device page granularity.
 
> And I don't see that it makes userspace really much simpler anyhow. On
> the other hand it looks like a lot more kernel work..

There's kernel work either way.

> > OTOH, aggregating these features in the IOMMU reduces both overhead
> > of per device bitmaps and user operations to create their own
> > consolidated view.  
> 
> I don't understand this, funneling will not reduce overhead, at best
> with some work we can almost break even by not allocating the SW
> bitmaps.

The moment we have more than one device that requires a bitmap, where
the device doesn't have the visibility of the extent of the bitmap, we
introduce both space and time overhead versus a shared bitmap that can
be pre-allocated.

What's being asked for here is a change from the current plan of
record, that entails work and justification, including tangible
benefits versus a fair exploration of how the same might work in the
current design.

Anyway, as Connie mentioned off-list, we're deep into a comment thread
that's becoming more and more tangential to the original patch, I'm not
sure how many relevant contributors are still following this, and it's
probably best to propose a change in course to dirty bitmap tracking in
a new thread.  Thanks,

Alex
Jason Gunthorpe Nov. 22, 2021, 7:18 p.m. UTC | #70
On Thu, Nov 18, 2021 at 11:15:55AM -0700, Alex Williamson wrote:
> On Tue, 16 Nov 2021 21:48:31 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote:
> > > On Tue, 16 Nov 2021 15:25:05 -0400
> > > Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >   
> > > > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:
> > > >   
> > > > > > I think userspace should decide if it wants to use mlx5 built in or
> > > > > > the system IOMMU to do dirty tracking.    
> > > > > 
> > > > > What information does userspace use to inform such a decision?    
> > > > 
> > > > Kernel can't know which approach performs better. Operators should
> > > > benchmark and make a choice for their deployment HW. Maybe device
> > > > tracking severely impacts device performance or vice versa.  
> > > 
> > > I'm all for keeping policy decisions out of the kernel, but it's pretty
> > > absurd to expect a userspace operator to benchmark various combination
> > > and wire various knobs through the user interface for this.   
> > 
> > Huh? This is the standard operating procedure in netdev land, mm and
> > others. Max performance requires tunables. And here we really do care
> > alot about migration time. I've seen the mm complexity supporting the
> > normal vCPU migration to know this is a high priority for many people.
> 
> Per previous reply:
> 
> "Maybe start with what uAPI visible knobs really make sense
> and provide a benefit for per-device dirty bitmap tuning and how a
> device agnostic userspace like QEMU is going to make intelligent
> decisions about those knobs."

Well, the main knob is to pick which dirty tracker to use, and a
secondary knob is perhaps to configure it (e.g. granularity or something,
but I have nothing concrete right now).

How to do that in the current uapi? I don't know. Beyond some very
ugly 'do not use system iommu' flag that doesn't fit every need, I
don't have any suggestion.

IMHO, generic qemu is fine to do something simple like always prefer
the system IOMMU tracker.

> QEMU is device agnostic and has no visibility to the system IOMMU
> capabilities.

Ah, there is a lot going on here. We are adding iommufd and iommufd's
design has a specific place to expose the iommu_domain, it's unique
capabilities, and uAPI visible connections from the devices. This is
the big uAPI design upgrade that is needed to get to all the features
we want to see on the iommu side.

For what we have now it is quite easy to add a few ioctls for dirty
track query/start/stop/read&clear that wire directly 1:1 to an ops on
the iommu_domain.

So, qemu does a simple 'query' to the iommu_domain, sees dirty track
is supported and doesn't even do anything device specific. If that fails
then it queries each vfio_device for a tracker, if that fails it
operates with no dirty track.

Since each domain and device is a uAPI object all policy is trivially
in userspace hands with a simple uAPI design.
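
Roughly, from the VMM side (pseudocode - the ioctl names below are
placeholders, none of them exist yet):

  /* Placeholder names, illustration of the policy flow only */
  if (!ioctl(iommufd, IOMMUFD_DIRTY_QUERY, &cap) &&
      (cap.flags & IOMMUFD_DIRTY_SUPPORTED)) {
          use_system_iommu_tracker();          /* simple generic default */
  } else {
          bool all = true;

          for (i = 0; i < ndevices; i++)
                  all &= device_has_internal_tracker(device_fd[i]);
          if (all)
                  use_per_device_trackers();   /* start/read each device fd */
          else
                  stop_and_copy_without_dirty_tracking();
  }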

> > The current implementation forces no dirty tracking. That is a policy
> > choice that denies userspace the option to run one device with dirty
> > track and simply suspend the other.
> 
> "Forces no dirty tracking"?  I think you're suggesting that if a device
> within an IOMMU context doesn't support dirty tracking that we're
> forced to always report all mappings within that context as dirty,
> because for some reason we're not allowed to evolve how devices can
> interact with a shared dirty context, but we are allowed to invent a new
> per-device uAPI?

Basically yes. If we are changing the uAPI then let's change it to
something that makes sense for the HW implementations we actually
have. If you have a clever idea how to do this with the current API
I'm interested to hear, but I don't have anything in mind right now
that isn't much more complicated for everyone.

> What if we have a scenario where devices optionally have per device
> dirty page tracking.
> 
> A) mlx5 w/ device level dirty page tracking, vGPU w/ page pinning
> 
> vs
> 
> B) mlx5 w/ device level dirty page tracking, NIC with system IOMMU
>    dirty page tracking
> 
> Compare and contrast in which case the shared IOMMU context dirty page
> tracking reports both device's versus only the devices without
> per-device tracking.  Is this obvious to both userspace and the shared
> dirty page tracking if it's the same or different in both cases?

Neither A nor B have 'shared dirty page tracking'

The only case where we have some kind of shared tracker is when there
are multiple mdevs with migration support, and the mdev HW can support
a shared bitmap.

In iommufd we still have a reasonable place to put a shared mdev dirty
tracker - the "emulated iommu" is also exposed as a uAPI object that can
hold it - however I wouldn't want to implement that in iommufd until
we have some intree drivers that need it.

All the out of tree mdev drivers can use the device specific ioctl. It
only means they can't share dirty bitmaps. Which I think is a
reasonable penalty for out of tree drivers.

> If maybe what you're really trying to get at all along is visibility to
> the per-device dirty page contribution, that does sound interesting.

Visibility and control, yes. Those are the requests I've had from our
internal teams.

> But I also don't know how that interacts with system IOMMU based
> reporting.  Does the IOMMU report to the driver that reports to
> userspace the per-device dirty pages?

No, the system IOMMU reports through iommufd on the
iommu_domain. Completely unconnected from the migration driver.

Completely unconnected is the design simplification because I don't
need to make kernel infrastructure to build that connection and uAPI
to manage it explicitly.

> That depends whether there are other devices in the context and if the
> container dirty context is meant to include all devices or if the
> driver is opt'ing out of the shared tracking... 

Right now, *today*, I know of nothing that needs or can make good use
of shared state bitmaps.

> Alternatively, drivers could register callbacks to report their dirty
> pages into the shared tracking for ranges requested by the user.  We
> need to figure out how per-device tracking an system IOMMU tracking get
> along if that's where we're headed.

I don't want shared tracking, it is a waste of CPU and memory
resources. With a single API the only thing that would be OK is no
shared state and kernel iterates over all the trackers. This is quite
far from what is there right now.

> do something like that, then we have some devices that can do p2p and
> some devices that cannot... all or none seems like a much more
> deterministic choice for QEMU.  How do guest drivers currently test p2p?

There is a 'p2p allowed' call inside the kernel pci_p2p layer. It
could conceivably be extended to consult some firmware table provided
by the hypervisor. It is another one of these cases, like IMS, where
guest transparency is not possible :(

However, you can easily see this need arising - e.g. a GPU is
frequently a P2P target but rarely a P2P initiator.

It is another case where I can see people making custom VMMs deviating
from what makes sense in enterprise qemu.

> Agreed, but I don't see that userspace choosing to use separate
> contexts either negates the value of the kernel aggregating dirty pages
> within a container or clearly makes the case for per-device dirty pages.

I bring it up because I don't see a clear way to support it at all
with the current API design. Happy to hear ideas

> It's only finally here in the thread that we're seeing some of the mlx5
> implementation details that might favor a per-device solution, hints
> that per device page granularity might be useful, and maybe that
> exposing per-device dirty page footprints to userspace is underlying
> this change of course.

Well, yes, I mean we've thought about this quite a lot internally, we
are not suggesting things randomly for "NIH" as you said. We have lots
of different use cases, several customers, multiple devices and more.

> So provide the justification I asked for previously and quoted above,
> what are the things we want to be able to tune that cannot be done
> through reasonable extensions of the current design?  I'm not trying to
> be dismissive, I'm lacking facts and evidence of due diligence that the
> current design is incapable of meeting our goals.

It is not a question of capable or not, it is ROI. I'm sure we can hack the
current design into something, with a lot of work and complexity.

I'm saying we see a much simpler version that has a simpler kernel
side.

> > If we split them it looks quite simple:
> >  - new ioctl on vfio device to read & clear dirty bitmap
> >     & extension flag to show this is supported
> >  - DIRTY_TRACK migration state bit to start/stop
> 
> Is this another device_state bit?

Probably, or a device ioctl - semantically the same

> >    Simple logic that read is only possible in NDMA/!RUNNING
> >  - new ioctl on vfio device to report dirty tracking capability flags:
> >     - internal dirty track supported
> >     - real dirty track through attached container supported
> >       (only mdev can set this today)
> 
> How does system IOMMU dirty tracking work?

As above, it is parallel and completely contained in iommufd.

It is split: iommufd only does system iommu tracking, the device FD
only does device-integral tracking. They never cross interfaces
internally, so I don't need to make a kernel framework to support,
aggregate and control multiple dirty trackers.

> I don't understand what's being described here, I'm glad an attempt is
> being made to see what this might look like with the current interface,
> but at the same time the outline seems biased towards a complicated
> portrayal.

Well, not understanding is a problem :|
 
> > a. works the same, kernel turns on both trackers
> > b. works the same, kernel turns on only mlx5
> > c. Hum. We have to disable the 'no tracker report everything as
> >    dirty' feature somehow so we can access only the mlx5 tracker
> >    without forcing evey page seen as dirty. Needs a details
> 
> Doesn't seem as complicated in my rendition.

I don't see your path through the code.

> > And I don't see that it makes userspace really much simpler anyhow. On
> > the other hand it looks like a lot more kernel work..
> 
> There's kernel work either way.

Yes, more vs less :)
 
> > > OTOH, aggregating these features in the IOMMU reduces both overhead
> > > of per device bitmaps and user operations to create their own
> > > consolidated view.  
> > 
> > I don't understand this, funneling will not reduce overhead, at best
> > with some work we can almost break even by not allocating the SW
> > bitmaps.
> 
> The moment we have more than one device that requires a bitmap, where
> the device doesn't have the visibility of the extent of the bitmap, we
> introduce both space and time overhead versus a shared bitmap that can
> be pre-allocated.

The HW designs we can see right now: mlx5, AMD IOMMU, and hns IOMMU
(though less clear on this one) track the full IOVA space, they store
the tracking data in their own memory and allow reading each page's dirty
bit with an 'atomic test and clear' semantic.

So currently, we have *zero* devices that require the shared bitmap.

Why should I invest in maintaining and extending this code that has no
current user and no known future purpose?
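
i.e. the consumer can walk the HW log straight into the user supplied
bitmap - something like this (hypothetical tracker op, shown only to
illustrate why no intermediate kernel-side bitmap is needed):

  for (iova = start; iova < start + len; iova += pgsize) {
          /* HW/FW returns the dirty bit and clears it in one operation */
          if (tracker->test_and_clear_dirty(tracker, iova, pgsize))
                  set_bit((iova - start) / pgsize, user_bitmap);
  }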

> What's being asked for here is a change from the current plan of
> record, that entails work and justification, including tangible
> benefits versus a fair exploration of how the same might work in the
> current design.

So, we will make patches for our plan to split them.

We'll show an mlx5 implementation using an ioctl on the vfio device.

I'll sketch how a system iommu works through iommu_domain ops in
iommufd (my goal is to get an AMD implementation at least too)

I can write a paragraph on how to fit a shared mdev bitmap tracker into
iommufd if an in-tree user comes...

Can someone provide an equally complete view of how to extend the
current API and solve the same set of use cases?

We can revisit this in a month or so when we might have an mlx5 patch.

Jason
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index abdcbcfef73d..e824bfab4a01 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19699,6 +19699,12 @@  L:	kvm@vger.kernel.org
 S:	Maintained
 F:	drivers/vfio/platform/
 
+VFIO MLX5 PCI DRIVER
+M:	Yishai Hadas <yishaih@nvidia.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	drivers/vfio/pci/mlx5/
+
 VGA_SWITCHEROO
 R:	Lukas Wunner <lukas@wunner.de>
 S:	Maintained
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 860424ccda1b..187b9c259944 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -43,4 +43,7 @@  config VFIO_PCI_IGD
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
 endif
+
+source "drivers/vfio/pci/mlx5/Kconfig"
+
 endif
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 349d68d242b4..ed9d6f2e0555 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -7,3 +7,5 @@  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
 vfio-pci-y := vfio_pci.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+
+obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
new file mode 100644
index 000000000000..119712656400
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -0,0 +1,10 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+config MLX5_VFIO_PCI
+	tristate "VFIO support for MLX5 PCI devices"
+	depends on MLX5_CORE
+	select VFIO_PCI_CORE
+	help
+	  This provides migration support for MLX5 devices using the VFIO
+	  framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile
new file mode 100644
index 000000000000..689627da7ff5
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Makefile
@@ -0,0 +1,4 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o
+mlx5-vfio-pci-y := main.o cmd.o
+
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
new file mode 100644
index 000000000000..621b7fc60544
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -0,0 +1,696 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/vfio_pci_core.h>
+
+#include "cmd.h"
+
+enum {
+	MLX5VF_PCI_FREEZED = 1 << 0,
+};
+
+enum {
+	MLX5VF_REGION_PENDING_BYTES = 1 << 0,
+	MLX5VF_REGION_DATA_SIZE = 1 << 1,
+};
+
+enum {
+	MLX5VF_SUPPORTED_DEVICE_STATES = VFIO_DEVICE_STATE_RUNNING |
+					 VFIO_DEVICE_STATE_SAVING |
+					 VFIO_DEVICE_STATE_RESUMING,
+};
+
+#define MLX5VF_MIG_REGION_DATA_SIZE SZ_128K
+/* Data section offset from migration region */
+#define MLX5VF_MIG_REGION_DATA_OFFSET                                          \
+	(sizeof(struct vfio_device_migration_info))
+
+#define VFIO_DEVICE_MIGRATION_OFFSET(x)                                        \
+	(offsetof(struct vfio_device_migration_info, x))
+
+struct mlx5vf_pci_migration_info {
+	u32 vfio_dev_state; /* VFIO_DEVICE_STATE_XXX */
+	u32 dev_state; /* device migration state */
+	u32 region_state; /* Use MLX5VF_REGION_XXX */
+	u16 vhca_id;
+	struct mlx5_vhca_state_data vhca_state_data;
+};
+
+struct mlx5vf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	u8 migrate_cap:1;
+	/* protect migration state */
+	struct mutex state_mutex;
+	struct mlx5vf_pci_migration_info vmig;
+};
+
+static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
+{
+	return mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev,
+				      mvdev->vmig.vhca_id,
+				      MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_MASTER);
+}
+
+static int mlx5vf_pci_quiesce_device(struct mlx5vf_pci_core_device *mvdev)
+{
+	return mlx5vf_cmd_suspend_vhca(
+		mvdev->core_device.pdev, mvdev->vmig.vhca_id,
+		MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_MASTER);
+}
+
+static int mlx5vf_pci_unfreeze_device(struct mlx5vf_pci_core_device *mvdev)
+{
+	int ret;
+
+	ret = mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev,
+				     mvdev->vmig.vhca_id,
+				     MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_SLAVE);
+	if (ret)
+		return ret;
+
+	mvdev->vmig.dev_state &= ~MLX5VF_PCI_FREEZED;
+	return 0;
+}
+
+static int mlx5vf_pci_freeze_device(struct mlx5vf_pci_core_device *mvdev)
+{
+	int ret;
+
+	ret = mlx5vf_cmd_suspend_vhca(
+		mvdev->core_device.pdev, mvdev->vmig.vhca_id,
+		MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_SLAVE);
+	if (ret)
+		return ret;
+
+	mvdev->vmig.dev_state |= MLX5VF_PCI_FREEZED;
+	return 0;
+}
+
+static int mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
+{
+	u32 state_size = 0;
+	int ret;
+
+	if (!(mvdev->vmig.dev_state & MLX5VF_PCI_FREEZED))
+		return -EFAULT;
+
+	/* If we already read state no reason to re-read */
+	if (mvdev->vmig.vhca_state_data.state_size)
+		return 0;
+
+	ret = mlx5vf_cmd_query_vhca_migration_state(
+		mvdev->core_device.pdev, mvdev->vmig.vhca_id, &state_size);
+	if (ret)
+		return ret;
+
+	return mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev,
+					  mvdev->vmig.vhca_id, state_size,
+					  &mvdev->vmig.vhca_state_data);
+}
+
+static int mlx5vf_pci_new_write_window(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data;
+	u32 num_pages_needed;
+	u64 allocated_ready;
+	u32 bytes_needed;
+
+	/* Check how many bytes are available from previous flows */
+	WARN_ON(state_data->num_pages * PAGE_SIZE <
+		state_data->win_start_offset);
+	allocated_ready = (state_data->num_pages * PAGE_SIZE) -
+			  state_data->win_start_offset;
+	WARN_ON(allocated_ready > MLX5VF_MIG_REGION_DATA_SIZE);
+
+	bytes_needed = MLX5VF_MIG_REGION_DATA_SIZE - allocated_ready;
+	if (!bytes_needed)
+		return 0;
+
+	num_pages_needed = DIV_ROUND_UP_ULL(bytes_needed, PAGE_SIZE);
+	return mlx5vf_add_migration_pages(state_data, num_pages_needed);
+}
+
+static ssize_t
+mlx5vf_pci_handle_migration_data_size(struct mlx5vf_pci_core_device *mvdev,
+				      char __user *buf, bool iswrite)
+{
+	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
+	u64 data_size;
+	int ret;
+
+	if (iswrite) {
+		/* data_size is writable only during resuming state */
+		if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_RESUMING)
+			return -EINVAL;
+
+		ret = copy_from_user(&data_size, buf, sizeof(data_size));
+		if (ret)
+			return -EFAULT;
+
+		vmig->vhca_state_data.state_size += data_size;
+		vmig->vhca_state_data.win_start_offset += data_size;
+		ret = mlx5vf_pci_new_write_window(mvdev);
+		if (ret)
+			return ret;
+
+	} else {
+		if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_SAVING)
+			return -EINVAL;
+
+		data_size = min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE,
+				  vmig->vhca_state_data.state_size -
+				  vmig->vhca_state_data.win_start_offset);
+		ret = copy_to_user(buf, &data_size, sizeof(data_size));
+		if (ret)
+			return -EFAULT;
+	}
+
+	vmig->region_state |= MLX5VF_REGION_DATA_SIZE;
+	return sizeof(data_size);
+}
+
+static ssize_t
+mlx5vf_pci_handle_migration_data_offset(struct mlx5vf_pci_core_device *mvdev,
+					char __user *buf, bool iswrite)
+{
+	static const u64 data_offset = MLX5VF_MIG_REGION_DATA_OFFSET;
+	int ret;
+
+	/* RO field */
+	if (iswrite)
+		return -EFAULT;
+
+	ret = copy_to_user(buf, &data_offset, sizeof(data_offset));
+	if (ret)
+		return -EFAULT;
+
+	return sizeof(data_offset);
+}
+
+static ssize_t
+mlx5vf_pci_handle_migration_pending_bytes(struct mlx5vf_pci_core_device *mvdev,
+					  char __user *buf, bool iswrite)
+{
+	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
+	u64 pending_bytes;
+	int ret;
+
+	/* RO field */
+	if (iswrite)
+		return -EFAULT;
+
+	if (vmig->vfio_dev_state == (VFIO_DEVICE_STATE_SAVING |
+				     VFIO_DEVICE_STATE_RUNNING)) {
+		/* In pre-copy state we have no data to return for now,
+		 * return 0 pending bytes
+		 */
+		pending_bytes = 0;
+	} else {
+		if (!vmig->vhca_state_data.state_size)
+			return 0;
+		pending_bytes = vmig->vhca_state_data.state_size -
+				vmig->vhca_state_data.win_start_offset;
+	}
+
+	ret = copy_to_user(buf, &pending_bytes, sizeof(pending_bytes));
+	if (ret)
+		return -EFAULT;
+
+	/* Window moves forward once data from previous iteration was read */
+	if (vmig->region_state & MLX5VF_REGION_DATA_SIZE)
+		vmig->vhca_state_data.win_start_offset +=
+			min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, pending_bytes);
+
+	WARN_ON(vmig->vhca_state_data.win_start_offset >
+		vmig->vhca_state_data.state_size);
+
+	/* New iteration started */
+	vmig->region_state = MLX5VF_REGION_PENDING_BYTES;
+	return sizeof(pending_bytes);
+}
+
+static int mlx5vf_load_state(struct mlx5vf_pci_core_device *mvdev)
+{
+	if (!mvdev->vmig.vhca_state_data.state_size)
+		return 0;
+
+	return mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev,
+					  mvdev->vmig.vhca_id,
+					  &mvdev->vmig.vhca_state_data);
+}
+
+static void mlx5vf_reset_mig_state(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
+
+	vmig->region_state = 0;
+	mlx5vf_reset_vhca_state(&vmig->vhca_state_data);
+}
+
+static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
+				       u32 state)
+{
+	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
+	u32 old_state = vmig->vfio_dev_state;
+	int ret = 0;
+
+	if (old_state == VFIO_DEVICE_STATE_ERROR ||
+	    !VFIO_DEVICE_STATE_VALID(state) ||
+	    (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
+		return -EINVAL;
+
+	/* Running switches off */
+	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
+	    (old_state & VFIO_DEVICE_STATE_RUNNING)) {
+		ret = mlx5vf_pci_quiesce_device(mvdev);
+		if (ret)
+			return ret;
+		ret = mlx5vf_pci_freeze_device(mvdev);
+		if (ret) {
+			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
+			return ret;
+		}
+	}
+
+	/* Resuming switches off */
+	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
+	    (old_state & VFIO_DEVICE_STATE_RESUMING)) {
+		/* deserialize state into the device */
+		ret = mlx5vf_load_state(mvdev);
+		if (ret) {
+			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
+			return ret;
+		}
+	}
+
+	/* Resuming switches on */
+	if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) &&
+	    (state & VFIO_DEVICE_STATE_RESUMING)) {
+		mlx5vf_reset_mig_state(mvdev);
+		ret = mlx5vf_pci_new_write_window(mvdev);
+		if (ret)
+			return ret;
+	}
+
+	/* Saving switches on */
+	if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) &&
+	    (state & VFIO_DEVICE_STATE_SAVING)) {
+		if (!(state & VFIO_DEVICE_STATE_RUNNING)) {
+			/* serialize post copy */
+			ret = mlx5vf_pci_save_device_data(mvdev);
+			if (ret)
+				return ret;
+		}
+	}
+
+	/* Running switches on */
+	if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) &&
+	    (state & VFIO_DEVICE_STATE_RUNNING)) {
+		ret = mlx5vf_pci_unfreeze_device(mvdev);
+		if (ret)
+			return ret;
+		ret = mlx5vf_pci_unquiesce_device(mvdev);
+		if (ret) {
+			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
+			return ret;
+		}
+	}
+
+	vmig->vfio_dev_state = state;
+	return 0;
+}
+
+static ssize_t
+mlx5vf_pci_handle_migration_device_state(struct mlx5vf_pci_core_device *mvdev,
+					 char __user *buf, bool iswrite)
+{
+	size_t count = sizeof(mvdev->vmig.vfio_dev_state);
+	int ret;
+
+	if (iswrite) {
+		u32 device_state;
+
+		ret = copy_from_user(&device_state, buf, count);
+		if (ret)
+			return -EFAULT;
+
+		ret = mlx5vf_pci_set_device_state(mvdev, device_state);
+		if (ret)
+			return ret;
+	} else {
+		ret = copy_to_user(buf, &mvdev->vmig.vfio_dev_state, count);
+		if (ret)
+			return -EFAULT;
+	}
+
+	return count;
+}
+
+static ssize_t
+mlx5vf_pci_copy_user_data_to_device_state(struct mlx5vf_pci_core_device *mvdev,
+					  char __user *buf, size_t count,
+					  u64 offset)
+{
+	struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data;
+	char __user *from_buff = buf;
+	u32 curr_offset;
+	u32 win_page_offset;
+	u32 copy_count;
+	struct page *page;
+	char *to_buff;
+	int ret;
+
+	curr_offset = state_data->win_start_offset + offset;
+
+	do {
+		page = mlx5vf_get_migration_page(&state_data->mig_data,
+						 curr_offset);
+		if (!page)
+			return -EINVAL;
+
+		win_page_offset = curr_offset % PAGE_SIZE;
+		copy_count = min_t(u32, PAGE_SIZE - win_page_offset, count);
+
+		to_buff = kmap_local_page(page);
+		ret = copy_from_user(to_buff + win_page_offset, from_buff,
+				     copy_count);
+		kunmap_local(to_buff);
+		if (ret)
+			return -EFAULT;
+
+		from_buff += copy_count;
+		curr_offset += copy_count;
+		count -= copy_count;
+	} while (count > 0);
+
+	return 0;
+}
+
+static ssize_t
+mlx5vf_pci_copy_device_state_to_user(struct mlx5vf_pci_core_device *mvdev,
+				     char __user *buf, u64 offset, size_t count)
+{
+	struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data;
+	char __user *to_buff = buf;
+	u32 win_available_bytes;
+	u32 win_page_offset;
+	u32 copy_count;
+	u32 curr_offset;
+	char *from_buff;
+	struct page *page;
+	int ret;
+
+	win_available_bytes =
+		min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE,
+		      mvdev->vmig.vhca_state_data.state_size -
+			      mvdev->vmig.vhca_state_data.win_start_offset);
+
+	if (count + offset > win_available_bytes)
+		return -EINVAL;
+
+	curr_offset = state_data->win_start_offset + offset;
+
+	do {
+		page = mlx5vf_get_migration_page(&state_data->mig_data,
+						 curr_offset);
+		if (!page)
+			return -EINVAL;
+
+		win_page_offset = curr_offset % PAGE_SIZE;
+		copy_count = min_t(u32, PAGE_SIZE - win_page_offset, count);
+
+		from_buff = kmap_local_page(page);
+		ret = copy_to_user(to_buff, from_buff + win_page_offset,
+				   copy_count);
+		kunmap_local(from_buff);
+		if (ret)
+			return -EFAULT;
+
+		curr_offset += copy_count;
+		count -= copy_count;
+		to_buff += copy_count;
+	} while (count);
+
+	return 0;
+}
+
+static ssize_t
+mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev,
+			     char __user *buf, size_t count, u64 offset,
+			     bool iswrite)
+{
+	int ret;
+
+	if (offset + count > MLX5VF_MIG_REGION_DATA_SIZE)
+		return -EINVAL;
+
+	if (iswrite)
+		ret = mlx5vf_pci_copy_user_data_to_device_state(mvdev, buf,
+								count, offset);
+	else
+		ret = mlx5vf_pci_copy_device_state_to_user(mvdev, buf, offset,
+							   count);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev,
+				 char __user *buf, size_t count, loff_t *ppos,
+				 bool iswrite)
+{
+	struct mlx5vf_pci_core_device *mvdev =
+		container_of(vdev, struct mlx5vf_pci_core_device, core_device);
+	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	int ret;
+
+	mutex_lock(&mvdev->state_mutex);
+	/* Copy to/from the migration region data section */
+	if (pos >= MLX5VF_MIG_REGION_DATA_OFFSET) {
+		ret = mlx5vf_pci_migration_data_rw(
+			mvdev, buf, count, pos - MLX5VF_MIG_REGION_DATA_OFFSET,
+			iswrite);
+		goto end;
+	}
+
+	switch (pos) {
+	case VFIO_DEVICE_MIGRATION_OFFSET(device_state):
+		/* This is RW field. */
+		if (count != sizeof(mvdev->vmig.vfio_dev_state)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = mlx5vf_pci_handle_migration_device_state(mvdev, buf,
+							       iswrite);
+		break;
+	case VFIO_DEVICE_MIGRATION_OFFSET(pending_bytes):
+		/*
+		 * The number of pending bytes still to be migrated from the
+		 * vendor driver. This is RO field.
+		 * Reading this field indicates the start of a new iteration
+		 * to get device data.
+		 *
+		 */
+		ret = mlx5vf_pci_handle_migration_pending_bytes(mvdev, buf,
+								iswrite);
+		break;
+	case VFIO_DEVICE_MIGRATION_OFFSET(data_offset):
+		/*
+		 * The user application should read data_offset field from the
+		 * migration region. The user application should read the
+		 * device data from this offset within the migration region
+		 * during the _SAVING mode or write the device data during the
+		 * _RESUMING mode. This is RO field.
+		 */
+		ret = mlx5vf_pci_handle_migration_data_offset(mvdev, buf,
+							      iswrite);
+		break;
+	case VFIO_DEVICE_MIGRATION_OFFSET(data_size):
+		/*
+		 * The user application should read data_size to get the size
+		 * in bytes of the data copied to the migration region during
+		 * the _SAVING state by the device. The user application should
+		 * write the size in bytes of the data that was copied to
+		 * the migration region during the _RESUMING state by the user.
+		 * This is RW field.
+		 */
+		ret = mlx5vf_pci_handle_migration_data_size(mvdev, buf,
+							    iswrite);
+		break;
+	default:
+		ret = -EFAULT;
+		break;
+	}
+
+end:
+	mutex_unlock(&mvdev->state_mutex);
+	return ret;
+}
+
+static struct vfio_pci_regops migration_ops = {
+	.rw = mlx5vf_pci_mig_rw,
+};
+
+static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+	struct vfio_pci_core_device *vdev = &mvdev->core_device;
+	int vf_id;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	if (!mvdev->migrate_cap) {
+		vfio_pci_core_finish_enable(vdev);
+		return 0;
+	}
+
+	vf_id = pci_iov_vf_id(vdev->pdev);
+	if (vf_id < 0) {
+		ret = vf_id;
+		goto out_disable;
+	}
+
+	ret = mlx5vf_cmd_get_vhca_id(vdev->pdev, vf_id + 1,
+				     &mvdev->vmig.vhca_id);
+	if (ret)
+		goto out_disable;
+
+	ret = vfio_pci_register_dev_region(vdev, VFIO_REGION_TYPE_MIGRATION,
+					   VFIO_REGION_SUBTYPE_MIGRATION,
+					   &migration_ops,
+					   MLX5VF_MIG_REGION_DATA_OFFSET +
+					   MLX5VF_MIG_REGION_DATA_SIZE,
+					   VFIO_REGION_INFO_FLAG_READ |
+					   VFIO_REGION_INFO_FLAG_WRITE,
+					   NULL);
+	if (ret)
+		goto out_disable;
+
+	mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
+	vfio_pci_core_finish_enable(vdev);
+	return 0;
+out_disable:
+	vfio_pci_core_disable(vdev);
+	return ret;
+}
+
+static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = container_of(
+		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+	vfio_pci_core_close_device(core_vdev);
+	mlx5vf_reset_mig_state(mvdev);
+}
+
+static const struct vfio_device_ops mlx5vf_pci_ops = {
+	.name = "mlx5-vfio-pci",
+	.open_device = mlx5vf_pci_open_device,
+	.close_device = mlx5vf_pci_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+};
+
+static int mlx5vf_pci_probe(struct pci_dev *pdev,
+			    const struct pci_device_id *id)
+{
+	struct mlx5vf_pci_core_device *mvdev;
+	int ret;
+
+	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
+	if (!mvdev)
+		return -ENOMEM;
+	vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops);
+
+	if (pdev->is_virtfn) {
+		struct mlx5_core_dev *mdev =
+			mlx5_vf_get_core_dev(pdev);
+
+		if (mdev) {
+			if (MLX5_CAP_GEN(mdev, migration)) {
+				mvdev->migrate_cap = 1;
+				mutex_init(&mvdev->state_mutex);
+			}
+			mlx5_vf_put_core_dev(mdev);
+		}
+	}
+
+	ret = vfio_pci_core_register_device(&mvdev->core_device);
+	if (ret)
+		goto out_free;
+
+	dev_set_drvdata(&pdev->dev, mvdev);
+	return 0;
+
+out_free:
+	vfio_pci_core_uninit_device(&mvdev->core_device);
+	kfree(mvdev);
+	return ret;
+}
+
+static void mlx5vf_pci_remove(struct pci_dev *pdev)
+{
+	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
+
+	vfio_pci_core_unregister_device(&mvdev->core_device);
+	vfio_pci_core_uninit_device(&mvdev->core_device);
+	kfree(mvdev);
+}
+
+static const struct pci_device_id mlx5vf_pci_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
+	{}
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
+
+static struct pci_driver mlx5vf_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = mlx5vf_pci_table,
+	.probe = mlx5vf_pci_probe,
+	.remove = mlx5vf_pci_remove,
+	.err_handler = &vfio_pci_core_err_handlers,
+};
+
+static void __exit mlx5vf_pci_cleanup(void)
+{
+	pci_unregister_driver(&mlx5vf_pci_driver);
+}
+
+static int __init mlx5vf_pci_init(void)
+{
+	return pci_register_driver(&mlx5vf_pci_driver);
+}
+
+module_init(mlx5vf_pci_init);
+module_exit(mlx5vf_pci_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
+MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
+MODULE_DESCRIPTION(
+	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
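
For context, a minimal sketch of how userspace would drive this region
during one _SAVING iteration, following the v1 protocol in
uapi/linux/vfio.h. It assumes the migration region's file offset
(mig_off) was already discovered via VFIO_DEVICE_GET_REGION_INFO and
that device_state was already set; error handling omitted.

  #include <stddef.h>
  #include <unistd.h>
  #include <linux/vfio.h>

  /* Read pending_bytes, then data_offset/data_size, then the data itself */
  static ssize_t save_one_chunk(int device_fd, off_t mig_off, void *out)
  {
          __u64 pending, data_offset, data_size;

          pread(device_fd, &pending, sizeof(pending), mig_off +
                offsetof(struct vfio_device_migration_info, pending_bytes));
          if (!pending)
                  return 0;
          pread(device_fd, &data_offset, sizeof(data_offset), mig_off +
                offsetof(struct vfio_device_migration_info, data_offset));
          pread(device_fd, &data_size, sizeof(data_size), mig_off +
                offsetof(struct vfio_device_migration_info, data_size));
          return pread(device_fd, out, data_size, mig_off + data_offset);
  }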