diff mbox series

[V3,vfio,10/14] vfio/mlx5: Introduce vfio precopy ioctl implementation

Message ID 20221205144838.245287-11-yishaih@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add migration PRE_COPY support for mlx5 driver | expand

Commit Message

Yishai Hadas Dec. 5, 2022, 2:48 p.m. UTC
vfio precopy ioctl returns an estimation of data available for
transferring from the device.

Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current
state of the device, and if needed, append the dirty data to the
transfer FD data. This is done by saving a middle state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active SAVE command.
Running both in parallel might end up with a failure in the incremental
query command on an un-tracked vhca.

Also, a middle state will be saved only after the previous state has
finished its SAVE command and has been fully transferred; this prevents
unbounded use of resources.

Co-developed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/mlx5/cmd.c  |  16 +++++
 drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

Comments

Alex Williamson Dec. 5, 2022, 7:03 p.m. UTC | #1
On Mon, 5 Dec 2022 16:48:34 +0200
Yishai Hadas <yishaih@nvidia.com> wrote:

> vfio precopy ioctl returns an estimation of data available for
> transferring from the device.
> 
> Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current
> state of the device, and if needed, append the dirty data to the
> transfer FD data. This is done by saving a middle state.
> 
> As mlx5 runs the SAVE command asynchronously, make sure to query for
> incremental data only once there is no active save command.
> Running both in parallel, might end-up with a failure in the incremental
> query command on un-tracked vhca.
> 
> Also, a middle state will be saved only after the previous state has
> finished its SAVE command and has been fully transferred, this prevents
> endless use resources.
> 
> Co-developed-by: Shay Drory <shayd@nvidia.com>
> Signed-off-by: Shay Drory <shayd@nvidia.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> ---
>  drivers/vfio/pci/mlx5/cmd.c  |  16 +++++
>  drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++
>  2 files changed, 127 insertions(+)
> 
> diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
> index 160fa38fc78d..12e74ecebe64 100644
> --- a/drivers/vfio/pci/mlx5/cmd.c
> +++ b/drivers/vfio/pci/mlx5/cmd.c
> @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
>  {
>  	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
>  	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
> +	bool inc = query_flags & MLX5VF_QUERY_INC;
>  	int ret;
>  
>  	lockdep_assert_held(&mvdev->state_mutex);
>  	if (mvdev->mdev_detach)
>  		return -ENOTCONN;
>  
> +	/*
> +	 * In case PRE_COPY is used, saving_migf is exposed while device is
> +	 * running. Make sure to run only once there is no active save command.
> +	 * Running both in parallel, might end-up with a failure in the
> +	 * incremental query command on un-tracked vhca.
> +	 */
> +	if (inc) {
> +		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	MLX5_SET(query_vhca_migration_state_in, in, opcode,
>  		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
>  	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
> @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
>  
>  	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
>  				  out);
> +	if (inc)
> +		complete(&mvdev->saving_migf->save_comp);
> +
>  	if (ret)
>  		return ret;
>  
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> index 9a36e36ec33b..08c7d96e92b7 100644
> --- a/drivers/vfio/pci/mlx5/main.c
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
>  	wake_up_interruptible(&migf->poll_wait);
>  }
>  
> +static ssize_t mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
> +				    unsigned long arg)

ssize_t is incompatible with file_operations.unlocked_ioctl in 32-bit
builds (i386):

drivers/vfio/pci/mlx5/main.c:419:27: error: initialization of ‘long int (*)(struct file *, unsigned int,  long unsigned int)’ from incompatible pointer type ‘ssize_t (*)(struct file *, unsigned int,  long unsigned int)’ {aka ‘int (*)(struct file *, unsigned int,  long unsigned int)’} [-Werror=incompatible-pointer-types]
  419 |         .unlocked_ioctl = mlx5vf_precopy_ioctl,
      |                           ^~~~~~~~~~~~~~~~~~~~


Thanks,
Alex

>  static const struct file_operations mlx5vf_save_fops = {
>  	.owner = THIS_MODULE,
>  	.read = mlx5vf_save_read,
>  	.poll = mlx5vf_save_poll,
> +	.unlocked_ioctl = mlx5vf_precopy_ioctl,
> +	.compat_ioctl = compat_ptr_ioctl,
>  	.release = mlx5vf_release_file,
>  	.llseek = no_llseek,
>  };
diff mbox series

Patch

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 160fa38fc78d..12e74ecebe64 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -67,12 +67,25 @@  int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 {
 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+	bool inc = query_flags & MLX5VF_QUERY_INC;
 	int ret;
 
 	lockdep_assert_held(&mvdev->state_mutex);
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel, might end-up with a failure in the
+	 * incremental query command on un-tracked vhca.
+	 */
+	if (inc) {
+		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+		if (ret)
+			return ret;
+	}
+
 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
@@ -82,6 +95,9 @@  int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 
 	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
 				  out);
+	if (inc)
+		complete(&mvdev->saving_migf->save_comp);
+
 	if (ret)
 		return ret;
 
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 9a36e36ec33b..08c7d96e92b7 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -294,10 +294,121 @@  static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }
 
+/*
+ * VFIO_MIG_GET_PRECOPY_INFO handler of the saving migration file: reports the
+ * bytes still readable from the current file position and, once the stream
+ * was fully read and the device reports dirty state, issues a SAVE into a new
+ * buffer. Returns 0 or a negative errno (-EFAULT on a failed user copy).
+ */
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	struct mlx5_vhca_data_buffer *buf;
+	struct vfio_precopy_info info = {};
+	loff_t *pos = &filp->f_pos;
+	unsigned long minsz;
+	size_t inc_length = 0;
+	bool end_of_data;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		ret = -EINVAL;
+		goto err_state_unlock;
+	}
+
+	/*
+	 * No SAVE command can be issued while the device is suspended, so in
+	 * PRE_COPY_P2P there is no point querying for bytes that can't be read.
+	 * Once the query returns, no SAVE command is active, so the code below
+	 * is safe with the proper locks.
+	 */
+	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+							    MLX5VF_QUERY_INC);
+		if (ret)
+			goto err_state_unlock;
+	}
+
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR) {
+		ret = -ENODEV;
+		goto err_migf_unlock;
+	}
+
+	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+	if (buf) {
+		if (buf->start_pos == 0) {
+			info.initial_bytes = buf->header_image_size - *pos;
+		} else if (buf->start_pos ==
+				sizeof(struct mlx5_vf_migration_header)) {
+			/* First data buffer following the header */
+			info.initial_bytes = buf->start_pos +
+						buf->length - *pos;
+		} else {
+			info.dirty_bytes = buf->start_pos + buf->length - *pos;
+		}
+	} else {
+		if (!end_of_data) {
+			ret = -EINVAL;
+			goto err_migf_unlock;
+		}
+
+		info.dirty_bytes = inc_length;
+	}
+
+	mutex_unlock(&migf->lock);
+	if (!end_of_data || !inc_length)
+		goto done;
+
+	/*
+	 * The current state has been fully transferred and the device has a
+	 * dirty state, so save a new state to be read next.
+	 */
+	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		mlx5vf_put_data_buffer(buf);
+		goto err_state_unlock;
+	}
+
+done:
+	mlx5vf_state_mutex_unlock(mvdev);
+	/* copy_to_user() returns the number of uncopied bytes, not an errno */
+	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+err_migf_unlock:
+	mutex_unlock(&migf->lock);
+err_state_unlock:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return ret;
+}
+
 static const struct file_operations mlx5vf_save_fops = {
 	.owner = THIS_MODULE,
 	.read = mlx5vf_save_read,
 	.poll = mlx5vf_save_poll,
+	.unlocked_ioctl = mlx5vf_precopy_ioctl, /* VFIO_MIG_GET_PRECOPY_INFO only */
+	.compat_ioctl = compat_ptr_ioctl, /* presumably the 32-bit entry - verify */
 	.release = mlx5vf_release_file,
 	.llseek = no_llseek,
 };