diff mbox series

[vfio,08/13] vfio/mlx5: Introduce vfio precopy ioctl implementation

Message ID 20221106174630.25909-9-yishaih@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add migration PRE_COPY support for mlx5 driver | expand

Commit Message

Yishai Hadas Nov. 6, 2022, 5:46 p.m. UTC
From: Shay Drory <shayd@nvidia.com>

The vfio precopy ioctl returns an estimate of the data available for
transfer from the device.

Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current
state of the device, and if needed, append the dirty data to the
transfer FD data. This is done by saving a middle state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active save command.
Running both in parallel might end up with a failure of the incremental
query command on an untracked vhca.

Also, a middle state will be saved only after the previous state has
finished its SAVE command and has been fully transferred; this enables
reuse of the resources.

In order to map between FD position and the new saved state data, store
the current FD position.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/mlx5/cmd.c  |   9 +++
 drivers/vfio/pci/mlx5/cmd.h  |   1 +
 drivers/vfio/pci/mlx5/main.c | 131 +++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
diff mbox series

Patch

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index eb684455c2b2..2d2171191218 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -64,6 +64,15 @@  int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
+	/*
+	 * In case PRE_COPY is used, saving_migf is exposed while device is
+	 * running. Make sure to run only once there is no active save command.
+	 * Running both in parallel might end up with a failure of the
+	 * incremental query command on an untracked vhca.
+	 */
+	if (query_flags & MLX5VF_QUERY_INC)
+		wait_event(mvdev->saving_migf->save_wait,
+			   !mvdev->saving_migf->save_cb_active);
 	MLX5_SET(query_vhca_migration_state_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
 	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index c12fa81ba53f..07a2fc54c9d8 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -30,6 +30,7 @@  struct mlx5_vf_migration_file {
 	u8 save_cb_active:1;
 
 	struct sg_append_table table;
+	size_t table_start_pos;
 	size_t image_length;
 	size_t allocated_length;
 	/*
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 10e073c32ab1..266626066fed 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -107,6 +107,22 @@  static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
 	return ret;
 }
 
+static void mlx5vf_prep_next_table(struct mlx5_vf_migration_file *migf)
+{
+	struct sg_page_iter sg_iter;
+
+	lockdep_assert_held(&migf->lock);
+	migf->table_start_pos += migf->image_length;
+	/* clear sgtable, all data has been transferred */
+	for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
+		__free_page(sg_page_iter_page(&sg_iter));
+	sg_free_append_table(&migf->table);
+	memset(&migf->table, 0, sizeof(migf->table));
+	migf->image_length = 0;
+	migf->allocated_length = 0;
+	migf->last_offset_sg = NULL;
+}
+
 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 {
 	struct sg_page_iter sg_iter;
@@ -120,6 +136,7 @@  static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
 	migf->image_length = 0;
 	migf->allocated_length = 0;
 	migf->final_length = 0;
+	migf->table_start_pos = 0;
 	migf->filp->f_pos = 0;
 	for_each_sgtable_page(&migf->final_table.sgt, &sg_iter, 0)
 		__free_page(sg_page_iter_page(&sg_iter));
@@ -137,6 +154,13 @@  static int mlx5vf_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#define MIGF_TOTAL_DATA(migf) \
+	(migf->table_start_pos + migf->image_length + migf->final_length)
+
+#define VFIO_MIG_STATE_PRE_COPY(mvdev) \
+	(mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY || \
+	 mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY_P2P)
+
 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 			       loff_t *pos)
 {
@@ -230,10 +254,117 @@  static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }
 
+static ssize_t mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				    unsigned long arg)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	bool first_state, state_finish_transfer;
+	struct vfio_precopy_info info;
+	loff_t *pos = &filp->f_pos;
+	unsigned long minsz;
+	size_t inc_length;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mvdev->state_mutex);
+	if (!VFIO_MIG_STATE_PRE_COPY(migf->mvdev)) {
+		ret = -EINVAL;
+		goto err_state_unlock;
+	}
+
+	/*
+	 * We can't issue a SAVE command when the device is suspended, so as
+	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
+	 * bytes that can't be read.
+	 */
+	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		/*
+		 * Once the query returns it's guaranteed that there is no
+		 * active SAVE command.
+		 * As so, the other code below is safe with the proper locks.
+		 */
+		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+							    MLX5VF_QUERY_INC);
+		if (ret)
+			goto err_state_unlock;
+	}
+
+	mutex_lock(&migf->lock);
+	if (*pos > MIGF_TOTAL_DATA(migf)) {
+		ret = -EINVAL;
+		goto err_migf_unlock;
+	}
+
+	if (migf->disabled || migf->is_err) {
+		ret = -ENODEV;
+		goto err_migf_unlock;
+	}
+
+	first_state = migf->table_start_pos == 0;
+	if (first_state) {
+		info.initial_bytes = MIGF_TOTAL_DATA(migf) - *pos;
+		info.dirty_bytes = 0;
+	} else {
+		info.initial_bytes = 0;
+		info.dirty_bytes = MIGF_TOTAL_DATA(migf) - *pos;
+	}
+	state_finish_transfer = *pos == MIGF_TOTAL_DATA(migf);
+	if (!(state_finish_transfer && inc_length &&
+	      mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY)) {
+		mutex_unlock(&migf->lock);
+		goto done;
+	}
+
+	/*
+	 * We finished transferring the current state and the device has a
+	 * dirty state; save a new state so it is ready for transfer.
+	 */
+	mlx5vf_prep_next_table(migf);
+	ret = mlx5vf_add_migration_pages(migf,
+					 DIV_ROUND_UP_ULL(inc_length, PAGE_SIZE),
+					 &migf->table);
+	mutex_unlock(&migf->lock);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, true, true);
+	if (ret) {
+		mlx5vf_mark_err(migf);
+		goto err_state_unlock;
+	}
+
+	info.dirty_bytes += inc_length;
+
+done:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return copy_to_user((void __user *)arg, &info, minsz);
+
+err_migf_unlock:
+	mutex_unlock(&migf->lock);
+err_state_unlock:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return ret;
+}
+
 static const struct file_operations mlx5vf_save_fops = {
 	.owner = THIS_MODULE,
 	.read = mlx5vf_save_read,
 	.poll = mlx5vf_save_poll,
+	.unlocked_ioctl = mlx5vf_precopy_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
 	.release = mlx5vf_release_file,
 	.llseek = no_llseek,
 };