[vfio,2/3] vfio/mlx5: Improve the source side flow upon pre_copy

Message ID 20230124144955.139901-3-yishaih@nvidia.com
State New, archived
Series Few improvements in the migration area of mlx5 driver

Commit Message

Yishai Hadas Jan. 24, 2023, 2:49 p.m. UTC
Improve the source side flow upon pre_copy as described below.

- Prepare the stop_copy buffers as part of moving to pre_copy.
- Send the target a record that includes the expected stop_copy size,
  letting it optimize its stop_copy flow as well (the record layout is
  sketched below).
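
For reference, the record that carries this size is laid out on the
wire as below (a sketch derived from the structs added in this patch;
the offsets assume the natural, padding-free layout of
struct mlx5_vf_migration_header followed by
struct mlx5_vf_migration_tag_stop_copy_data):

  offset  0: __le64 record_size    = 8 (size of the data that follows)
  offset  8: __le32 flags          = MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL
  offset 12: __le32 tag            = MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE
  offset 16: __le64 stop_copy_size (expected stop_copy size, in bytes)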

To send the target this new record type (i.e.
MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE), we split the current 64 header
flags bits into 32 flags bits and another 32 tag bits. Each record now
carries a tag and a flag that marks it as optional or mandatory;
optional records are ignored by the target.
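
To illustrate the intended target-side semantics, a loader could
dispatch on the new tag/flags split roughly as follows (a minimal
sketch, not the actual load-state machine in main.c; load_fw_data(),
note_stop_copy_size() and skip_record() are hypothetical placeholders):

  static int handle_record(struct mlx5_vf_migration_header *hdr)
  {
          u64 record_size = le64_to_cpu(hdr->record_size);
          u32 flags = le32_to_cpu(hdr->flags);
          u32 tag = le32_to_cpu(hdr->tag);

          switch (tag) {
          case MLX5_MIGF_HEADER_TAG_FW_DATA:
                  /* Mandatory record: the device state image itself */
                  return load_fw_data(hdr->data, record_size);
          case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
                  /* Optional hint: pre-size the stop_copy buffers */
                  return note_stop_copy_size(hdr->data, record_size);
          default:
                  if (flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)
                          /* Unknown optional record: skip and continue */
                          return skip_record(record_size);
                  /* Unknown mandatory record: fail the load */
                  return -EOPNOTSUPP;
          }
  }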

The above reduces the downtime upon stop_copy, as the relevant data is
prepared ahead of time as part of pre_copy.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/mlx5/cmd.c  |  31 +++++---
 drivers/vfio/pci/mlx5/cmd.h  |  21 +++++-
 drivers/vfio/pci/mlx5/main.c | 133 +++++++++++++++++++++++++++++------
 3 files changed, 151 insertions(+), 34 deletions(-)

Patch

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index e956e79626b7..5161d845c478 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -500,7 +500,7 @@  void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
 }
 
 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
-			  size_t image_size)
+			  size_t image_size, bool initial_pre_copy)
 {
 	struct mlx5_vf_migration_file *migf = header_buf->migf;
 	struct mlx5_vf_migration_header header = {};
@@ -508,7 +508,9 @@  static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
 	struct page *page;
 	u8 *to_buff;
 
-	header.image_size = cpu_to_le64(image_size);
+	header.record_size = cpu_to_le64(image_size);
+	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
 	page = mlx5vf_get_migration_page(header_buf, 0);
 	if (!page)
 		return -EINVAL;
@@ -516,12 +518,13 @@  static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
 	memcpy(to_buff, &header, sizeof(header));
 	kunmap_local(to_buff);
 	header_buf->length = sizeof(header);
-	header_buf->header_image_size = image_size;
 	header_buf->start_pos = header_buf->migf->max_pos;
 	migf->max_pos += header_buf->length;
 	spin_lock_irqsave(&migf->list_lock, flags);
 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
 	spin_unlock_irqrestore(&migf->list_lock, flags);
+	if (initial_pre_copy)
+		migf->pre_copy_initial_bytes += sizeof(header);
 	return 0;
 }
 
@@ -535,11 +538,14 @@  static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 	if (!status) {
 		size_t image_size;
 		unsigned long flags;
+		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
+				!async_data->last_chunk;
 
 		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
 				      actual_image_size);
 		if (async_data->header_buf) {
-			status = add_buf_header(async_data->header_buf, image_size);
+			status = add_buf_header(async_data->header_buf, image_size,
+						initial_pre_copy);
 			if (status)
 				goto err;
 		}
@@ -549,6 +555,8 @@  static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 		spin_lock_irqsave(&migf->list_lock, flags);
 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
 		spin_unlock_irqrestore(&migf->list_lock, flags);
+		if (initial_pre_copy)
+			migf->pre_copy_initial_bytes += image_size;
 		migf->state = async_data->last_chunk ?
 			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
 		wake_up_interruptible(&migf->poll_wait);
@@ -610,11 +618,16 @@  int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	}
 
 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
-		header_buf = mlx5vf_get_data_buffer(migf,
-			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
-		if (IS_ERR(header_buf)) {
-			err = PTR_ERR(header_buf);
-			goto err_free;
+		if (async_data->last_chunk && migf->buf_header) {
+			header_buf = migf->buf_header;
+			migf->buf_header = NULL;
+		} else {
+			header_buf = mlx5vf_get_data_buffer(migf,
+				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+			if (IS_ERR(header_buf)) {
+				err = PTR_ERR(header_buf);
+				goto err_free;
+			}
 		}
 	}
 
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 657d94affe2b..8f1bef580028 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -32,10 +32,26 @@  enum mlx5_vf_load_state {
 	MLX5_VF_LOAD_STATE_LOAD_IMAGE,
 };
 
+struct mlx5_vf_migration_tag_stop_copy_data {
+	__le64 stop_copy_size;
+};
+
+enum mlx5_vf_migf_header_flags {
+	MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+	MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum mlx5_vf_migf_header_tag {
+	MLX5_MIGF_HEADER_TAG_FW_DATA = 0,
+	MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE = 1 << 0,
+};
+
 struct mlx5_vf_migration_header {
-	__le64 image_size;
+	__le64 record_size;
 	/* For future use in case we may need to change the kernel protocol */
-	__le64 flags;
+	__le32 flags; /* Use mlx5_vf_migf_header_flags */
+	__le32 tag; /* Use mlx5_vf_migf_header_tag */
+	__u8 data[]; /* Its size is given in the record_size */
 };
 
 struct mlx5_vhca_data_buffer {
@@ -73,6 +89,7 @@  struct mlx5_vf_migration_file {
 	enum mlx5_vf_load_state load_state;
 	u32 pdn;
 	loff_t max_pos;
+	u64 pre_copy_initial_bytes;
 	struct mlx5_vhca_data_buffer *buf;
 	struct mlx5_vhca_data_buffer *buf_header;
 	spinlock_t list_lock;
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 7ba127d8889a..6856e7b97533 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -304,6 +304,87 @@  static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }
 
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+{
+	size_t size = sizeof(struct mlx5_vf_migration_header) +
+		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
+	struct mlx5_vf_migration_tag_stop_copy_data data = {};
+	struct mlx5_vhca_data_buffer *header_buf = NULL;
+	struct mlx5_vf_migration_header header = {};
+	unsigned long flags;
+	struct page *page;
+	u8 *to_buff;
+	int ret;
+
+	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+	if (IS_ERR(header_buf))
+		return PTR_ERR(header_buf);
+
+	header.record_size = cpu_to_le64(sizeof(data));
+	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
+	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
+	page = mlx5vf_get_migration_page(header_buf, 0);
+	if (!page) {
+		ret = -EINVAL;
+		goto err;
+	}
+	to_buff = kmap_local_page(page);
+	memcpy(to_buff, &header, sizeof(header));
+	header_buf->length = sizeof(header);
+	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+	memcpy(to_buff + sizeof(header), &data, sizeof(data));
+	header_buf->length += sizeof(data);
+	kunmap_local(to_buff);
+	header_buf->start_pos = header_buf->migf->max_pos;
+	migf->max_pos += header_buf->length;
+	spin_lock_irqsave(&migf->list_lock, flags);
+	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+	spin_unlock_irqrestore(&migf->list_lock, flags);
+	migf->pre_copy_initial_bytes = size;
+	return 0;
+err:
+	mlx5vf_put_data_buffer(header_buf);
+	return ret;
+}
+
+static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
+				 size_t state_size)
+{
+	struct mlx5_vhca_data_buffer *buf;
+	size_t inc_state_size;
+	int ret;
+
+	/* let's be ready for a stop_copy size that might grow by 10 percent */
+	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+		inc_state_size = state_size;
+
+	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	migf->buf = buf;
+	buf = mlx5vf_get_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto err;
+	}
+
+	migf->buf_header = buf;
+	ret = mlx5vf_add_stop_copy_header(migf);
+	if (ret)
+		goto err_header;
+	return 0;
+
+err_header:
+	mlx5vf_put_data_buffer(migf->buf_header);
+	migf->buf_header = NULL;
+err:
+	mlx5vf_put_data_buffer(migf->buf);
+	migf->buf = NULL;
+	return ret;
+}
+
 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
 				 unsigned long arg)
 {
@@ -314,7 +395,7 @@  static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
 	loff_t *pos = &filp->f_pos;
 	unsigned long minsz;
 	size_t inc_length = 0;
-	bool end_of_data;
+	bool end_of_data = false;
 	int ret;
 
 	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
@@ -358,25 +439,19 @@  static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
 		goto err_migf_unlock;
 	}
 
-	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
-	if (buf) {
-		if (buf->start_pos == 0) {
-			info.initial_bytes = buf->header_image_size - *pos;
-		} else if (buf->start_pos ==
-				sizeof(struct mlx5_vf_migration_header)) {
-			/* First data buffer following the header */
-			info.initial_bytes = buf->start_pos +
-						buf->length - *pos;
-		} else {
-			info.dirty_bytes = buf->start_pos + buf->length - *pos;
-		}
+	if (migf->pre_copy_initial_bytes > *pos) {
+		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
 	} else {
-		if (!end_of_data) {
-			ret = -EINVAL;
-			goto err_migf_unlock;
+		buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+		if (buf) {
+			info.dirty_bytes = buf->start_pos + buf->length - *pos;
+		} else {
+			if (!end_of_data) {
+				ret = -EINVAL;
+				goto err_migf_unlock;
+			}
+			info.dirty_bytes = inc_length;
 		}
-
-		info.dirty_bytes = inc_length;
 	}
 
 	if (!end_of_data || !inc_length) {
@@ -441,10 +516,16 @@  static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
 	if (ret)
 		goto err;
 
-	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
-	if (IS_ERR(buf)) {
-		ret = PTR_ERR(buf);
-		goto err;
+	/* Checking whether we have a matching pre-allocated buffer that can fit */
+	if (migf->buf && migf->buf->allocated_length >= length) {
+		buf = migf->buf;
+		migf->buf = NULL;
+	} else {
+		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
 	}
 
 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@@ -503,6 +584,12 @@  mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 	if (ret)
 		goto out_pd;
 
+	if (track) {
+		ret = mlx5vf_prep_stop_copy(migf, length);
+		if (ret)
+			goto out_pd;
+	}
+
 	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
 	if (IS_ERR(buf)) {
 		ret = PTR_ERR(buf);
@@ -516,7 +603,7 @@  mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 out_save:
 	mlx5vf_free_data_buffer(buf);
 out_pd:
-	mlx5vf_cmd_dealloc_pd(migf);
+	mlx5fv_cmd_clean_migf_resources(migf);
 out_free:
 	fput(migf->filp);
 end: