
[vfio,6/9] vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase

Message ID: 20230911093856.81910-7-yishaih@nvidia.com
State: New, archived
Series: Add chunk mode support for mlx5 driver

Commit Message

Yishai Hadas Sept. 11, 2023, 9:38 a.m. UTC
This patch is another preparation step towards working in chunk mode.

It pre-allocates buffers for the STOP_COPY phase so the driver can use
them immediately and avoid an extra allocation during that phase.

Before this patch, a single large buffer was dedicated to the STOP_COPY
phase, as the source issued a single SAVE for the last image.

Once we move to chunk mode, the idea is to have several small buffers
that are used during the STOP_COPY phase.

The driver will read ahead the full state from the firmware in small,
optimized chunks, while letting QEMU/user space read the available data
in parallel.
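
As a rough illustration only (a sketch, not driver code; the SAVE
completions and user-space reads are simulated with prints), the intended
overlap reuses the pre-allocated buffers round-robin, with chunk numbers
starting at 1 as in this patch:

#include <stdio.h>

#define MAX_NUM_CHUNKS 2	/* matches the driver's pre-allocated set */

int main(void)
{
	int total_chunks = 4;	/* arbitrary example image */
	int i;

	for (i = 0; i < total_chunks; i++) {
		/* driver side: read ahead the next chunk from firmware */
		printf("SAVE chunk %d into buf[%d]\n",
		       i + 1, i % MAX_NUM_CHUNKS);
		/* user-space side: drain whatever already completed */
		if (i > 0)
			printf("  user space reads chunk %d in parallel\n", i);
	}
	return 0;
}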

Each buffer holds its chunk number so it can be recognized later in the
coming patches.

The chunk buffer size is chosen based on the minimum size that the
firmware requires, the total full size, and a maximum value in the
driver code, which was set to 8MB to achieve a reasonably optimized
downtime in the general case.
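
For illustration, here is a minimal user-space sketch of that size
selection, mirroring the logic added to mlx5vf_prep_stop_copy() below
(the helper name is made up for the example, and the clamp against the
device-specification max SAVE size is omitted):

#include <stdio.h>
#include <stdint.h>

#define MAX_CHUNK_SIZE (8ULL << 20)	/* SZ_8M in the driver */

/*
 * Hypothetical helper for illustration only: cap the chunk buffer at
 * MAX_CHUNK_SIZE (or the full image, if smaller), but never go below
 * the minimum 'state_size' that the firmware requires.
 */
static uint64_t pick_chunk_size(uint64_t state_size, uint64_t full_size)
{
	uint64_t chunk = full_size < MAX_CHUNK_SIZE ? full_size : MAX_CHUNK_SIZE;

	return state_size > chunk ? state_size : chunk;
}

int main(void)
{
	/* e.g. firmware minimum 1MB, full image 100MB -> 8MB chunks */
	printf("%llu\n",
	       (unsigned long long)pick_chunk_size(1ULL << 20, 100ULL << 20));
	return 0;
}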

As chunk mode is applicable even if we move directly to STOP_COPY, the
buffer preparation and other related work are done unconditionally with
regard to STOP/PRE-COPY.

Note:
At this point in the series, chunk mode is not yet activated, and the
first buffer is used in all places.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/mlx5/cmd.c  |  23 +++---
 drivers/vfio/pci/mlx5/cmd.h  |   8 +-
 drivers/vfio/pci/mlx5/main.c | 150 ++++++++++++++++++++++-------------
 3 files changed, 116 insertions(+), 65 deletions(-)

Patch

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 7b48a9b80bc6..b18735ee5d07 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -632,9 +632,9 @@  int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	}
 
 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
-		if (async_data->stop_copy_chunk && migf->buf_header) {
-			header_buf = migf->buf_header;
-			migf->buf_header = NULL;
+		if (async_data->stop_copy_chunk && migf->buf_header[0]) {
+			header_buf = migf->buf_header[0];
+			migf->buf_header[0] = NULL;
 		} else {
 			header_buf = mlx5vf_get_data_buffer(migf,
 				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -721,18 +721,21 @@  void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
 {
 	struct mlx5_vhca_data_buffer *entry;
+	int i;
 
 	lockdep_assert_held(&migf->mvdev->state_mutex);
 	WARN_ON(migf->mvdev->mdev_detach);
 
-	if (migf->buf) {
-		mlx5vf_free_data_buffer(migf->buf);
-		migf->buf = NULL;
-	}
+	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
+		if (migf->buf[i]) {
+			mlx5vf_free_data_buffer(migf->buf[i]);
+			migf->buf[i] = NULL;
+		}
 
-	if (migf->buf_header) {
-		mlx5vf_free_data_buffer(migf->buf_header);
-		migf->buf_header = NULL;
+		if (migf->buf_header[i]) {
+			mlx5vf_free_data_buffer(migf->buf_header[i]);
+			migf->buf_header[i] = NULL;
+		}
 	}
 
 	list_splice(&migf->avail_list, &migf->buf_list);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index ac5dca5fe6b1..6d8d52804c83 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -64,6 +64,7 @@  struct mlx5_vhca_data_buffer {
 	u32 mkey;
 	enum dma_data_direction dma_dir;
 	u8 dmaed:1;
+	u8 stop_copy_chunk_num;
 	struct list_head buf_elm;
 	struct mlx5_vf_migration_file *migf;
 	/* Optimize mlx5vf_get_migration_page() for sequential access */
@@ -82,6 +83,8 @@  struct mlx5vf_async_data {
 	void *out;
 };
 
+#define MAX_NUM_CHUNKS 2
+
 struct mlx5_vf_migration_file {
 	struct file *filp;
 	struct mutex lock;
@@ -94,8 +97,9 @@  struct mlx5_vf_migration_file {
 	u32 record_tag;
 	u64 stop_copy_prep_size;
 	u64 pre_copy_initial_bytes;
-	struct mlx5_vhca_data_buffer *buf;
-	struct mlx5_vhca_data_buffer *buf_header;
+	/* Upon chunk mode preserve another set of buffers for stop_copy phase */
+	struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
+	struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
 	spinlock_t list_lock;
 	struct list_head buf_list;
 	struct list_head avail_list;
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 90cb36fee6c0..351b61303b72 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -24,6 +24,8 @@ 
 /* Device specification max LOAD size */
 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
 
+#define MAX_CHUNK_SIZE SZ_8M
+
 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
 {
 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@ -304,7 +306,8 @@  static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
 	wake_up_interruptible(&migf->poll_wait);
 }
 
-static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
+				       bool track)
 {
 	size_t size = sizeof(struct mlx5_vf_migration_header) +
 		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
@@ -331,7 +334,7 @@  static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
 	to_buff = kmap_local_page(page);
 	memcpy(to_buff, &header, sizeof(header));
 	header_buf->length = sizeof(header);
-	data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
 	memcpy(to_buff + sizeof(header), &data, sizeof(data));
 	header_buf->length += sizeof(data);
 	kunmap_local(to_buff);
@@ -340,48 +343,83 @@  static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
 	spin_lock_irqsave(&migf->list_lock, flags);
 	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
 	spin_unlock_irqrestore(&migf->list_lock, flags);
-	migf->pre_copy_initial_bytes = size;
+	if (track)
+		migf->pre_copy_initial_bytes = size;
 	return 0;
 err:
 	mlx5vf_put_data_buffer(header_buf);
 	return ret;
 }
 
-static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
-				 size_t state_size)
+static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
+				 struct mlx5_vf_migration_file *migf,
+				 size_t state_size, u64 full_size,
+				 bool track)
 {
 	struct mlx5_vhca_data_buffer *buf;
 	size_t inc_state_size;
+	int num_chunks;
 	int ret;
+	int i;
 
-	/* let's be ready for stop_copy size that might grow by 10 percents */
-	if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
-		inc_state_size = state_size;
+	if (mvdev->chunk_mode) {
+		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
 
-	buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
-	if (IS_ERR(buf))
-		return PTR_ERR(buf);
+		/* from firmware perspective at least 'state_size' buffer should be set */
+		inc_state_size = max(state_size, chunk_size);
+	} else {
+		if (track) {
+			/* let's be ready for stop_copy size that might grow by 10 percents */
+			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+				inc_state_size = state_size;
+		} else {
+			inc_state_size = state_size;
+		}
+	}
 
-	migf->buf = buf;
-	buf = mlx5vf_get_data_buffer(migf,
-			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
-	if (IS_ERR(buf)) {
-		ret = PTR_ERR(buf);
-		goto err;
+	/* let's not overflow the device specification max SAVE size */
+	inc_state_size = min_t(size_t, inc_state_size,
+		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
+
+	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
+	for (i = 0; i < num_chunks; i++) {
+		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
+
+		migf->buf[i] = buf;
+		buf = mlx5vf_get_data_buffer(migf,
+				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto err;
+		}
+		migf->buf_header[i] = buf;
+		if (mvdev->chunk_mode) {
+			migf->buf[i]->stop_copy_chunk_num = i + 1;
+			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
+		}
 	}
 
-	migf->buf_header = buf;
-	ret = mlx5vf_add_stop_copy_header(migf);
+	ret = mlx5vf_add_stop_copy_header(migf, track);
 	if (ret)
-		goto err_header;
+		goto err;
 	return 0;
 
-err_header:
-	mlx5vf_put_data_buffer(migf->buf_header);
-	migf->buf_header = NULL;
 err:
-	mlx5vf_put_data_buffer(migf->buf);
-	migf->buf = NULL;
+	for (i = 0; i < num_chunks; i++) {
+		if (migf->buf[i]) {
+			mlx5vf_put_data_buffer(migf->buf[i]);
+			migf->buf[i] = NULL;
+		}
+		if (migf->buf_header[i]) {
+			mlx5vf_put_data_buffer(migf->buf_header[i]);
+			migf->buf_header[i] = NULL;
+		}
+	}
+
 	return ret;
 }
 
@@ -511,9 +549,9 @@  static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
 		goto err;
 
 	/* Checking whether we have a matching pre-allocated buffer that can fit */
-	if (migf->buf && migf->buf->allocated_length >= length) {
-		buf = migf->buf;
-		migf->buf = NULL;
+	if (migf->buf[0]->allocated_length >= length) {
+		buf = migf->buf[0];
+		migf->buf[0] = NULL;
 	} else {
 		buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
 		if (IS_ERR(buf)) {
@@ -541,6 +579,7 @@  mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 	struct mlx5_vf_migration_file *migf;
 	struct mlx5_vhca_data_buffer *buf;
 	size_t length;
+	u64 full_size;
 	int ret;
 
 	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
@@ -574,20 +613,25 @@  mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
 	INIT_LIST_HEAD(&migf->buf_list);
 	INIT_LIST_HEAD(&migf->avail_list);
 	spin_lock_init(&migf->list_lock);
-	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 0);
+	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
+	if (ret)
+		goto out_pd;
+
+	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
 	if (ret)
 		goto out_pd;
 
 	if (track) {
-		ret = mlx5vf_prep_stop_copy(migf, length);
-		if (ret)
+		/* leave the allocated buffer ready for the stop-copy phase */
+		buf = mlx5vf_alloc_data_buffer(migf,
+			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
 			goto out_pd;
-	}
-
-	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
-	if (IS_ERR(buf)) {
-		ret = PTR_ERR(buf);
-		goto out_pd;
+		}
+	} else {
+		buf = migf->buf[0];
+		migf->buf[0] = NULL;
 	}
 
 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@ -820,8 +864,8 @@  static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 				   size_t len, loff_t *pos)
 {
 	struct mlx5_vf_migration_file *migf = filp->private_data;
-	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
-	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
+	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
 	loff_t requested_length;
 	bool has_work = false;
 	ssize_t done = 0;
@@ -856,15 +900,15 @@  static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 			if (vhca_buf_header->allocated_length < migf->record_size) {
 				mlx5vf_free_data_buffer(vhca_buf_header);
 
-				migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
 						migf->record_size, DMA_NONE);
-				if (IS_ERR(migf->buf_header)) {
-					ret = PTR_ERR(migf->buf_header);
-					migf->buf_header = NULL;
+				if (IS_ERR(migf->buf_header[0])) {
+					ret = PTR_ERR(migf->buf_header[0]);
+					migf->buf_header[0] = NULL;
 					goto out_unlock;
 				}
 
-				vhca_buf_header = migf->buf_header;
+				vhca_buf_header = migf->buf_header[0];
 			}
 
 			vhca_buf_header->start_pos = migf->max_pos;
@@ -884,15 +928,15 @@  static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
 			if (vhca_buf->allocated_length < size) {
 				mlx5vf_free_data_buffer(vhca_buf);
 
-				migf->buf = mlx5vf_alloc_data_buffer(migf,
+				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
 							size, DMA_TO_DEVICE);
-				if (IS_ERR(migf->buf)) {
-					ret = PTR_ERR(migf->buf);
-					migf->buf = NULL;
+				if (IS_ERR(migf->buf[0])) {
+					ret = PTR_ERR(migf->buf[0]);
+					migf->buf[0] = NULL;
 					goto out_unlock;
 				}
 
-				vhca_buf = migf->buf;
+				vhca_buf = migf->buf[0];
 			}
 
 			vhca_buf->start_pos = migf->max_pos;
@@ -974,7 +1018,7 @@  mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 		goto out_pd;
 	}
 
-	migf->buf = buf;
+	migf->buf[0] = buf;
 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
 		buf = mlx5vf_alloc_data_buffer(migf,
 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -983,7 +1027,7 @@  mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 			goto out_buf;
 		}
 
-		migf->buf_header = buf;
+		migf->buf_header[0] = buf;
 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
 	} else {
 		/* Initial state will be to read the image */
@@ -997,7 +1041,7 @@  mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 	spin_lock_init(&migf->list_lock);
 	return migf;
 out_buf:
-	mlx5vf_free_data_buffer(migf->buf);
+	mlx5vf_free_data_buffer(migf->buf[0]);
 out_pd:
 	mlx5vf_cmd_dealloc_pd(migf);
 out_free:
@@ -1101,7 +1145,7 @@  mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
 			ret = mlx5vf_cmd_load_vhca_state(mvdev,
 							 mvdev->resuming_migf,
-							 mvdev->resuming_migf->buf);
+							 mvdev->resuming_migf->buf[0]);
 			if (ret)
 				return ERR_PTR(ret);
 		}