@@ -14,6 +14,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
@@ -21,6 +22,14 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * When PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel might end up with a failure in the save
+ * command once it tries to turn on 'tracking' on a suspended device.
+ */
+ if (migf)
+ wait_event(migf->save_wait, !migf->save_cb_active);
MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
@@ -45,7 +54,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size)
+ size_t *state_size, u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@@ -59,6 +68,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
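+ /*
+ * With MLX5VF_QUERY_INC set, query the size of the incremental state
+ * (see mlx5vf_pci_save_device_inc_data()).
+ */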
+ MLX5_SET(query_vhca_migration_state_in, in, incremental,
+ query_flags & MLX5VF_QUERY_INC);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
@@ -210,11 +221,11 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
}
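+/*
+ * sgt set: build the mkey over a migration data buffer;
+ * NULL: build it over the page tracker's recv_buf pages.
+ */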
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vf_migration_file *migf,
+ struct sg_table *sgt,
struct mlx5_vhca_recv_buf *recv_buf,
u32 *mkey, size_t length)
{
- size_t npages = migf ? DIV_ROUND_UP(length, PAGE_SIZE) :
+ size_t npages = sgt ? DIV_ROUND_UP(length, PAGE_SIZE) :
recv_buf->npages;
int err = 0, inlen;
__be64 *mtt;
@@ -232,10 +243,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
DIV_ROUND_UP(npages, 2));
mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
- if (migf) {
+ if (sgt) {
struct sg_dma_page_iter dma_iter;
- for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+ for_each_sgtable_dma_page(sgt, &dma_iter, 0)
*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
} else {
int i;
@@ -255,7 +266,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
- MLX5_SET64(mkc, mkc, len, migf ? length : (npages * PAGE_SIZE));
+ MLX5_SET64(mkc, mkc, len, sgt ? length : (npages * PAGE_SIZE));
err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
kvfree(in);
return err;
@@ -277,7 +288,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
mutex_unlock(&migf->lock);
mlx5_core_destroy_mkey(mdev, async_data->mkey);
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
+ dma_unmap_sgtable(mdev->device, async_data->sgt, DMA_FROM_DEVICE, 0);
mlx5_core_dealloc_pd(mdev, async_data->pdn);
kvfree(async_data->out);
migf->save_cb_active = false;
@@ -293,9 +304,14 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
- WRITE_ONCE(migf->image_length,
- MLX5_GET(save_vhca_state_out, async_data->out,
- actual_image_size));
+ size_t len = MLX5_GET(save_vhca_state_out, async_data->out,
+ actual_image_size);
+
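+ /* Record the length against the buffer this save command targeted */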
+ if (async_data->sgt == &migf->final_table.sgt)
+ WRITE_ONCE(migf->final_length, len);
+ else
+ WRITE_ONCE(migf->image_length, len);
+
wake_up_interruptible(&migf->poll_wait);
}
@@ -308,7 +324,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
}
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf, bool inc,
+ bool track)
{
u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
@@ -327,12 +344,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
return err;
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
- 0);
+ async_data = &migf->async_data;
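+ /*
+ * Only the final incremental save (inc without track) lands in
+ * final_table; all other saves use the main table.
+ */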
+ async_data->sgt = (!track && inc) ? &migf->final_table.sgt :
+ &migf->table.sgt;
+ err = dma_map_sgtable(mdev->device, async_data->sgt,
+ DMA_FROM_DEVICE, 0);
if (err)
goto err_dma_map;
- err = _create_mkey(mdev, pdn, migf, NULL,
+ err = _create_mkey(mdev, pdn, async_data->sgt, NULL,
&mkey, migf->allocated_length);
if (err)
goto err_create_mkey;
@@ -343,8 +363,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(save_vhca_state_in, in, mkey, mkey);
MLX5_SET(save_vhca_state_in, in, size, migf->allocated_length);
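+ /* set_track turns on device tracking so a later save can be incremental */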
+ MLX5_SET(save_vhca_state_in, in, incremental, inc);
+ MLX5_SET(save_vhca_state_in, in, set_track, track);
- async_data = &migf->async_data;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
@@ -370,7 +391,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
err_out:
mlx5_core_destroy_mkey(mdev, mkey);
err_create_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
+ dma_unmap_sgtable(mdev->device, async_data->sgt, DMA_FROM_DEVICE, 0);
err_dma_map:
mlx5_core_dealloc_pd(mdev, pdn);
migf->save_cb_active = false;
@@ -405,7 +426,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
goto err_reg;
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey, migf->image_length);
+ err = _create_mkey(mdev, pdn, &migf->table.sgt, NULL, &mkey,
+ migf->image_length);
if (err)
goto err_mkey;
@@ -15,6 +15,7 @@
struct mlx5vf_async_data {
struct mlx5_async_work cb_work;
struct work_struct work;
+ struct sg_table *sgt;
int status;
u32 pdn;
u32 mkey;
@@ -31,6 +32,12 @@ struct mlx5_vf_migration_file {
struct sg_append_table table;
size_t image_length;
size_t allocated_length;
+ /*
+ * The device can be moved to STOP_COPY before the previous pre-copy
+ * state is fully read. A second set of variables is needed to hold
+ * the final state alongside it.
+ */
+ size_t final_length;
+ struct sg_append_table final_table;
/* Optimize mlx5vf_get_migration_page() for sequential access */
struct scatterlist *last_offset_sg;
@@ -115,17 +122,22 @@ struct mlx5vf_pci_core_device {
struct mlx5_core_dev *mdev;
};
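+/* query_flags for mlx5vf_cmd_query_vhca_migration_state() */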
+enum {
+ MLX5VF_QUERY_INC = (1UL << 0),
+};
+
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size);
+ size_t *state_size, u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf);
+ struct mlx5_vf_migration_file *migf, bool inc,
+ bool track);
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
struct mlx5_vf_migration_file *migf);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
@@ -64,7 +64,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
}
static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
- unsigned int npages)
+ unsigned int npages,
+ struct sg_append_table *table)
{
unsigned int to_alloc = npages;
struct page **page_list;
@@ -85,7 +86,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
}
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(
- &migf->table, page_list, filled, 0,
+ table, page_list, filled, 0,
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
GFP_KERNEL);
@@ -118,7 +119,11 @@ static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
migf->disabled = true;
migf->image_length = 0;
migf->allocated_length = 0;
+ migf->final_length = 0;
migf->filp->f_pos = 0;
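+ /* Free the pages backing the final table as well */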
+ for_each_sgtable_page(&migf->final_table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&migf->final_table);
mutex_unlock(&migf->lock);
}
@@ -215,6 +220,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp,
return pollflags;
}
+/*
+ * The FD is exposed and the user may keep using it after receiving an
+ * error. Mark migf as being in error, and wake up the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+ migf->is_err = true;
+ wake_up_interruptible(&migf->poll_wait);
+}
+
static const struct file_operations mlx5vf_save_fops = {
.owner = THIS_MODULE,
.read = mlx5vf_save_read,
@@ -223,8 +238,35 @@ static const struct file_operations mlx5vf_save_fops = {
.llseek = no_llseek,
};
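+/*
+ * Query the incremental state size, grow final_table to hold it, and
+ * trigger an incremental save into that table.
+ */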
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ size_t length;
+ int ret;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ MLX5VF_QUERY_INC);
+ if (ret)
+ return ret;
+
+ if (migf->is_err)
+ return -ENODEV;
+
+ ret = mlx5vf_add_migration_pages(
+ migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE), &migf->final_table);
+ if (ret) {
+ mlx5vf_mark_err(migf);
+ return ret;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, true, false);
+ if (ret)
+ mlx5vf_mark_err(migf);
+ return ret;
+}
+
static struct mlx5_vf_migration_file *
-mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
struct mlx5_vf_migration_file *migf;
size_t length;
@@ -249,17 +291,17 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
init_waitqueue_head(&migf->save_wait);
mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
if (ret)
goto out_free;
ret = mlx5vf_add_migration_pages(
- migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ migf, DIV_ROUND_UP_ULL(length, PAGE_SIZE), &migf->table);
if (ret)
goto out_free;
migf->mvdev = mvdev;
- ret = mlx5vf_cmd_save_vhca_state(mvdev, migf);
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, false, track);
if (ret)
goto out_free;
return migf;
@@ -296,7 +338,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
done = mlx5vf_add_migration_pages(
migf,
DIV_ROUND_UP(requested_length - migf->allocated_length,
- PAGE_SIZE));
+ PAGE_SIZE), &migf->table);
if (done)
goto out_unlock;
}
@@ -403,7 +445,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
ret = mlx5vf_cmd_suspend_vhca(mvdev,
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
if (ret)
@@ -411,7 +454,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
ret = mlx5vf_cmd_resume_vhca(mvdev,
MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
if (ret)
@@ -422,7 +466,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct mlx5_vf_migration_file *migf;
- migf = mlx5vf_pci_save_device_data(mvdev);
+ migf = mlx5vf_pci_save_device_data(mvdev, false);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
@@ -430,7 +474,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
mlx5vf_disable_fds(mvdev);
return NULL;
}
@@ -455,6 +502,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
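+ /*
+ * Entering PRE_COPY: start a tracked save while the device keeps
+ * running, so its state can be read before the final stop.
+ */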
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
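+ /*
+ * PRE_COPY_P2P -> STOP_COPY: quiesce the device, then capture the
+ * final incremental state into final_table.
+ */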
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = mlx5vf_pci_save_device_inc_data(mvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
/*
* vfio_mig_get_next_state() does not use arcs other than the above
*/
@@ -523,7 +592,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
mutex_lock(&mvdev->state_mutex);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
- &state_size);
+ &state_size, 0);
if (!ret)
*stop_copy_length = state_size;
mlx5vf_state_mutex_unlock(mvdev);