diff mbox series

[v4,08/24] drm/amdkfd: CRIU Implement KFD unpause operation

Message ID 20211223003711.13064-9-rajneesh.bhardwaj@amd.com (mailing list archive)
State New, archived
Headers show
Series CHECKPOINT RESTORE WITH ROCm | expand

Commit Message

Rajneesh Bhardwaj Dec. 23, 2021, 12:36 a.m. UTC
From: David Yat Sin <david.yatsin@amd.com>

Introducing UNPAUSE op. After CRIU amdgpu plugin performs a PROCESS_INFO
op the queues will be stay in an evicted state. Once the plugin is done
draining BO contents, it is safe to perform an UNPAUSE op for the queues
to resume.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  1 +
 3 files changed, 40 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 87b9f019e96e..db2bb302a8d4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2040,6 +2040,14 @@  static int criu_checkpoint(struct file *filep,
 		goto exit_unlock;
 	}
 
+	/* Confirm all process queues are evicted */
+	if (!p->queues_paused) {
+		pr_err("Cannot dump process when queues are not in evicted state\n");
+		/* CRIU plugin did not call op PROCESS_INFO before checkpointing */
+		ret = -EINVAL;
+		goto exit_unlock;
+	}
+
 	criu_get_process_object_info(p, &num_bos, &priv_size);
 
 	if (num_bos != args->num_bos ||
@@ -2382,7 +2390,24 @@  static int criu_unpause(struct file *filep,
 			struct kfd_process *p,
 			struct kfd_ioctl_criu_args *args)
 {
-	return 0;
+	int ret;
+
+	mutex_lock(&p->mutex);
+
+	if (!p->queues_paused) {
+		mutex_unlock(&p->mutex);
+		return -EINVAL;
+	}
+
+	ret = kfd_process_restore_queues(p);
+	if (ret)
+		pr_err("Failed to unpause queues ret:%d\n", ret);
+	else
+		p->queues_paused = false;
+
+	mutex_unlock(&p->mutex);
+
+	return ret;
 }
 
 static int criu_resume(struct file *filep,
@@ -2434,6 +2459,12 @@  static int criu_process_info(struct file *filep,
 		goto err_unlock;
 	}
 
+	ret = kfd_process_evict_queues(p);
+	if (ret)
+		goto err_unlock;
+
+	p->queues_paused = true;
+
 	args->pid = task_pid_nr_ns(p->lead_thread,
 					task_active_pid_ns(p->lead_thread));
 
@@ -2441,6 +2472,10 @@  static int criu_process_info(struct file *filep,
 
 	dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos);
 err_unlock:
+	if (ret) {
+		kfd_process_restore_queues(p);
+		p->queues_paused = false;
+	}
 	mutex_unlock(&p->mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index cd72541a8f4f..f3a9f3de34e4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -875,6 +875,9 @@  struct kfd_process {
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
+
+	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
+	bool queues_paused;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d2fcdc5e581f..e20fbb7ba9bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1364,6 +1364,7 @@  static struct kfd_process *create_process(const struct task_struct *thread)
 	process->mm = thread->mm;
 	process->lead_thread = thread->group_leader;
 	process->n_pdds = 0;
+	process->queues_paused = false;
 	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
 	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
 	process->last_restore_timestamp = get_jiffies_64();