[v4,24/24] drm/amdkfd: CRIU resume shared virtual memory ranges

Message ID 20211223003711.13064-25-rajneesh.bhardwaj@amd.com (mailing list archive)
State New, archived
Series CHECKPOINT RESTORE WITH ROCm

Commit Message

Rajneesh Bhardwaj Dec. 23, 2021, 12:37 a.m. UTC
In the CRIU resume stage, resume all the shared virtual memory ranges
from the data stored inside the resuming kfd process during the CRIU
restore phase. Also set up xnack mode and free the resources.

Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 55 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  6 +++
 3 files changed, 71 insertions(+)
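
For context, the list nodes consumed by kfd_criu_resume_svm() are
queued onto svms->criu_svm_metadata_list during the restore stage by
svm_criu_prepare_for_resume(), introduced in an earlier patch of this
series. A minimal sketch of the node layout, inferred purely from how
the function below dereferences it (the authoritative definition lives
in that earlier patch):

	/*
	 * Sketch only: fields inferred from the accesses in
	 * kfd_criu_resume_svm() (list, start_addr, size, attrs[j]).
	 */
	struct criu_svm_metadata {
		struct list_head list;   /* on svms->criu_svm_metadata_list */
		uint64_t start_addr;     /* range start address */
		uint64_t size;           /* range size, in pages */
		/* num_attrs entries: common attrs plus one per GPU (pdd) */
		struct kfd_ioctl_svm_attribute attrs[];
	};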

Comments

Felix Kuehling Jan. 11, 2022, 12:03 a.m. UTC | #1
On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:
> In the CRIU resume stage, resume all the shared virtual memory ranges
> from the data stored inside the resuming kfd process during the CRIU
> restore phase. Also set up xnack mode and free the resources.
>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 +++++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 55 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  6 +++
>   3 files changed, 71 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f7aa15b18f95..6191e37656dd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2759,7 +2759,17 @@ static int criu_resume(struct file *filep,
>   	}
>   
>   	mutex_lock(&target->mutex);
> +	ret = kfd_criu_resume_svm(target);
> +	if (ret) {
> +		pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
> +		goto exit;
> +	}
> +
>   	ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
> +	if (ret)
> +		pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
> +
> +exit:
>   	mutex_unlock(&target->mutex);
>   
>   	kfd_unref_process(target);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index e9f6c63c2a26..bd2dce37f345 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -3427,6 +3427,61 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
>   	return 0;
>   }
>   
> +int kfd_criu_resume_svm(struct kfd_process *p)
> +{
> +	int nattr_common = 4, nattr_accessibility = 1;
> +	struct criu_svm_metadata *criu_svm_md = NULL;
> +	struct criu_svm_metadata *next = NULL;
> +	struct svm_range_list *svms = &p->svms;
> +	int i, j, num_attrs, ret = 0;
> +	struct mm_struct *mm;
> +
> +	if (list_empty(&svms->criu_svm_metadata_list)) {
> +		pr_debug("No SVM data from CRIU restore stage 2\n");
> +		return ret;
> +	}
> +
> +	mm = get_task_mm(p->lead_thread);
> +	if (!mm) {
> +		pr_err("failed to get mm for the target process\n");
> +		return -ESRCH;
> +	}
> +
> +	num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
> +
> +	i = j = 0;
> +	list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
> +		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
> +			 i, criu_svm_md->start_addr, criu_svm_md->size);
> +		for (j = 0; j < num_attrs; j++) {
> +			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x \ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
> +				 i,j, criu_svm_md->attrs[j].type,
> +				 i,j, criu_svm_md->attrs[j].value);
> +		}

Is this super-detailed debug output really needed?

Regards,
   Felix
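
For illustration only (not something the patch or the review proposes),
the per-attribute dump could collapse to a single summary line per
range, using only variables already in scope in the loop:

	/* sketch: one summary line instead of one line per attribute */
	pr_debug("criu_svm_md[%d] start 0x%llx size 0x%llx npages, %d attrs\n",
		 i, criu_svm_md->start_addr, criu_svm_md->size, num_attrs);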


> +
> +		ret = svm_range_set_attr(p, mm, criu_svm_md->start_addr,
> +					 criu_svm_md->size, num_attrs,
> +					 criu_svm_md->attrs);
> +		if (ret) {
> +			pr_err("CRIU: failed to set range attributes\n");
> +			goto exit;
> +		}
> +
> +		i++;
> +	}
> +
> +exit:
> +	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
> +		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
> +						criu_svm_md->start_addr);
> +		kfree(criu_svm_md);
> +	}
> +
> +	mmput(mm);
> +	return ret;
> +
> +}
> +
>   int svm_criu_prepare_for_resume(struct kfd_process *p,
>   				struct kfd_criu_svm_range_priv_data *svm_priv)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index e0c0853f085c..3b5bcb52723c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -195,6 +195,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
>   			 uint8_t __user *user_priv_ptr,
>   			 uint64_t *priv_data_offset,
>   			 uint64_t max_priv_data_size);
> +int kfd_criu_resume_svm(struct kfd_process *p);
>   struct kfd_process_device *
>   svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
>   void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
> @@ -256,6 +257,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process *p,
>   	return -EINVAL;
>   }
>   
> +static inline int kfd_criu_resume_svm(struct kfd_process *p)
> +{
> +	return 0;
> +}
> +
>   #define KFD_IS_SVM_API_SUPPORTED(dev) false
>   
>   #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */
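
A quick worked example of the attribute count computed above, with an
assumed GPU count for illustration: each range restores nattr_common =
4 common attributes plus nattr_accessibility = 1 accessibility
attribute per GPU, so for a hypothetical process with two GPUs
(p->n_pdds == 2):

	int num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
	/* = 4 + (1 * 2) = 6 attributes per SVM range */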

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f7aa15b18f95..6191e37656dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2759,7 +2759,17 @@ static int criu_resume(struct file *filep,
 	}
 
 	mutex_lock(&target->mutex);
+	ret = kfd_criu_resume_svm(target);
+	if (ret) {
+		pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
+		goto exit;
+	}
+
 	ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+	if (ret)
+		pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
+
+exit:
 	mutex_unlock(&target->mutex);
 
 	kfd_unref_process(target);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e9f6c63c2a26..bd2dce37f345 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3427,6 +3427,61 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
 	return 0;
 }
 
+int kfd_criu_resume_svm(struct kfd_process *p)
+{
+	int nattr_common = 4, nattr_accessibility = 1;
+	struct criu_svm_metadata *criu_svm_md = NULL;
+	struct criu_svm_metadata *next = NULL;
+	struct svm_range_list *svms = &p->svms;
+	int i, j, num_attrs, ret = 0;
+	struct mm_struct *mm;
+
+	if (list_empty(&svms->criu_svm_metadata_list)) {
+		pr_debug("No SVM data from CRIU restore stage 2\n");
+		return ret;
+	}
+
+	mm = get_task_mm(p->lead_thread);
+	if (!mm) {
+		pr_err("failed to get mm for the target process\n");
+		return -ESRCH;
+	}
+
+	num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
+
+	i = j = 0;
+	list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
+		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
+			 i, criu_svm_md->start_addr, criu_svm_md->size);
+		for (j = 0; j < num_attrs; j++) {
+			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x \ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
+				 i,j, criu_svm_md->attrs[j].type,
+				 i,j, criu_svm_md->attrs[j].value);
+		}
+
+		ret = svm_range_set_attr(p, mm, criu_svm_md->start_addr,
+					 criu_svm_md->size, num_attrs,
+					 criu_svm_md->attrs);
+		if (ret) {
+			pr_err("CRIU: failed to set range attributes\n");
+			goto exit;
+		}
+
+		i++;
+	}
+
+exit:
+	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
+		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
+						criu_svm_md->start_addr);
+		kfree(criu_svm_md);
+	}
+
+	mmput(mm);
+	return ret;
+
+}
+
 int svm_criu_prepare_for_resume(struct kfd_process *p,
 				struct kfd_criu_svm_range_priv_data *svm_priv)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index e0c0853f085c..3b5bcb52723c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -195,6 +195,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
 			 uint8_t __user *user_priv_ptr,
 			 uint64_t *priv_data_offset,
 			 uint64_t max_priv_data_size);
+int kfd_criu_resume_svm(struct kfd_process *p);
 struct kfd_process_device *
 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
 void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
@@ -256,6 +257,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process *p,
 	return -EINVAL;
 }
 
+static inline int kfd_criu_resume_svm(struct kfd_process *p)
+{
+	return 0;
+}
+
 #define KFD_IS_SVM_API_SUPPORTED(dev) false
 
 #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */