Message ID | 20211223003711.13064-5-rajneesh.bhardwaj@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | CHECKPOINT RESTORE WITH ROCm | expand |
On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote: > This IOCTL is expected to be called as a precursor to the actual > Checkpoint operation. This does the basic discovery into the target > process seized by CRIU and relays the information to the userspace that > utilizes it to start the Checkpoint operation via another dedicated > IOCTL. > > The process_info IOCTL determines the number of GPUs, buffer objects > that are associated with the target process, its process id in > caller's namespace since /proc/pid/mem interface maybe used to drain > the contents of the discovered buffer objects in userspace and getpid > returns the pid of CRIU dumper process. Also the pid of a process > inside a container might be different than its global pid so return > the ns pid. > > Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> > Signed-off-by: David Yat Sin <david.yatsin@amd.com> > --- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 55 +++++++++++++++++++++++- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 ++++++ > 3 files changed, 70 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index 1b863bd84c96..53d7a20e3c06 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -1857,6 +1857,41 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) > } > #endif > > +uint64_t get_process_num_bos(struct kfd_process *p) > +{ > + uint64_t num_of_bos = 0, i; > + > + /* Run over all PDDs of the process */ > + for (i = 0; i < p->n_pdds; i++) { > + struct kfd_process_device *pdd = p->pdds[i]; > + void *mem; > + int id; > + > + idr_for_each_entry(&pdd->alloc_idr, mem, id) { > + struct kgd_mem *kgd_mem = (struct kgd_mem *)mem; > + > + if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) > + num_of_bos++; > + } > + } > + return num_of_bos; > +} > + > +static void 
criu_get_process_object_info(struct kfd_process *p, > + uint32_t *num_bos, > + uint64_t *objs_priv_size) > +{ > + uint64_t priv_size; > + > + *num_bos = get_process_num_bos(p); > + > + if (objs_priv_size) { > + priv_size = sizeof(struct kfd_criu_process_priv_data); > + priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data); > + *objs_priv_size = priv_size; > + } > +} > + > static int criu_checkpoint(struct file *filep, > struct kfd_process *p, > struct kfd_ioctl_criu_args *args) > @@ -1889,7 +1924,25 @@ static int criu_process_info(struct file *filep, > struct kfd_process *p, > struct kfd_ioctl_criu_args *args) > { > - return 0; > + int ret = 0; > + > + mutex_lock(&p->mutex); > + > + if (!kfd_has_process_device_data(p)) { > + pr_err("No pdd for given process\n"); > + ret = -ENODEV; > + goto err_unlock; > + } > + > + args->pid = task_pid_nr_ns(p->lead_thread, > + task_active_pid_ns(p->lead_thread)); > + > + criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size); > + > + dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos); > +err_unlock: > + mutex_unlock(&p->mutex); > + return ret; > } > > static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index e68f692362bb..4d9bc7af03af 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -950,6 +950,8 @@ void *kfd_process_device_translate_handle(struct kfd_process_device *p, > void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, > int handle); > > +bool kfd_has_process_device_data(struct kfd_process *p); > + > /* PASIDs */ > int kfd_pasid_init(void); > void kfd_pasid_exit(void); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index d4c8a6948a9f..f77d556ca0fc 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> @@ -1456,6 +1456,20 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
>  	return 0;
>  }
>
> +bool kfd_has_process_device_data(struct kfd_process *p)
> +{
> +	int i;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];

I think checking p->n_pdds is sufficient. All the pdds with i < n_pdds should be non-NULL.

Regards,
  Felix

> +
> +		if (pdd)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
>  						struct kfd_process *p)
>  {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 1b863bd84c96..53d7a20e3c06 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1857,6 +1857,41 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif +uint64_t get_process_num_bos(struct kfd_process *p) +{ + uint64_t num_of_bos = 0, i; + + /* Run over all PDDs of the process */ + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + void *mem; + int id; + + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + struct kgd_mem *kgd_mem = (struct kgd_mem *)mem; + + if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) + num_of_bos++; + } + } + return num_of_bos; +} + +static void criu_get_process_object_info(struct kfd_process *p, + uint32_t *num_bos, + uint64_t *objs_priv_size) +{ + uint64_t priv_size; + + *num_bos = get_process_num_bos(p); + + if (objs_priv_size) { + priv_size = sizeof(struct kfd_criu_process_priv_data); + priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data); + *objs_priv_size = priv_size; + } +} + static int criu_checkpoint(struct file *filep, struct kfd_process *p, struct kfd_ioctl_criu_args *args) @@ -1889,7 +1924,25 @@ static int criu_process_info(struct file *filep, struct kfd_process *p, struct kfd_ioctl_criu_args *args) { - return 0; + int ret = 0; + + mutex_lock(&p->mutex); + + if (!kfd_has_process_device_data(p)) { + pr_err("No pdd for given process\n"); + ret = -ENODEV; + goto err_unlock; + } + + args->pid = task_pid_nr_ns(p->lead_thread, + task_active_pid_ns(p->lead_thread)); + + criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size); + + dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos); +err_unlock: + mutex_unlock(&p->mutex); + return ret; } static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index e68f692362bb..4d9bc7af03af 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -950,6 +950,8 @@ void *kfd_process_device_translate_handle(struct kfd_process_device *p, void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, int handle); +bool kfd_has_process_device_data(struct kfd_process *p); + /* PASIDs */ int kfd_pasid_init(void); void kfd_pasid_exit(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d4c8a6948a9f..f77d556ca0fc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1456,6 +1456,20 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd, return 0; } +bool kfd_has_process_device_data(struct kfd_process *p) +{ + int i; + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + if (pdd) + return true; + } + + return false; +} + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p) {