diff mbox series

[v4,03/24] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs

Message ID 20211223003711.13064-4-rajneesh.bhardwaj@amd.com (mailing list archive)
State New, archived
Headers show
Series CHECKPOINT RESTORE WITH ROCm | expand

Commit Message

Rajneesh Bhardwaj Dec. 23, 2021, 12:36 a.m. UTC
Checkpoint-Restore in userspace (CRIU) is a powerful tool that can
snapshot a running process and later restore it on same or a remote
machine but expects the processes that have a device file (e.g. GPU)
associated with them, provide necessary driver support to assist CRIU
and its extensible plugin interface. Thus, In order to support the
Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute
Kernel driver, needs to provide a set of new APIs that provide
necessary VRAM metadata and its contents to a userspace component
(CRIU plugin) that can store it in form of image files.

This introduces some new ioctls which will be used to checkpoint-Restore
any KFD bound user process. KFD doesn't allow any arbitrary ioctl call
unless it is called by the group leader process. Since these ioctls are
expected to be called from a KFD criu plugin which has elevated ptrace
attached privileges and CAP_CHECKPOINT_RESTORE capabilities attached with
the file descriptors so modify KFD to allow such calls.

(API redesigned by David Yat Sin)
Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 94 +++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 65 +++++++++++++++-
 include/uapi/linux/kfd_ioctl.h           | 79 +++++++++++++++++++-
 3 files changed, 235 insertions(+), 3 deletions(-)

Comments

Felix Kuehling Jan. 10, 2022, 10:08 p.m. UTC | #1
On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:
> Checkpoint-Restore in userspace (CRIU) is a powerful tool that can
> snapshot a running process and later restore it on same or a remote
> machine but expects the processes that have a device file (e.g. GPU)
> associated with them, provide necessary driver support to assist CRIU
> and its extensible plugin interface. Thus, In order to support the
> Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute
> Kernel driver, needs to provide a set of new APIs that provide
> necessary VRAM metadata and its contents to a userspace component
> (CRIU plugin) that can store it in form of image files.
>
> This introduces some new ioctls which will be used to checkpoint-Restore
> any KFD bound user process. KFD doesn't allow any arbitrary ioctl call
> unless it is called by the group leader process. Since these ioctls are
> expected to be called from a KFD criu plugin which has elevated ptrace
> attached privileges and CAP_CHECKPOINT_RESTORE capabilities attached with
> the file descriptors so modify KFD to allow such calls.
>
> (API redesigned by David Yat Sin)
> Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
> Signed-off-by: David Yat Sin <david.yatsin@amd.com>
> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 94 +++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 65 +++++++++++++++-
>   include/uapi/linux/kfd_ioctl.h           | 79 +++++++++++++++++++-
>   3 files changed, 235 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 4bfc0c8ab764..1b863bd84c96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -33,6 +33,7 @@
>   #include <linux/time.h>
>   #include <linux/mm.h>
>   #include <linux/mman.h>
> +#include <linux/ptrace.h>
>   #include <linux/dma-buf.h>
>   #include <asm/processor.h>
>   #include "kfd_priv.h"
> @@ -1856,6 +1857,75 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
>   }
>   #endif
>   
> +static int criu_checkpoint(struct file *filep,
> +			   struct kfd_process *p,
> +			   struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_restore(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_unpause(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_resume(struct file *filep,
> +			struct kfd_process *p,
> +			struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int criu_process_info(struct file *filep,
> +				struct kfd_process *p,
> +				struct kfd_ioctl_criu_args *args)
> +{
> +	return 0;
> +}
> +
> +static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_criu_args *args = data;
> +	int ret;
> +
> +	dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
> +	switch (args->op) {
> +	case KFD_CRIU_OP_PROCESS_INFO:
> +		ret = criu_process_info(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_CHECKPOINT:
> +		ret = criu_checkpoint(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_UNPAUSE:
> +		ret = criu_unpause(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_RESTORE:
> +		ret = criu_restore(filep, p, args);
> +		break;
> +	case KFD_CRIU_OP_RESUME:
> +		ret = criu_resume(filep, p, args);
> +		break;
> +	default:
> +		dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	if (ret)
> +		dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret);
> +
> +	return ret;
> +}
> +
>   #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>   	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>   			    .cmd_drv = 0, .name = #ioctl}
> @@ -1959,6 +2029,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>   
>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
>   			kfd_ioctl_set_xnack_mode, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP,
> +			kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> @@ -1973,6 +2046,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   	char *kdata = NULL;
>   	unsigned int usize, asize;
>   	int retcode = -EINVAL;
> +	bool ptrace_attached = false;
>   
>   	if (nr >= AMDKFD_CORE_IOCTL_COUNT)
>   		goto err_i1;
> @@ -1998,7 +2072,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   	 * processes need to create their own KFD device context.
>   	 */
>   	process = filep->private_data;
> -	if (process->lead_thread != current->group_leader) {
> +
> +	rcu_read_lock();
> +	if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) &&
> +	    ptrace_parent(process->lead_thread) == current)
> +		ptrace_attached = true;
> +	rcu_read_unlock();
> +
> +	if (process->lead_thread != current->group_leader
> +	    && !ptrace_attached) {
>   		dev_dbg(kfd_device, "Using KFD FD in wrong process\n");
>   		retcode = -EBADF;
>   		goto err_i1;
> @@ -2013,6 +2095,16 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>   		goto err_i1;
>   	}
>   
> +	/*
> +	 * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support
> +	 * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a
> +	 * more priviledged access.
> +	 */
> +	if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) {
> +		if (!capable(CAP_CHECKPOINT_RESTORE) && !capable(CAP_SYS_ADMIN))
> +			return -EACCES;
> +	}
> +
>   	if (cmd & (IOC_IN | IOC_OUT)) {
>   		if (asize <= sizeof(stack_kdata)) {
>   			kdata = stack_kdata;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 1d3f012bcd2a..e68f692362bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -121,7 +121,26 @@
>    */
>   #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512
>   
> -
> +/**
> + * enum kfd_ioctl_flags - KFD ioctl flags
> + * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how
> + * userspace can use a given ioctl.
> + */
> +enum kfd_ioctl_flags {
> +	/*
> +	 * @KFD_IOC_FLAG_CHECKPOINT_RESTORE:
> +	 * Certain KFD ioctls such as AMDKFD_IOC_CRIU_OP can potentially
> +	 * perform privileged operations and load arbitrary data into MQDs and
> +	 * eventually HQD registers when the queue is mapped by HWS. In order to
> +	 * prevent this we should perform additional security checks.
> +	 *
> +	 * This is equivalent to callers with the CHECKPOINT_RESTORE capability.
> +	 *
> +	 * Note: Since earlier versions of docker do not support CHECKPOINT_RESTORE,
> +	 * we also allow ioctls with SYS_ADMIN capability.
> +	 */
> +	KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
> +};
>   /*
>    * Kernel module parameter to specify maximum number of supported queues per
>    * device
> @@ -1004,6 +1023,50 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
>   
> +/* CRIU */
> +/*
> + * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> + * structures:
> + * kfd_criu_process_priv_data
> + * kfd_criu_device_priv_data
> + * kfd_criu_bo_priv_data
> + * kfd_criu_queue_priv_data
> + * kfd_criu_event_priv_data
> + * kfd_criu_svm_range_priv_data
> + */
> +
> +#define KFD_CRIU_PRIV_VERSION 1
> +
> +struct kfd_criu_process_priv_data {
> +	uint32_t version;
> +};
> +
> +struct kfd_criu_device_priv_data {
> +	/* For future use */
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_bo_priv_data {
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_svm_range_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;

The compiler adds 32-bit padding on x86_64. I think you want to make 
"reserved" 32-bit here. Same in the two structures below.


> +};
> +
> +struct kfd_criu_queue_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;
> +};
> +
> +struct kfd_criu_event_priv_data {
> +	uint32_t object_type;
> +	uint64_t reserved;
> +};
> +
> +/* CRIU - End */
> +
>   /* Queue Context Management */
>   int init_queue(struct queue **q, const struct queue_properties *properties);
>   void uninit_queue(struct queue *q);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index af96af174dc4..b5c297be081b 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -468,6 +468,80 @@ struct kfd_ioctl_smi_events_args {
>   	__u32 anon_fd;	/* from KFD */
>   };
>   
> +/**************************************************************************************************
> + * CRIU IOCTLs (Checkpoint Restore In Userspace)
> + *
> + * When checkpointing a process, the userspace application will perform:
> + * 1. PROCESS_INFO op to determine current process information

Maybe mention that this also pauses execution by evicting all the 
queues. That makes the UNPAUSE op below less mysterious.


> + * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges)
> + * 3. UNPAUSE op to un-evict all the queues
> + *
> + * When restoring a process, the CRIU userspace application will perform:
> + *
> + * 1. RESTORE op to restore process contents
> + * 2. RESUME op to start the process
> + *
> + * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. If user
> + * application need to perform an UNPAUSE operation to complete or abort a checkpoint.
> + */
> +
> +enum kfd_criu_op {
> +	KFD_CRIU_OP_PROCESS_INFO,
> +	KFD_CRIU_OP_CHECKPOINT,
> +	KFD_CRIU_OP_UNPAUSE,
> +	KFD_CRIU_OP_RESTORE,
> +	KFD_CRIU_OP_RESUME,
> +};
> +
> +/**
> + * kfd_ioctl_criu_args - Arguments perform CRIU operation
> + * @devices:		[in/out] User pointer to memory location for devices information
> + * @bos:		[in/out] User pointer to memory location for BOs information

It would help to reference the _bucket structures to make it clear that 
the "buckets" are the public information pointed to by these pointers. 
Also that these are pointers to arrays of buckets with num_devices and 
num_bos giving the array size.


> + * @priv_data:		[in/out] User pointer to memory location for private data
> + * @priv_data_size:	[in/out] Size of priv_data in bytes
> + * @num_devices:	[in/out] Number of GPUs used by process
> + * @num_bos		[in/out] Number of BOs used by process
> + * @num_objects:	[in/out] Number of objects used by process. Objects are opaque to
> + *				 user application
> + * @pid:		[in/out] PID of the process being checkpointed/restored

Do you need the pid during restore? Restore runs in the context of the 
restoring process itself.


> + * @op			[in] Type of operation (kfd_criu_op)
> + *
> + * Takes and releases process lock.

The process lock is a KFD implementation detail. I don't think this 
comment belongs in the UAPI definition.


> + * Return: 0 on success, -errno on failure
> + */
> +struct kfd_ioctl_criu_args {
> +	__u64 devices;		/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 bos;		/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 priv_data;	/* Used during ops: CHECKPOINT, RESTORE */
> +	__u64 priv_data_size;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_devices;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_bos;		/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 num_objects;	/* Used during ops: PROCESS_INFO, RESTORE */
> +	__u32 pid;		/* Used during ops: PROCESS_INFO, RESTORE */

Do you need the pid during restore? Restore runs in the context of the 
restoring process itself.


> +	__u32 op;
> +};
> +
> +struct kfd_criu_device_bucket {
> +	__u32 user_gpu_id;
> +	__u32 actual_gpu_id;
> +	__u32 drm_fd;
> +	__u32 pad;
> +};
> +
> +struct kfd_criu_bo_bucket {
> +	__u64 addr;
> +	__u64 size;
> +	__u64 offset;
> +	__u64 restored_offset;    /* During restore, updated offset for BO */
> +	__u32 gpu_id;

Maybe add a comment that this is the user_gpu_id (I think it is ...).

Regards,
   Felix


> +	__u32 alloc_flags;
> +	__u32 dmabuf_fd;
> +	__u32 pad;
> +};
> +
> +/* CRIU IOCTLs - END */
> +/**************************************************************************************************/
> +
>   /* Register offset inside the remapped mmio page
>    */
>   enum kfd_mmio_remap {
> @@ -742,7 +816,10 @@ struct kfd_ioctl_set_xnack_mode_args {
>   #define AMDKFD_IOC_SET_XNACK_MODE		\
>   		AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
>   
> +#define AMDKFD_IOC_CRIU_OP			\
> +		AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args)
> +
>   #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x22
> +#define AMDKFD_COMMAND_END		0x23
>   
>   #endif
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..1b863bd84c96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -33,6 +33,7 @@ 
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/ptrace.h>
 #include <linux/dma-buf.h>
 #include <asm/processor.h>
 #include "kfd_priv.h"
@@ -1856,6 +1857,75 @@  static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 }
 #endif
 
+static int criu_checkpoint(struct file *filep,
+			   struct kfd_process *p,
+			   struct kfd_ioctl_criu_args *args)
+{
+	return 0;
+}
+
+static int criu_restore(struct file *filep,
+			struct kfd_process *p,
+			struct kfd_ioctl_criu_args *args)
+{
+	return 0;
+}
+
+static int criu_unpause(struct file *filep,
+			struct kfd_process *p,
+			struct kfd_ioctl_criu_args *args)
+{
+	return 0;
+}
+
+static int criu_resume(struct file *filep,
+			struct kfd_process *p,
+			struct kfd_ioctl_criu_args *args)
+{
+	return 0;
+}
+
+static int criu_process_info(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args)
+{
+	return 0;
+}
+
+static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_criu_args *args = data;
+	int ret;
+
+	dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
+	switch (args->op) {
+	case KFD_CRIU_OP_PROCESS_INFO:
+		ret = criu_process_info(filep, p, args);
+		break;
+	case KFD_CRIU_OP_CHECKPOINT:
+		ret = criu_checkpoint(filep, p, args);
+		break;
+	case KFD_CRIU_OP_UNPAUSE:
+		ret = criu_unpause(filep, p, args);
+		break;
+	case KFD_CRIU_OP_RESTORE:
+		ret = criu_restore(filep, p, args);
+		break;
+	case KFD_CRIU_OP_RESUME:
+		ret = criu_resume(filep, p, args);
+		break;
+	default:
+		dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret);
+
+	return ret;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -1959,6 +2029,9 @@  static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
 			kfd_ioctl_set_xnack_mode, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP,
+			kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
@@ -1973,6 +2046,7 @@  static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	char *kdata = NULL;
 	unsigned int usize, asize;
 	int retcode = -EINVAL;
+	bool ptrace_attached = false;
 
 	if (nr >= AMDKFD_CORE_IOCTL_COUNT)
 		goto err_i1;
@@ -1998,7 +2072,15 @@  static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	 * processes need to create their own KFD device context.
 	 */
 	process = filep->private_data;
-	if (process->lead_thread != current->group_leader) {
+
+	rcu_read_lock();
+	if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) &&
+	    ptrace_parent(process->lead_thread) == current)
+		ptrace_attached = true;
+	rcu_read_unlock();
+
+	if (process->lead_thread != current->group_leader
+	    && !ptrace_attached) {
 		dev_dbg(kfd_device, "Using KFD FD in wrong process\n");
 		retcode = -EBADF;
 		goto err_i1;
@@ -2013,6 +2095,16 @@  static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 		goto err_i1;
 	}
 
+	/*
+	 * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support
+	 * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a
+	 * more priviledged access.
+	 */
+	if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) {
+		if (!capable(CAP_CHECKPOINT_RESTORE) && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
 	if (cmd & (IOC_IN | IOC_OUT)) {
 		if (asize <= sizeof(stack_kdata)) {
 			kdata = stack_kdata;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 1d3f012bcd2a..e68f692362bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -121,7 +121,26 @@ 
  */
 #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512
 
-
+/**
+ * enum kfd_ioctl_flags - KFD ioctl flags
+ * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how
+ * userspace can use a given ioctl.
+ */
+enum kfd_ioctl_flags {
+	/*
+	 * @KFD_IOC_FLAG_CHECKPOINT_RESTORE:
+	 * Certain KFD ioctls such as AMDKFD_IOC_CRIU_OP can potentially
+	 * perform privileged operations and load arbitrary data into MQDs and
+	 * eventually HQD registers when the queue is mapped by HWS. In order to
+	 * prevent this we should perform additional security checks.
+	 *
+	 * This is equivalent to callers with the CHECKPOINT_RESTORE capability.
+	 *
+	 * Note: Since earlier versions of docker do not support CHECKPOINT_RESTORE,
+	 * we also allow ioctls with SYS_ADMIN capability.
+	 */
+	KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0),
+};
 /*
  * Kernel module parameter to specify maximum number of supported queues per
  * device
@@ -1004,6 +1023,50 @@  void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
 
+/* CRIU */
+/*
+ * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
+ * structures:
+ * kfd_criu_process_priv_data
+ * kfd_criu_device_priv_data
+ * kfd_criu_bo_priv_data
+ * kfd_criu_queue_priv_data
+ * kfd_criu_event_priv_data
+ * kfd_criu_svm_range_priv_data
+ */
+
+#define KFD_CRIU_PRIV_VERSION 1
+
+struct kfd_criu_process_priv_data {
+	uint32_t version;
+};
+
+struct kfd_criu_device_priv_data {
+	/* For future use */
+	uint64_t reserved;
+};
+
+struct kfd_criu_bo_priv_data {
+	uint64_t reserved;
+};
+
+struct kfd_criu_svm_range_priv_data {
+	uint32_t object_type;
+	uint64_t reserved;
+};
+
+struct kfd_criu_queue_priv_data {
+	uint32_t object_type;
+	uint64_t reserved;
+};
+
+struct kfd_criu_event_priv_data {
+	uint32_t object_type;
+	uint64_t reserved;
+};
+
+/* CRIU - End */
+
 /* Queue Context Management */
 int init_queue(struct queue **q, const struct queue_properties *properties);
 void uninit_queue(struct queue *q);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index af96af174dc4..b5c297be081b 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -468,6 +468,80 @@  struct kfd_ioctl_smi_events_args {
 	__u32 anon_fd;	/* from KFD */
 };
 
+/**************************************************************************************************
+ * CRIU IOCTLs (Checkpoint Restore In Userspace)
+ *
+ * When checkpointing a process, the userspace application will perform:
+ * 1. PROCESS_INFO op to determine current process information
+ * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges)
+ * 3. UNPAUSE op to un-evict all the queues
+ *
+ * When restoring a process, the CRIU userspace application will perform:
+ *
+ * 1. RESTORE op to restore process contents
+ * 2. RESUME op to start the process
+ *
+ * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. If user
+ * application need to perform an UNPAUSE operation to complete or abort a checkpoint.
+ */
+
+enum kfd_criu_op {
+	KFD_CRIU_OP_PROCESS_INFO,
+	KFD_CRIU_OP_CHECKPOINT,
+	KFD_CRIU_OP_UNPAUSE,
+	KFD_CRIU_OP_RESTORE,
+	KFD_CRIU_OP_RESUME,
+};
+
+/**
+ * kfd_ioctl_criu_args - Arguments perform CRIU operation
+ * @devices:		[in/out] User pointer to memory location for devices information
+ * @bos:		[in/out] User pointer to memory location for BOs information
+ * @priv_data:		[in/out] User pointer to memory location for private data
+ * @priv_data_size:	[in/out] Size of priv_data in bytes
+ * @num_devices:	[in/out] Number of GPUs used by process
+ * @num_bos		[in/out] Number of BOs used by process
+ * @num_objects:	[in/out] Number of objects used by process. Objects are opaque to
+ *				 user application
+ * @pid:		[in/out] PID of the process being checkpointed/restored
+ * @op			[in] Type of operation (kfd_criu_op)
+ *
+ * Takes and releases process lock.
+ * Return: 0 on success, -errno on failure
+ */
+struct kfd_ioctl_criu_args {
+	__u64 devices;		/* Used during ops: CHECKPOINT, RESTORE */
+	__u64 bos;		/* Used during ops: CHECKPOINT, RESTORE */
+	__u64 priv_data;	/* Used during ops: CHECKPOINT, RESTORE */
+	__u64 priv_data_size;	/* Used during ops: PROCESS_INFO, RESTORE */
+	__u32 num_devices;	/* Used during ops: PROCESS_INFO, RESTORE */
+	__u32 num_bos;		/* Used during ops: PROCESS_INFO, RESTORE */
+	__u32 num_objects;	/* Used during ops: PROCESS_INFO, RESTORE */
+	__u32 pid;		/* Used during ops: PROCESS_INFO, RESTORE */
+	__u32 op;
+};
+
+struct kfd_criu_device_bucket {
+	__u32 user_gpu_id;
+	__u32 actual_gpu_id;
+	__u32 drm_fd;
+	__u32 pad;
+};
+
+struct kfd_criu_bo_bucket {
+	__u64 addr;
+	__u64 size;
+	__u64 offset;
+	__u64 restored_offset;    /* During restore, updated offset for BO */
+	__u32 gpu_id;
+	__u32 alloc_flags;
+	__u32 dmabuf_fd;
+	__u32 pad;
+};
+
+/* CRIU IOCTLs - END */
+/**************************************************************************************************/
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -742,7 +816,10 @@  struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_SET_XNACK_MODE		\
 		AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
 
+#define AMDKFD_IOC_CRIU_OP			\
+		AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x22
+#define AMDKFD_COMMAND_END		0x23
 
 #endif