diff mbox series

[drm-next,05/14] drm/nouveau: new VM_BIND uapi interfaces

Message ID 20230118061256.2689-6-dakr@redhat.com (mailing list archive)
State New, archived
Headers show
Series DRM GPUVA Manager & Nouveau VM_BIND UAPI | expand

Commit Message

Danilo Krummrich Jan. 18, 2023, 6:12 a.m. UTC
This commit provides the interfaces for the new UAPI motivated by the
Vulkan API. It allows user mode drivers (UMDs) to:

1) Initialize a GPU virtual address (VA) space via the new
   DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
   VA area.

2) Bind and unbind GPU VA space mappings via the new
   DRM_IOCTL_NOUVEAU_VM_BIND ioctl.

3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.

Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
asynchronous processing with DRM syncobjs as synchronization mechanism.

The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.

Co-authored-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
 Documentation/gpu/driver-uapi.rst |   8 ++
 include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
 2 files changed, 224 insertions(+)

Comments

Matthew Brost Jan. 27, 2023, 1:05 a.m. UTC | #1
On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
> This commit provides the interfaces for the new UAPI motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
> 
> 1) Initialize a GPU virtual address (VA) space via the new
>    DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>    VA area.
> 
> 2) Bind and unbind GPU VA space mappings via the new
>    DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> 
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> 
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> asynchronous processing with DRM syncobjs as synchronization mechanism.
> 
> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
> 
> Co-authored-by: Dave Airlie <airlied@redhat.com>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>  Documentation/gpu/driver-uapi.rst |   8 ++
>  include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
>  2 files changed, 224 insertions(+)
> 
> diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
> index 4411e6919a3d..9c7ca6e33a68 100644
> --- a/Documentation/gpu/driver-uapi.rst
> +++ b/Documentation/gpu/driver-uapi.rst
> @@ -6,3 +6,11 @@ drm/i915 uAPI
>  =============
>  
>  .. kernel-doc:: include/uapi/drm/i915_drm.h
> +
> +drm/nouveau uAPI
> +================
> +
> +VM_BIND / EXEC uAPI
> +-------------------
> +
> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..f6e7d40201d4 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>  	__u32 handle;
>  };
>  
> +/**
> + * struct drm_nouveau_sync - sync object
> + *
> + * This structure serves as synchronization mechanism for (potentially)
> + * asynchronous operations such as EXEC or VM_BIND.
> + */
> +struct drm_nouveau_sync {
> +	/**
> +	 * @flags: the flags for a sync object
> +	 *
> +	 * The first 8 bits are used to determine the type of the sync object.
> +	 */
> +	__u32 flags;
> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> +	/**
> +	 * @handle: the handle of the sync object
> +	 */
> +	__u32 handle;
> +	/**
> +	 * @timeline_value:
> +	 *
> +	 * The timeline point of the sync object in case the syncobj is of
> +	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> +	 */
> +	__u64 timeline_value;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_init - GPU VA space init structure
> + *
> + * Used to initialize the GPU's VA space for a user client, telling the kernel
> + * which portion of the VA space is managed by the UMD and kernel respectively.
> + */
> +struct drm_nouveau_vm_init {
> +	/**
> +	 * @unmanaged_addr: start address of the kernel managed VA space region
> +	 */
> +	__u64 unmanaged_addr;
> +	/**
> +	 * @unmanaged_size: size of the kernel managed VA space region in bytes
> +	 */
> +	__u64 unmanaged_size;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> + *
> + * This structure represents a single VM_BIND operation. UMDs should pass
> + * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
> + */
> +struct drm_nouveau_vm_bind_op {
> +	/**
> +	 * @op: the operation type
> +	 */
> +	__u32 op;
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
> + *
> + * The alloc operation is used to reserve a VA space region within the GPU's VA
> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
> + * instruct the kernel to create sparse mappings for the given region.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0

Do you really need this operation? We have no concept of this in Xe,
e.g. we can create a VM and the entire address space is managed exactly
the same.

If this can be removed then the entire concept of regions in the GPUVA
can be removed too (drop struct drm_gpuva_region). I say this because
in Xe as I'm porting over to GPUVA the first thing I'm doing after
drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
address space. To me this seems kinda useless but maybe I'm missing why
you need this for Nouveau. 

Matt

> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
> + *
> + * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
> + * a previously allocated VA space region. If the region is sparse, existing
> + * sparse mappings are overwritten.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
> + *
> + * Unmap an existing mapping in the GPU's VA space. If the region the mapping
> + * is located in is a sparse region, new sparse mappings are created where the
> + * unmapped (memory backed) mapping was mapped previously.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
> +	/**
> +	 * @flags: the flags for a &drm_nouveau_vm_bind_op
> +	 */
> +	__u32 flags;
> +/**
> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
> + *
> + * Indicates that an allocated VA space region should be sparse.
> + */
> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
> +	/**
> +	 * @handle: the handle of the DRM GEM object to map
> +	 */
> +	__u32 handle;
> +	/**
> +	 * @addr:
> +	 *
> +	 * the address the VA space region or (memory backed) mapping should be mapped to
> +	 */
> +	__u64 addr;
> +	/**
> +	 * @bo_offset: the offset within the BO backing the mapping
> +	 */
> +	__u64 bo_offset;
> +	/**
> +	 * @range: the size of the requested mapping in bytes
> +	 */
> +	__u64 range;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
> + */
> +struct drm_nouveau_vm_bind {
> +	/**
> +	 * @op_count: the number of &drm_nouveau_vm_bind_op
> +	 */
> +	__u32 op_count;
> +	/**
> +	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
> +	 */
> +	__u32 flags;
> +/**
> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
> + *
> + * Indicates that the given VM_BIND operation should be executed asynchronously
> + * by the kernel.
> + *
> + * If this flag is not supplied the kernel executes the associated operations
> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
> + */
> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
> +	/**
> +	 * @wait_count: the number of wait &drm_nouveau_syncs
> +	 */
> +	__u32 wait_count;
> +	/**
> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
> +	 */
> +	__u32 sig_count;
> +	/**
> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
> +	 */
> +	__u64 wait_ptr;
> +	/**
> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
> +	 */
> +	__u64 sig_ptr;
> +	/**
> +	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
> +	 */
> +	__u64 op_ptr;
> +};
> +
> +/**
> + * struct drm_nouveau_exec_push - EXEC push operation
> + *
> + * This structure represents a single EXEC push operation. UMDs should pass an
> + * array of this structure via struct drm_nouveau_exec's &push_ptr field.
> + */
> +struct drm_nouveau_exec_push {
> +	/**
> +	 * @va: the virtual address of the push buffer mapping
> +	 */
> +	__u64 va;
> +	/**
> +	 * @va_len: the length of the push buffer mapping
> +	 */
> +	__u64 va_len;
> +};
> +
> +/**
> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
> + */
> +struct drm_nouveau_exec {
> +	/**
> +	 * @channel: the channel to execute the push buffer in
> +	 */
> +	__u32 channel;
> +	/**
> +	 * @push_count: the number of &drm_nouveau_exec_push ops
> +	 */
> +	__u32 push_count;
> +	/**
> +	 * @wait_count: the number of wait &drm_nouveau_syncs
> +	 */
> +	__u32 wait_count;
> +	/**
> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
> +	 */
> +	__u32 sig_count;
> +	/**
> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
> +	 */
> +	__u64 wait_ptr;
> +	/**
> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
> +	 */
> +	__u64 sig_ptr;
> +	/**
> +	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
> +	 */
> +	__u64 push_ptr;
> +};
> +
>  #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>  #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>  #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>  #define DRM_NOUVEAU_NVIF               0x07
>  #define DRM_NOUVEAU_SVM_INIT           0x08
>  #define DRM_NOUVEAU_SVM_BIND           0x09
> +#define DRM_NOUVEAU_VM_INIT            0x10
> +#define DRM_NOUVEAU_VM_BIND            0x11
> +#define DRM_NOUVEAU_EXEC               0x12
>  #define DRM_NOUVEAU_GEM_NEW            0x40
>  #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>  #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>  #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
>  #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>  
> +#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
> +#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
> +#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>  #if defined(__cplusplus)
>  }
>  #endif
> -- 
> 2.39.0
>
Danilo Krummrich Jan. 27, 2023, 1:26 a.m. UTC | #2
On 1/27/23 02:05, Matthew Brost wrote:
> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>> This commit provides the interfaces for the new UAPI motivated by the
>> Vulkan API. It allows user mode drivers (UMDs) to:
>>
>> 1) Initialize a GPU virtual address (VA) space via the new
>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>>     VA area.
>>
>> 2) Bind and unbind GPU VA space mappings via the new
>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>
>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>
>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>> asynchronous processing with DRM syncobjs as synchronization mechanism.
>>
>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>
>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>   include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
>>   2 files changed, 224 insertions(+)
>>
>> diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
>> index 4411e6919a3d..9c7ca6e33a68 100644
>> --- a/Documentation/gpu/driver-uapi.rst
>> +++ b/Documentation/gpu/driver-uapi.rst
>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>   =============
>>   
>>   .. kernel-doc:: include/uapi/drm/i915_drm.h
>> +
>> +drm/nouveau uAPI
>> +================
>> +
>> +VM_BIND / EXEC uAPI
>> +-------------------
>> +
>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>> diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
>> index 853a327433d3..f6e7d40201d4 100644
>> --- a/include/uapi/drm/nouveau_drm.h
>> +++ b/include/uapi/drm/nouveau_drm.h
>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>   	__u32 handle;
>>   };
>>   
>> +/**
>> + * struct drm_nouveau_sync - sync object
>> + *
>> + * This structure serves as synchronization mechanism for (potentially)
>> + * asynchronous operations such as EXEC or VM_BIND.
>> + */
>> +struct drm_nouveau_sync {
>> +	/**
>> +	 * @flags: the flags for a sync object
>> +	 *
>> +	 * The first 8 bits are used to determine the type of the sync object.
>> +	 */
>> +	__u32 flags;
>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>> +	/**
>> +	 * @handle: the handle of the sync object
>> +	 */
>> +	__u32 handle;
>> +	/**
>> +	 * @timeline_value:
>> +	 *
>> +	 * The timeline point of the sync object in case the syncobj is of
>> +	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>> +	 */
>> +	__u64 timeline_value;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>> + *
>> + * Used to initialize the GPU's VA space for a user client, telling the kernel
>> + * which portion of the VA space is managed by the UMD and kernel respectively.
>> + */
>> +struct drm_nouveau_vm_init {
>> +	/**
>> +	 * @unmanaged_addr: start address of the kernel managed VA space region
>> +	 */
>> +	__u64 unmanaged_addr;
>> +	/**
>> +	 * @unmanaged_size: size of the kernel managed VA space region in bytes
>> +	 */
>> +	__u64 unmanaged_size;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>> + *
>> + * This structure represents a single VM_BIND operation. UMDs should pass
>> + * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
>> + */
>> +struct drm_nouveau_vm_bind_op {
>> +	/**
>> +	 * @op: the operation type
>> +	 */
>> +	__u32 op;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>> + *
>> + * The alloc operation is used to reserve a VA space region within the GPU's VA
>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
>> + * instruct the kernel to create sparse mappings for the given region.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
> 
> Do you really need this operation? We have no concept of this in Xe,
> e.g. we can create a VM and the entire address space is managed exactly
> the same.

The idea for alloc/free is to let UMDs allocate a portion of the VA 
space (which I call a region), basically the same thing Vulkan 
represents with a VKBuffer.

It serves two purposes:

1. It gives the kernel (in particular the GPUVA manager) the bounds in 
which it is allowed to merge mappings. E.g. when a user request asks for 
a new mapping and we detect we could merge this mapping with an existing 
one (used in another VKBuffer than the mapping request came for) the 
driver is not allowed to change the page table for the existing mapping 
we want to merge with (assuming that some drivers would need to do this 
in order to merge), because the existing mapping could already be in use 
and by re-mapping it we'd potentially cause a fault on the GPU.

2. It is used for sparse residency in a way that such an allocated VA 
space region can be flagged as sparse, such that the kernel always keeps 
sparse mappings around for the parts of the region that do not contain 
actual memory backed mappings.

If for your driver merging is always OK, creating a single huge region 
would do the trick I guess. Otherwise, we could also add an option to 
the GPUVA manager (or a specific region, which could also be a single 
huge one) within which it never merges.

> 
> If this can be removed then the entire concept of regions in the GPUVA
> can be removed too (drop struct drm_gpuva_region). I say this because
> in Xe as I'm porting over to GPUVA the first thing I'm doing after
> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
> address space. To me this seems kinda useless but maybe I'm missing why
> you need this for Nouveau.
> 
> Matt
> 
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>> + *
>> + * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
>> + * a previously allocated VA space region. If the region is sparse, existing
>> + * sparse mappings are overwritten.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>> + *
>> + * Unmap an existing mapping in the GPU's VA space. If the region the mapping
>> + * is located in is a sparse region, new sparse mappings are created where the
>> + * unmapped (memory backed) mapping was mapped previously.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>> +	/**
>> +	 * @flags: the flags for a &drm_nouveau_vm_bind_op
>> +	 */
>> +	__u32 flags;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>> + *
>> + * Indicates that an allocated VA space region should be sparse.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>> +	/**
>> +	 * @handle: the handle of the DRM GEM object to map
>> +	 */
>> +	__u32 handle;
>> +	/**
>> +	 * @addr:
>> +	 *
>> +	 * the address the VA space region or (memory backed) mapping should be mapped to
>> +	 */
>> +	__u64 addr;
>> +	/**
>> +	 * @bo_offset: the offset within the BO backing the mapping
>> +	 */
>> +	__u64 bo_offset;
>> +	/**
>> +	 * @range: the size of the requested mapping in bytes
>> +	 */
>> +	__u64 range;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
>> + */
>> +struct drm_nouveau_vm_bind {
>> +	/**
>> +	 * @op_count: the number of &drm_nouveau_vm_bind_op
>> +	 */
>> +	__u32 op_count;
>> +	/**
>> +	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>> +	 */
>> +	__u32 flags;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>> + *
>> + * Indicates that the given VM_BIND operation should be executed asynchronously
>> + * by the kernel.
>> + *
>> + * If this flag is not supplied the kernel executes the associated operations
>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>> +	/**
>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>> +	 */
>> +	__u32 wait_count;
>> +	/**
>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u32 sig_count;
>> +	/**
>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>> +	 */
>> +	__u64 wait_ptr;
>> +	/**
>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u64 sig_ptr;
>> +	/**
>> +	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>> +	 */
>> +	__u64 op_ptr;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_exec_push - EXEC push operation
>> + *
>> + * This structure represents a single EXEC push operation. UMDs should pass an
>> + * array of this structure via struct drm_nouveau_exec's &push_ptr field.
>> + */
>> +struct drm_nouveau_exec_push {
>> +	/**
>> +	 * @va: the virtual address of the push buffer mapping
>> +	 */
>> +	__u64 va;
>> +	/**
>> +	 * @va_len: the length of the push buffer mapping
>> +	 */
>> +	__u64 va_len;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>> + */
>> +struct drm_nouveau_exec {
>> +	/**
>> +	 * @channel: the channel to execute the push buffer in
>> +	 */
>> +	__u32 channel;
>> +	/**
>> +	 * @push_count: the number of &drm_nouveau_exec_push ops
>> +	 */
>> +	__u32 push_count;
>> +	/**
>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>> +	 */
>> +	__u32 wait_count;
>> +	/**
>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u32 sig_count;
>> +	/**
>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>> +	 */
>> +	__u64 wait_ptr;
>> +	/**
>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u64 sig_ptr;
>> +	/**
>> +	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
>> +	 */
>> +	__u64 push_ptr;
>> +};
>> +
>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>   #define DRM_NOUVEAU_NVIF               0x07
>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>> +#define DRM_NOUVEAU_VM_INIT            0x10
>> +#define DRM_NOUVEAU_VM_BIND            0x11
>> +#define DRM_NOUVEAU_EXEC               0x12
>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>   
>> +#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>> +#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>> +#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>   #if defined(__cplusplus)
>>   }
>>   #endif
>> -- 
>> 2.39.0
>>
>
Danilo Krummrich Jan. 27, 2023, 1:43 a.m. UTC | #3
On 1/27/23 02:05, Matthew Brost wrote:
> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>> This commit provides the interfaces for the new UAPI motivated by the
>> Vulkan API. It allows user mode drivers (UMDs) to:
>>
>> 1) Initialize a GPU virtual address (VA) space via the new
>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>>     VA area.
>>
>> 2) Bind and unbind GPU VA space mappings via the new
>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>
>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>
>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>> asynchronous processing with DRM syncobjs as synchronization mechanism.
>>
>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>
>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>   include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
>>   2 files changed, 224 insertions(+)
>>
>> diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
>> index 4411e6919a3d..9c7ca6e33a68 100644
>> --- a/Documentation/gpu/driver-uapi.rst
>> +++ b/Documentation/gpu/driver-uapi.rst
>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>   =============
>>   
>>   .. kernel-doc:: include/uapi/drm/i915_drm.h
>> +
>> +drm/nouveau uAPI
>> +================
>> +
>> +VM_BIND / EXEC uAPI
>> +-------------------
>> +
>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>> diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
>> index 853a327433d3..f6e7d40201d4 100644
>> --- a/include/uapi/drm/nouveau_drm.h
>> +++ b/include/uapi/drm/nouveau_drm.h
>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>   	__u32 handle;
>>   };
>>   
>> +/**
>> + * struct drm_nouveau_sync - sync object
>> + *
>> + * This structure serves as synchronization mechanism for (potentially)
>> + * asynchronous operations such as EXEC or VM_BIND.
>> + */
>> +struct drm_nouveau_sync {
>> +	/**
>> +	 * @flags: the flags for a sync object
>> +	 *
>> +	 * The first 8 bits are used to determine the type of the sync object.
>> +	 */
>> +	__u32 flags;
>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>> +	/**
>> +	 * @handle: the handle of the sync object
>> +	 */
>> +	__u32 handle;
>> +	/**
>> +	 * @timeline_value:
>> +	 *
>> +	 * The timeline point of the sync object in case the syncobj is of
>> +	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>> +	 */
>> +	__u64 timeline_value;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>> + *
>> + * Used to initialize the GPU's VA space for a user client, telling the kernel
>> + * which portion of the VA space is managed by the UMD and kernel respectively.
>> + */
>> +struct drm_nouveau_vm_init {
>> +	/**
>> +	 * @unmanaged_addr: start address of the kernel managed VA space region
>> +	 */
>> +	__u64 unmanaged_addr;
>> +	/**
>> +	 * @unmanaged_size: size of the kernel managed VA space region in bytes
>> +	 */
>> +	__u64 unmanaged_size;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>> + *
>> + * This structure represents a single VM_BIND operation. UMDs should pass
>> + * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
>> + */
>> +struct drm_nouveau_vm_bind_op {
>> +	/**
>> +	 * @op: the operation type
>> +	 */
>> +	__u32 op;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>> + *
>> + * The alloc operation is used to reserve a VA space region within the GPU's VA
>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
>> + * instruct the kernel to create sparse mappings for the given region.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
> 
> Do you really need this operation? We have no concept of this in Xe,
> e.g. we can create a VM and the entire address space is managed exactly
> the same.
> 
> If this can be removed then the entire concept of regions in the GPUVA
> can be removed too (drop struct drm_gpuva_region). I say this because
> in Xe as I'm porting over to GPUVA the first thing I'm doing after
> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
> address space. 

Also, since you've been starting to use the code, this [1] is the branch 
I'm pushing my fixes for a v2 to. It already contains the changes for 
the GPUVA manager except for switching away from drm_mm.

[1] 
https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-fixes

> To me this seems kinda useless but maybe I'm missing why
> you need this for Nouveau.
> 
> Matt
> 
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>> + *
>> + * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
>> + * a previously allocated VA space region. If the region is sparse, existing
>> + * sparse mappings are overwritten.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>> + *
>> + * Unmap an existing mapping in the GPU's VA space. If the region the mapping
>> + * is located in is a sparse region, new sparse mappings are created where the
>> + * unmapped (memory backed) mapping was mapped previously.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>> +	/**
>> +	 * @flags: the flags for a &drm_nouveau_vm_bind_op
>> +	 */
>> +	__u32 flags;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>> + *
>> + * Indicates that an allocated VA space region should be sparse.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>> +	/**
>> +	 * @handle: the handle of the DRM GEM object to map
>> +	 */
>> +	__u32 handle;
>> +	/**
>> +	 * @addr:
>> +	 *
>> +	 * the address the VA space region or (memory backed) mapping should be mapped to
>> +	 */
>> +	__u64 addr;
>> +	/**
>> +	 * @bo_offset: the offset within the BO backing the mapping
>> +	 */
>> +	__u64 bo_offset;
>> +	/**
>> +	 * @range: the size of the requested mapping in bytes
>> +	 */
>> +	__u64 range;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
>> + */
>> +struct drm_nouveau_vm_bind {
>> +	/**
>> +	 * @op_count: the number of &drm_nouveau_vm_bind_op
>> +	 */
>> +	__u32 op_count;
>> +	/**
>> +	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>> +	 */
>> +	__u32 flags;
>> +/**
>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>> + *
>> + * Indicates that the given VM_BIND operation should be executed asynchronously
>> + * by the kernel.
>> + *
>> + * If this flag is not supplied the kernel executes the associated operations
>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>> + */
>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>> +	/**
>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>> +	 */
>> +	__u32 wait_count;
>> +	/**
>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u32 sig_count;
>> +	/**
>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>> +	 */
>> +	__u64 wait_ptr;
>> +	/**
>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u64 sig_ptr;
>> +	/**
>> +	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>> +	 */
>> +	__u64 op_ptr;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_exec_push - EXEC push operation
>> + *
>> + * This structure represents a single EXEC push operation. UMDs should pass an
>> + * array of this structure via struct drm_nouveau_exec's &push_ptr field.
>> + */
>> +struct drm_nouveau_exec_push {
>> +	/**
>> +	 * @va: the virtual address of the push buffer mapping
>> +	 */
>> +	__u64 va;
>> +	/**
>> +	 * @va_len: the length of the push buffer mapping
>> +	 */
>> +	__u64 va_len;
>> +};
>> +
>> +/**
>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>> + */
>> +struct drm_nouveau_exec {
>> +	/**
>> +	 * @channel: the channel to execute the push buffer in
>> +	 */
>> +	__u32 channel;
>> +	/**
>> +	 * @push_count: the number of &drm_nouveau_exec_push ops
>> +	 */
>> +	__u32 push_count;
>> +	/**
>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>> +	 */
>> +	__u32 wait_count;
>> +	/**
>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u32 sig_count;
>> +	/**
>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>> +	 */
>> +	__u64 wait_ptr;
>> +	/**
>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>> +	 */
>> +	__u64 sig_ptr;
>> +	/**
>> +	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
>> +	 */
>> +	__u64 push_ptr;
>> +};
>> +
>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>   #define DRM_NOUVEAU_NVIF               0x07
>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>> +#define DRM_NOUVEAU_VM_INIT            0x10
>> +#define DRM_NOUVEAU_VM_BIND            0x11
>> +#define DRM_NOUVEAU_EXEC               0x12
>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>   
>> +#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>> +#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>> +#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>   #if defined(__cplusplus)
>>   }
>>   #endif
>> -- 
>> 2.39.0
>>
>
Matthew Brost Jan. 27, 2023, 3:21 a.m. UTC | #4
On Fri, Jan 27, 2023 at 02:43:30AM +0100, Danilo Krummrich wrote:
> 
> 
> On 1/27/23 02:05, Matthew Brost wrote:
> > On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
> > > This commit provides the interfaces for the new UAPI motivated by the
> > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > 
> > > 1) Initialize a GPU virtual address (VA) space via the new
> > >     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
> > >     VA area.
> > > 
> > > 2) Bind and unbind GPU VA space mappings via the new
> > >     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > 
> > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> > > 
> > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> > > asynchronous processing with DRM syncobjs as synchronization mechanism.
> > > 
> > > The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> > > DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
> > > 
> > > Co-authored-by: Dave Airlie <airlied@redhat.com>
> > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > ---
> > >   Documentation/gpu/driver-uapi.rst |   8 ++
> > >   include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
> > >   2 files changed, 224 insertions(+)
> > > 
> > > diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
> > > index 4411e6919a3d..9c7ca6e33a68 100644
> > > --- a/Documentation/gpu/driver-uapi.rst
> > > +++ b/Documentation/gpu/driver-uapi.rst
> > > @@ -6,3 +6,11 @@ drm/i915 uAPI
> > >   =============
> > >   .. kernel-doc:: include/uapi/drm/i915_drm.h
> > > +
> > > +drm/nouveau uAPI
> > > +================
> > > +
> > > +VM_BIND / EXEC uAPI
> > > +-------------------
> > > +
> > > +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> > > diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
> > > index 853a327433d3..f6e7d40201d4 100644
> > > --- a/include/uapi/drm/nouveau_drm.h
> > > +++ b/include/uapi/drm/nouveau_drm.h
> > > @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
> > >   	__u32 handle;
> > >   };
> > > +/**
> > > + * struct drm_nouveau_sync - sync object
> > > + *
> > > + * This structure serves as synchronization mechanism for (potentially)
> > > + * asynchronous operations such as EXEC or VM_BIND.
> > > + */
> > > +struct drm_nouveau_sync {
> > > +	/**
> > > +	 * @flags: the flags for a sync object
> > > +	 *
> > > +	 * The first 8 bits are used to determine the type of the sync object.
> > > +	 */
> > > +	__u32 flags;
> > > +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> > > +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> > > +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> > > +	/**
> > > +	 * @handle: the handle of the sync object
> > > +	 */
> > > +	__u32 handle;
> > > +	/**
> > > +	 * @timeline_value:
> > > +	 *
> > > +	 * The timeline point of the sync object in case the syncobj is of
> > > +	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> > > +	 */
> > > +	__u64 timeline_value;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_nouveau_vm_init - GPU VA space init structure
> > > + *
> > > + * Used to initialize the GPU's VA space for a user client, telling the kernel
> > > + * which portion of the VA space is managed by the UMD and kernel respectively.
> > > + */
> > > +struct drm_nouveau_vm_init {
> > > +	/**
> > > +	 * @unmanaged_addr: start address of the kernel managed VA space region
> > > +	 */
> > > +	__u64 unmanaged_addr;
> > > +	/**
> > > +	 * @unmanaged_size: size of the kernel managed VA space region in bytes
> > > +	 */
> > > +	__u64 unmanaged_size;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> > > + *
> > > + * This structure represents a single VM_BIND operation. UMDs should pass
> > > + * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
> > > + */
> > > +struct drm_nouveau_vm_bind_op {
> > > +	/**
> > > +	 * @op: the operation type
> > > +	 */
> > > +	__u32 op;
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
> > > + *
> > > + * The alloc operation is used to reserve a VA space region within the GPU's VA
> > > + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
> > > + * instruct the kernel to create sparse mappings for the given region.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
> > 
> > Do you really need this operation? We have no concept of this in Xe,
> > e.g. we can create a VM and the entire address space is managed exactly
> > the same.
> > 
> > If this can be removed then the entire concept of regions in the GPUVA
> > can be removed too (drop struct drm_gpuva_region). I say this because
> > in Xe as I'm porting over to GPUVA the first thing I'm doing after
> > drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
> > address space.
> 
> Also, since you've been starting to use the code, this [1] is the branch I'm
> pushing my fixes for a v2 to. It already contains the changes for the GPUVA
> manager except for switching away from drm_mm.
> 
> [1] https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-fixes
> 

I will take a look at this branch. I believe you are on our Xe gitlab
project (working on getting this public) so you can comment on any MR I
post there, I expect to have something posted early next week to port Xe
to the gpuva.

Also I assume you are dri-devel IRC, what is your handle? Mine is
mbrost. It might be useful to chat in real time.

Matt

> > To me this seems kinda useless but maybe I'm missing why
> > you need this for Nouveau.
> > 
> > Matt
> > 
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
> > > + *
> > > + * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
> > > + * a previously allocated VA space region. If the region is sparse, existing
> > > + * sparse mappings are overwritten.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
> > > + *
> > > + * Unmap an existing mapping in the GPU's VA space. If the region the mapping
> > > + * is located in is a sparse region, new sparse mappings are created where the
> > > + * unmapped (memory backed) mapping was mapped previously.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
> > > +	/**
> > > +	 * @flags: the flags for a &drm_nouveau_vm_bind_op
> > > +	 */
> > > +	__u32 flags;
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_SPARSE:
> > > + *
> > > + * Indicates that an allocated VA space region should be sparse.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
> > > +	/**
> > > +	 * @handle: the handle of the DRM GEM object to map
> > > +	 */
> > > +	__u32 handle;
> > > +	/**
> > > +	 * @addr:
> > > +	 *
> > > +	 * the address the VA space region or (memory backed) mapping should be mapped to
> > > +	 */
> > > +	__u64 addr;
> > > +	/**
> > > +	 * @bo_offset: the offset within the BO backing the mapping
> > > +	 */
> > > +	__u64 bo_offset;
> > > +	/**
> > > +	 * @range: the size of the requested mapping in bytes
> > > +	 */
> > > +	__u64 range;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
> > > + */
> > > +struct drm_nouveau_vm_bind {
> > > +	/**
> > > +	 * @op_count: the number of &drm_nouveau_vm_bind_op
> > > +	 */
> > > +	__u32 op_count;
> > > +	/**
> > > +	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
> > > +	 */
> > > +	__u32 flags;
> > > +/**
> > > + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
> > > + *
> > > + * Indicates that the given VM_BIND operation should be executed asynchronously
> > > + * by the kernel.
> > > + *
> > > + * If this flag is not supplied the kernel executes the associated operations
> > > + * synchronously and doesn't accept any &drm_nouveau_sync objects.
> > > + */
> > > +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
> > > +	/**
> > > +	 * @wait_count: the number of wait &drm_nouveau_syncs
> > > +	 */
> > > +	__u32 wait_count;
> > > +	/**
> > > +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
> > > +	 */
> > > +	__u32 sig_count;
> > > +	/**
> > > +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
> > > +	 */
> > > +	__u64 wait_ptr;
> > > +	/**
> > > +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
> > > +	 */
> > > +	__u64 sig_ptr;
> > > +	/**
> > > +	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
> > > +	 */
> > > +	__u64 op_ptr;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_nouveau_exec_push - EXEC push operation
> > > + *
> > > + * This structure represents a single EXEC push operation. UMDs should pass an
> > > + * array of this structure via struct drm_nouveau_exec's &push_ptr field.
> > > + */
> > > +struct drm_nouveau_exec_push {
> > > +	/**
> > > +	 * @va: the virtual address of the push buffer mapping
> > > +	 */
> > > +	__u64 va;
> > > +	/**
> > > +	 * @va_len: the length of the push buffer mapping
> > > +	 */
> > > +	__u64 va_len;
> > > +};
> > > +
> > > +/**
> > > + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
> > > + */
> > > +struct drm_nouveau_exec {
> > > +	/**
> > > +	 * @channel: the channel to execute the push buffer in
> > > +	 */
> > > +	__u32 channel;
> > > +	/**
> > > +	 * @push_count: the number of &drm_nouveau_exec_push ops
> > > +	 */
> > > +	__u32 push_count;
> > > +	/**
> > > +	 * @wait_count: the number of wait &drm_nouveau_syncs
> > > +	 */
> > > +	__u32 wait_count;
> > > +	/**
> > > +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
> > > +	 */
> > > +	__u32 sig_count;
> > > +	/**
> > > +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
> > > +	 */
> > > +	__u64 wait_ptr;
> > > +	/**
> > > +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
> > > +	 */
> > > +	__u64 sig_ptr;
> > > +	/**
> > > +	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
> > > +	 */
> > > +	__u64 push_ptr;
> > > +};
> > > +
> > >   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
> > >   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
> > >   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
> > > @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
> > >   #define DRM_NOUVEAU_NVIF               0x07
> > >   #define DRM_NOUVEAU_SVM_INIT           0x08
> > >   #define DRM_NOUVEAU_SVM_BIND           0x09
> > > +#define DRM_NOUVEAU_VM_INIT            0x10
> > > +#define DRM_NOUVEAU_VM_BIND            0x11
> > > +#define DRM_NOUVEAU_EXEC               0x12
> > >   #define DRM_NOUVEAU_GEM_NEW            0x40
> > >   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
> > >   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
> > > @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
> > >   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
> > >   #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
> > > +#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
> > > +#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
> > > +#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
> > >   #if defined(__cplusplus)
> > >   }
> > >   #endif
> > > -- 
> > > 2.39.0
> > > 
> > 
>
Danilo Krummrich Jan. 27, 2023, 3:33 a.m. UTC | #5
On 1/27/23 04:21, Matthew Brost wrote:
> On Fri, Jan 27, 2023 at 02:43:30AM +0100, Danilo Krummrich wrote:
>>
>>
>> On 1/27/23 02:05, Matthew Brost wrote:
>>> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>>>> This commit provides the interfaces for the new UAPI motivated by the
>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>
>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>      DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>>>>      VA area.
>>>>
>>>> 2) Bind and unbind GPU VA space mappings via the new
>>>>      DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>
>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>>>
>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>>>> asynchronous processing with DRM syncobjs as synchronization mechanism.
>>>>
>>>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>>>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>>>
>>>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>    Documentation/gpu/driver-uapi.rst |   8 ++
>>>>    include/uapi/drm/nouveau_drm.h    | 216 ++++++++++++++++++++++++++++++
>>>>    2 files changed, 224 insertions(+)
>>>>
>>>> diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
>>>> index 4411e6919a3d..9c7ca6e33a68 100644
>>>> --- a/Documentation/gpu/driver-uapi.rst
>>>> +++ b/Documentation/gpu/driver-uapi.rst
>>>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>>>    =============
>>>>    .. kernel-doc:: include/uapi/drm/i915_drm.h
>>>> +
>>>> +drm/nouveau uAPI
>>>> +================
>>>> +
>>>> +VM_BIND / EXEC uAPI
>>>> +-------------------
>>>> +
>>>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>>>> diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
>>>> index 853a327433d3..f6e7d40201d4 100644
>>>> --- a/include/uapi/drm/nouveau_drm.h
>>>> +++ b/include/uapi/drm/nouveau_drm.h
>>>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>>>    	__u32 handle;
>>>>    };
>>>> +/**
>>>> + * struct drm_nouveau_sync - sync object
>>>> + *
>>>> + * This structure serves as synchronization mechanism for (potentially)
>>>> + * asynchronous operations such as EXEC or VM_BIND.
>>>> + */
>>>> +struct drm_nouveau_sync {
>>>> +	/**
>>>> +	 * @flags: the flags for a sync object
>>>> +	 *
>>>> +	 * The first 8 bits are used to determine the type of the sync object.
>>>> +	 */
>>>> +	__u32 flags;
>>>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>>>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>>>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>>>> +	/**
>>>> +	 * @handle: the handle of the sync object
>>>> +	 */
>>>> +	__u32 handle;
>>>> +	/**
>>>> +	 * @timeline_value:
>>>> +	 *
>>>> +	 * The timeline point of the sync object in case the syncobj is of
>>>> +	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>>>> +	 */
>>>> +	__u64 timeline_value;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>>>> + *
>>>> + * Used to initialize the GPU's VA space for a user client, telling the kernel
>>>> + * which portion of the VA space is managed by the UMD and kernel respectively.
>>>> + */
>>>> +struct drm_nouveau_vm_init {
>>>> +	/**
>>>> +	 * @unmanaged_addr: start address of the kernel managed VA space region
>>>> +	 */
>>>> +	__u64 unmanaged_addr;
>>>> +	/**
>>>> +	 * @unmanaged_size: size of the kernel managed VA space region in bytes
>>>> +	 */
>>>> +	__u64 unmanaged_size;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>>>> + *
>>>> + * This structure represents a single VM_BIND operation. UMDs should pass
>>>> + * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
>>>> + */
>>>> +struct drm_nouveau_vm_bind_op {
>>>> +	/**
>>>> +	 * @op: the operation type
>>>> +	 */
>>>> +	__u32 op;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>>>> + *
>>>> + * The alloc operation is used to reserve a VA space region within the GPU's VA
>>>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
>>>> + * instruct the kernel to create sparse mappings for the given region.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
>>>
>>> Do you really need this operation? We have no concept of this in Xe,
>>> e.g. we can create a VM and the entire address space is managed exactly
>>> the same.
>>>
>>> If this can be removed then the entire concept of regions in the GPUVA
>>> can be removed too (drop struct drm_gpuva_region). I say this because
>>> in Xe as I'm porting over to GPUVA the first thing I'm doing after
>>> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
>>> address space.
>>
>> Also, since you've been starting to use the code, this [1] is the branch I'm
>> pushing my fixes for a v2 to. It already contains the changes for the GPUVA
>> manager except for switching away from drm_mm.
>>
>> [1] https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-fixes
>>
> 
> I will take a look at this branch. I believe you are on our Xe gitlab
> project (working on getting this public) so you can comment on any MR I
> post there, I expect to have something posted early next week to port Xe
> to the gpuva.
> 

Yes, I am.

> Also I assume you are dri-devel IRC, what is your handle? Mine is
> mbrost. It might be useful to chat in real time.

Mine is dakr, I just pinged you in #dri-devel, but it seems your client 
timed out shortly after, so I expect it didn't reach you.

- Danilo

> 
> Matt
> 
>>> To me this seems kinda useless but maybe I'm missing why
>>> you need this for Nouveau.
>>>
>>> Matt
>>>
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>>>> + *
>>>> + * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
>>>> + * a previously allocated VA space region. If the region is sparse, existing
>>>> + * sparse mappings are overwritten.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>>>> + *
>>>> + * Unmap an existing mapping in the GPU's VA space. If the region the mapping
>>>> + * is located in is a sparse region, new sparse mappings are created where the
>>>> + * unmapped (memory backed) mapping was mapped previously.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>>>> +	/**
>>>> +	 * @flags: the flags for a &drm_nouveau_vm_bind_op
>>>> +	 */
>>>> +	__u32 flags;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>>>> + *
>>>> + * Indicates that an allocated VA space region should be sparse.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>>>> +	/**
>>>> +	 * @handle: the handle of the DRM GEM object to map
>>>> +	 */
>>>> +	__u32 handle;
>>>> +	/**
>>>> +	 * @addr:
>>>> +	 *
>>>> +	 * the address the VA space region or (memory backed) mapping should be mapped to
>>>> +	 */
>>>> +	__u64 addr;
>>>> +	/**
>>>> +	 * @bo_offset: the offset within the BO backing the mapping
>>>> +	 */
>>>> +	__u64 bo_offset;
>>>> +	/**
>>>> +	 * @range: the size of the requested mapping in bytes
>>>> +	 */
>>>> +	__u64 range;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
>>>> + */
>>>> +struct drm_nouveau_vm_bind {
>>>> +	/**
>>>> +	 * @op_count: the number of &drm_nouveau_vm_bind_op
>>>> +	 */
>>>> +	__u32 op_count;
>>>> +	/**
>>>> +	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>>>> +	 */
>>>> +	__u32 flags;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>>>> + *
>>>> + * Indicates that the given VM_BIND operation should be executed asynchronously
>>>> + * by the kernel.
>>>> + *
>>>> + * If this flag is not supplied the kernel executes the associated operations
>>>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>>>> +	/**
>>>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>>>> +	 */
>>>> +	__u32 wait_count;
>>>> +	/**
>>>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>>>> +	 */
>>>> +	__u32 sig_count;
>>>> +	/**
>>>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>> +	 */
>>>> +	__u64 wait_ptr;
>>>> +	/**
>>>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>>> +	 */
>>>> +	__u64 sig_ptr;
>>>> +	/**
>>>> +	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>>>> +	 */
>>>> +	__u64 op_ptr;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_exec_push - EXEC push operation
>>>> + *
>>>> + * This structure represents a single EXEC push operation. UMDs should pass an
>>>> + * array of this structure via struct drm_nouveau_exec's &push_ptr field.
>>>> + */
>>>> +struct drm_nouveau_exec_push {
>>>> +	/**
>>>> +	 * @va: the virtual address of the push buffer mapping
>>>> +	 */
>>>> +	__u64 va;
>>>> +	/**
>>>> +	 * @va_len: the length of the push buffer mapping
>>>> +	 */
>>>> +	__u64 va_len;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>>>> + */
>>>> +struct drm_nouveau_exec {
>>>> +	/**
>>>> +	 * @channel: the channel to execute the push buffer in
>>>> +	 */
>>>> +	__u32 channel;
>>>> +	/**
>>>> +	 * @push_count: the number of &drm_nouveau_exec_push ops
>>>> +	 */
>>>> +	__u32 push_count;
>>>> +	/**
>>>> +	 * @wait_count: the number of wait &drm_nouveau_syncs
>>>> +	 */
>>>> +	__u32 wait_count;
>>>> +	/**
>>>> +	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
>>>> +	 */
>>>> +	__u32 sig_count;
>>>> +	/**
>>>> +	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>> +	 */
>>>> +	__u64 wait_ptr;
>>>> +	/**
>>>> +	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>>> +	 */
>>>> +	__u64 sig_ptr;
>>>> +	/**
>>>> +	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
>>>> +	 */
>>>> +	__u64 push_ptr;
>>>> +};
>>>> +
>>>>    #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>>>    #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>>>    #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>>>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>>>    #define DRM_NOUVEAU_NVIF               0x07
>>>>    #define DRM_NOUVEAU_SVM_INIT           0x08
>>>>    #define DRM_NOUVEAU_SVM_BIND           0x09
>>>> +#define DRM_NOUVEAU_VM_INIT            0x10
>>>> +#define DRM_NOUVEAU_VM_BIND            0x11
>>>> +#define DRM_NOUVEAU_EXEC               0x12
>>>>    #define DRM_NOUVEAU_GEM_NEW            0x40
>>>>    #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>>>    #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>>>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>>>    #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
>>>>    #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>>> +#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>>>> +#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>>>> +#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>>>    #if defined(__cplusplus)
>>>>    }
>>>>    #endif
>>>> -- 
>>>> 2.39.0
>>>>
>>>
>>
>
Christian König Jan. 27, 2023, 7:55 a.m. UTC | #6
Am 27.01.23 um 02:26 schrieb Danilo Krummrich:
> On 1/27/23 02:05, Matthew Brost wrote:
>> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>>> This commit provides the interfaces for the new UAPI motivated by the
>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>
>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>>>     VA area.
>>>
>>> 2) Bind and unbind GPU VA space mappings via the new
>>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>
>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>>
>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>>> asynchronous processing with DRM syncobjs as synchronization mechanism.
>>>
>>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>>
>>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>> ---
>>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>>   include/uapi/drm/nouveau_drm.h    | 216 
>>> ++++++++++++++++++++++++++++++
>>>   2 files changed, 224 insertions(+)
>>>
>>> diff --git a/Documentation/gpu/driver-uapi.rst 
>>> b/Documentation/gpu/driver-uapi.rst
>>> index 4411e6919a3d..9c7ca6e33a68 100644
>>> --- a/Documentation/gpu/driver-uapi.rst
>>> +++ b/Documentation/gpu/driver-uapi.rst
>>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>>   =============
>>>     .. kernel-doc:: include/uapi/drm/i915_drm.h
>>> +
>>> +drm/nouveau uAPI
>>> +================
>>> +
>>> +VM_BIND / EXEC uAPI
>>> +-------------------
>>> +
>>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>>> diff --git a/include/uapi/drm/nouveau_drm.h 
>>> b/include/uapi/drm/nouveau_drm.h
>>> index 853a327433d3..f6e7d40201d4 100644
>>> --- a/include/uapi/drm/nouveau_drm.h
>>> +++ b/include/uapi/drm/nouveau_drm.h
>>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>>       __u32 handle;
>>>   };
>>>   +/**
>>> + * struct drm_nouveau_sync - sync object
>>> + *
>>> + * This structure serves as synchronization mechanism for 
>>> (potentially)
>>> + * asynchronous operations such as EXEC or VM_BIND.
>>> + */
>>> +struct drm_nouveau_sync {
>>> +    /**
>>> +     * @flags: the flags for a sync object
>>> +     *
>>> +     * The first 8 bits are used to determine the type of the sync 
>>> object.
>>> +     */
>>> +    __u32 flags;
>>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>>> +    /**
>>> +     * @handle: the handle of the sync object
>>> +     */
>>> +    __u32 handle;
>>> +    /**
>>> +     * @timeline_value:
>>> +     *
>>> +     * The timeline point of the sync object in case the syncobj is of
>>> +     * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>>> +     */
>>> +    __u64 timeline_value;
>>> +};
>>> +
>>> +/**
>>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>>> + *
>>> + * Used to initialize the GPU's VA space for a user client, telling 
>>> the kernel
>>> + * which portion of the VA space is managed by the UMD and kernel 
>>> respectively.
>>> + */
>>> +struct drm_nouveau_vm_init {
>>> +    /**
>>> +     * @unmanaged_addr: start address of the kernel managed VA 
>>> space region
>>> +     */
>>> +    __u64 unmanaged_addr;
>>> +    /**
>>> +     * @unmanaged_size: size of the kernel managed VA space region 
>>> in bytes
>>> +     */
>>> +    __u64 unmanaged_size;
>>> +};
>>> +
>>> +/**
>>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>>> + *
>>> + * This structure represents a single VM_BIND operation. UMDs 
>>> should pass
>>> + * an array of this structure via struct drm_nouveau_vm_bind's 
>>> &op_ptr field.
>>> + */
>>> +struct drm_nouveau_vm_bind_op {
>>> +    /**
>>> +     * @op: the operation type
>>> +     */
>>> +    __u32 op;
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>>> + *
>>> + * The alloc operation is used to reserve a VA space region within 
>>> the GPU's VA
>>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be 
>>> passed to
>>> + * instruct the kernel to create sparse mappings for the given region.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
>>
>> Do you really need this operation? We have no concept of this in Xe,
>> e.g. we can create a VM and the entire address space is managed exactly
>> the same.
>
> The idea for alloc/free is to let UMDs allocate a portion of the VA 
> space (which I call a region), basically the same thing Vulkan 
> represents with a VKBuffer.

If that's mangled into the same component/interface then I can say from 
experience that this is a pretty bad idea. We have tried something 
similar with radeon and it turned out horrible.

What you want is one component for tracking the VA allocations (drm_mm 
based) and a different component/interface for tracking the VA mappings 
(probably rb tree based).

amdgpu has even gotten so far that the VA allocations are tracked in 
libdrm in userspace.

Regards,
Christian.

>
> It serves two purposes:
>
> 1. It gives the kernel (in particular the GPUVA manager) the bounds in 
> which it is allowed to merge mappings. E.g. when a user request asks 
> for a new mapping and we detect we could merge this mapping with an 
> existing one (used in another VKBuffer than the mapping request came 
> for) the driver is not allowed to change the page table for the 
> existing mapping we want to merge with (assuming that some drivers 
> would need to do this in order to merge), because the existing mapping 
> could already be in use and by re-mapping it we'd potentially cause a 
> fault on the GPU.
>
> 2. It is used for sparse residency in a way that such an allocated VA 
> space region can be flagged as sparse, such that the kernel always 
> keeps sparse mappings around for the parts of the region that do not 
> contain actual memory backed mappings.
>
> If for your driver merging is always OK, creating a single huge region 
> would do the trick I guess. Otherwise, we could also add an option to 
> the GPUVA manager (or a specific region, which could also be a single 
> huge one) within which it never merges.
>
>>
>> If this can be removed then the entire concept of regions in the GPUVA
>> can be removed too (drop struct drm_gpuva_region). I say this because
>> in Xe as I'm porting over to GPUVA the first thing I'm doing after
>> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
>> address space. To me this seems kinda useless but maybe I'm missing why
>> you need this for Nouveau.
>>
>> Matt
>>
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>>> + *
>>> + * Map a GEM object to the GPU's VA space. The mapping must be 
>>> fully enclosed by
>>> + * a previously allocated VA space region. If the region is sparse, 
>>> existing
>>> + * sparse mappings are overwritten.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>>> + *
>>> + * Unmap an existing mapping in the GPU's VA space. If the region 
>>> the mapping
>>> + * is located in is a sparse region, new sparse mappings are 
>>> created where the
>>> + * unmapped (memory backed) mapping was mapped previously.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>>> +    /**
>>> +     * @flags: the flags for a &drm_nouveau_vm_bind_op
>>> +     */
>>> +    __u32 flags;
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>>> + *
>>> + * Indicates that an allocated VA space region should be sparse.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>>> +    /**
>>> +     * @handle: the handle of the DRM GEM object to map
>>> +     */
>>> +    __u32 handle;
>>> +    /**
>>> +     * @addr:
>>> +     *
>>> +     * the address the VA space region or (memory backed) mapping 
>>> should be mapped to
>>> +     */
>>> +    __u64 addr;
>>> +    /**
>>> +     * @bo_offset: the offset within the BO backing the mapping
>>> +     */
>>> +    __u64 bo_offset;
>>> +    /**
>>> +     * @range: the size of the requested mapping in bytes
>>> +     */
>>> +    __u64 range;
>>> +};
>>> +
>>> +/**
>>> + * struct drm_nouveau_vm_bind - structure for 
>>> DRM_IOCTL_NOUVEAU_VM_BIND
>>> + */
>>> +struct drm_nouveau_vm_bind {
>>> +    /**
>>> +     * @op_count: the number of &drm_nouveau_vm_bind_op
>>> +     */
>>> +    __u32 op_count;
>>> +    /**
>>> +     * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>>> +     */
>>> +    __u32 flags;
>>> +/**
>>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>>> + *
>>> + * Indicates that the given VM_BIND operation should be executed 
>>> asynchronously
>>> + * by the kernel.
>>> + *
>>> + * If this flag is not supplied the kernel executes the associated 
>>> operations
>>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>>> + */
>>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>>> +    /**
>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>> +     */
>>> +    __u32 wait_count;
>>> +    /**
>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal when 
>>> finished
>>> +     */
>>> +    __u32 sig_count;
>>> +    /**
>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>> +     */
>>> +    __u64 wait_ptr;
>>> +    /**
>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>> +     */
>>> +    __u64 sig_ptr;
>>> +    /**
>>> +     * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>>> +     */
>>> +    __u64 op_ptr;
>>> +};
>>> +
>>> +/**
>>> + * struct drm_nouveau_exec_push - EXEC push operation
>>> + *
>>> + * This structure represents a single EXEC push operation. UMDs 
>>> should pass an
>>> + * array of this structure via struct drm_nouveau_exec's &push_ptr 
>>> field.
>>> + */
>>> +struct drm_nouveau_exec_push {
>>> +    /**
>>> +     * @va: the virtual address of the push buffer mapping
>>> +     */
>>> +    __u64 va;
>>> +    /**
>>> +     * @va_len: the length of the push buffer mapping
>>> +     */
>>> +    __u64 va_len;
>>> +};
>>> +
>>> +/**
>>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>>> + */
>>> +struct drm_nouveau_exec {
>>> +    /**
>>> +     * @channel: the channel to execute the push buffer in
>>> +     */
>>> +    __u32 channel;
>>> +    /**
>>> +     * @push_count: the number of &drm_nouveau_exec_push ops
>>> +     */
>>> +    __u32 push_count;
>>> +    /**
>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>> +     */
>>> +    __u32 wait_count;
>>> +    /**
>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal when 
>>> finished
>>> +     */
>>> +    __u32 sig_count;
>>> +    /**
>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>> +     */
>>> +    __u64 wait_ptr;
>>> +    /**
>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>> +     */
>>> +    __u64 sig_ptr;
>>> +    /**
>>> +     * @push_ptr: pointer to &drm_nouveau_exec_push ops
>>> +     */
>>> +    __u64 push_ptr;
>>> +};
>>> +
>>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>>   #define DRM_NOUVEAU_NVIF               0x07
>>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>>> +#define DRM_NOUVEAU_VM_INIT            0x10
>>> +#define DRM_NOUVEAU_VM_BIND            0x11
>>> +#define DRM_NOUVEAU_EXEC               0x12
>>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW 
>>> (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct 
>>> drm_nouveau_gem_cpu_fini)
>>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO DRM_IOWR(DRM_COMMAND_BASE + 
>>> DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>>   +#define DRM_IOCTL_NOUVEAU_VM_INIT DRM_IOWR(DRM_COMMAND_BASE + 
>>> DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>>> +#define DRM_IOCTL_NOUVEAU_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + 
>>> DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>>> +#define DRM_IOCTL_NOUVEAU_EXEC DRM_IOWR(DRM_COMMAND_BASE + 
>>> DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>>   #if defined(__cplusplus)
>>>   }
>>>   #endif
>>> -- 
>>> 2.39.0
>>>
>>
>
Danilo Krummrich Jan. 27, 2023, 1:12 p.m. UTC | #7
On 1/27/23 08:55, Christian König wrote:
> Am 27.01.23 um 02:26 schrieb Danilo Krummrich:
>> On 1/27/23 02:05, Matthew Brost wrote:
>>> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>>>> This commit provides the interfaces for the new UAPI motivated by the
>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>
>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>>>>     VA area.
>>>>
>>>> 2) Bind and unbind GPU VA space mappings via the new
>>>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>
>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>>>
>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>>>> asynchronous processing with DRM syncobjs as synchronization mechanism.
>>>>
>>>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>>>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>>>
>>>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>>>   include/uapi/drm/nouveau_drm.h    | 216 
>>>> ++++++++++++++++++++++++++++++
>>>>   2 files changed, 224 insertions(+)
>>>>
>>>> diff --git a/Documentation/gpu/driver-uapi.rst 
>>>> b/Documentation/gpu/driver-uapi.rst
>>>> index 4411e6919a3d..9c7ca6e33a68 100644
>>>> --- a/Documentation/gpu/driver-uapi.rst
>>>> +++ b/Documentation/gpu/driver-uapi.rst
>>>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>>>   =============
>>>>     .. kernel-doc:: include/uapi/drm/i915_drm.h
>>>> +
>>>> +drm/nouveau uAPI
>>>> +================
>>>> +
>>>> +VM_BIND / EXEC uAPI
>>>> +-------------------
>>>> +
>>>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>>>> diff --git a/include/uapi/drm/nouveau_drm.h 
>>>> b/include/uapi/drm/nouveau_drm.h
>>>> index 853a327433d3..f6e7d40201d4 100644
>>>> --- a/include/uapi/drm/nouveau_drm.h
>>>> +++ b/include/uapi/drm/nouveau_drm.h
>>>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>>>       __u32 handle;
>>>>   };
>>>>   +/**
>>>> + * struct drm_nouveau_sync - sync object
>>>> + *
>>>> + * This structure serves as synchronization mechanism for 
>>>> (potentially)
>>>> + * asynchronous operations such as EXEC or VM_BIND.
>>>> + */
>>>> +struct drm_nouveau_sync {
>>>> +    /**
>>>> +     * @flags: the flags for a sync object
>>>> +     *
>>>> +     * The first 8 bits are used to determine the type of the sync 
>>>> object.
>>>> +     */
>>>> +    __u32 flags;
>>>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>>>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>>>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>>>> +    /**
>>>> +     * @handle: the handle of the sync object
>>>> +     */
>>>> +    __u32 handle;
>>>> +    /**
>>>> +     * @timeline_value:
>>>> +     *
>>>> +     * The timeline point of the sync object in case the syncobj is of
>>>> +     * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>>>> +     */
>>>> +    __u64 timeline_value;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>>>> + *
>>>> + * Used to initialize the GPU's VA space for a user client, telling 
>>>> the kernel
>>>> + * which portion of the VA space is managed by the UMD and kernel 
>>>> respectively.
>>>> + */
>>>> +struct drm_nouveau_vm_init {
>>>> +    /**
>>>> +     * @unmanaged_addr: start address of the kernel managed VA 
>>>> space region
>>>> +     */
>>>> +    __u64 unmanaged_addr;
>>>> +    /**
>>>> +     * @unmanaged_size: size of the kernel managed VA space region 
>>>> in bytes
>>>> +     */
>>>> +    __u64 unmanaged_size;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>>>> + *
>>>> + * This structure represents a single VM_BIND operation. UMDs 
>>>> should pass
>>>> + * an array of this structure via struct drm_nouveau_vm_bind's 
>>>> &op_ptr field.
>>>> + */
>>>> +struct drm_nouveau_vm_bind_op {
>>>> +    /**
>>>> +     * @op: the operation type
>>>> +     */
>>>> +    __u32 op;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>>>> + *
>>>> + * The alloc operation is used to reserve a VA space region within 
>>>> the GPU's VA
>>>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be 
>>>> passed to
>>>> + * instruct the kernel to create sparse mappings for the given region.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
>>>
>>> Do you really need this operation? We have no concept of this in Xe,
>>> e.g. we can create a VM and the entire address space is managed exactly
>>> the same.
>>
>> The idea for alloc/free is to let UMDs allocate a portion of the VA 
>> space (which I call a region), basically the same thing Vulkan 
>> represents with a VKBuffer.
> 
> If that's mangled into the same component/interface then I can say from 
> experience that this is a pretty bad idea. We have tried something 
> similar with radeon and it turned out horrible.

What was the exact constellation in radeon and which problems did arise 
from it?

> 
> What you want is one component for tracking the VA allocations (drm_mm 
> based) and a different component/interface for tracking the VA mappings 
> (probably rb tree based).

That's what the GPUVA manager is doing. There are gpuva_regions which 
correspond to VA allocations and gpuvas which represent the mappings. 
Both are tracked separately (currently both with a separate drm_mm, 
though). However, the GPUVA manager needs to take regions into account 
when dealing with mappings to make sure the GPUVA manager doesn't 
propose drivers to merge over region boundaries. Speaking from userspace 
PoV, the kernel wouldn't merge mappings from different VKBuffer objects 
even if they're virtually and physically contiguous.

For sparse residency the kernel also needs to know the region boundaries 
to make sure that it keeps sparse mappings around.

> 
> amdgpu has even gotten so far that the VA allocations are tracked in 
> libdrm in userspace
> 
> Regards,
> Christian.
> 
>>
>> It serves two purposes:
>>
>> 1. It gives the kernel (in particular the GPUVA manager) the bounds in 
>> which it is allowed to merge mappings. E.g. when a user request asks 
>> for a new mapping and we detect we could merge this mapping with an 
>> existing one (used in another VKBuffer than the mapping request came 
>> for) the driver is not allowed to change the page table for the 
>> existing mapping we want to merge with (assuming that some drivers 
>> would need to do this in order to merge), because the existing mapping 
>> could already be in use and by re-mapping it we'd potentially cause a 
>> fault on the GPU.
>>
>> 2. It is used for sparse residency in a way that such an allocated VA 
>> space region can be flagged as sparse, such that the kernel always 
>> keeps sparse mappings around for the parts of the region that do not 
>> contain actual memory backed mappings.
>>
>> If for your driver merging is always OK, creating a single huge region 
>> would do the trick I guess. Otherwise, we could also add an option to 
>> the GPUVA manager (or a specific region, which could also be a single 
>> huge one) within which it never merges.
>>
>>>
>>> If this can be removed then the entire concept of regions in the GPUVA
>>> can be removed too (drop struct drm_gpuva_region). I say this because
>>> in Xe as I'm porting over to GPUVA the first thing I'm doing after
>>> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the entire
>>> address space. To me this seems kinda useless but maybe I'm missing why
>>> you need this for Nouveau.
>>>
>>> Matt
>>>
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>>>> + *
>>>> + * Map a GEM object to the GPU's VA space. The mapping must be 
>>>> fully enclosed by
>>>> + * a previously allocated VA space region. If the region is sparse, 
>>>> existing
>>>> + * sparse mappings are overwritten.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>>>> + *
>>>> + * Unmap an existing mapping in the GPU's VA space. If the region 
>>>> the mapping
>>>> + * is located in is a sparse region, new sparse mappings are 
>>>> created where the
>>>> + * unmapped (memory backed) mapping was mapped previously.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>>>> +    /**
>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind_op
>>>> +     */
>>>> +    __u32 flags;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>>>> + *
>>>> + * Indicates that an allocated VA space region should be sparse.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>>>> +    /**
>>>> +     * @handle: the handle of the DRM GEM object to map
>>>> +     */
>>>> +    __u32 handle;
>>>> +    /**
>>>> +     * @addr:
>>>> +     *
>>>> +     * the address the VA space region or (memory backed) mapping 
>>>> should be mapped to
>>>> +     */
>>>> +    __u64 addr;
>>>> +    /**
>>>> +     * @bo_offset: the offset within the BO backing the mapping
>>>> +     */
>>>> +    __u64 bo_offset;
>>>> +    /**
>>>> +     * @range: the size of the requested mapping in bytes
>>>> +     */
>>>> +    __u64 range;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_vm_bind - structure for 
>>>> DRM_IOCTL_NOUVEAU_VM_BIND
>>>> + */
>>>> +struct drm_nouveau_vm_bind {
>>>> +    /**
>>>> +     * @op_count: the number of &drm_nouveau_vm_bind_op
>>>> +     */
>>>> +    __u32 op_count;
>>>> +    /**
>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>>>> +     */
>>>> +    __u32 flags;
>>>> +/**
>>>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>>>> + *
>>>> + * Indicates that the given VM_BIND operation should be executed 
>>>> asynchronously
>>>> + * by the kernel.
>>>> + *
>>>> + * If this flag is not supplied the kernel executes the associated 
>>>> operations
>>>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>>>> + */
>>>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>>>> +    /**
>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>> +     */
>>>> +    __u32 wait_count;
>>>> +    /**
>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal when 
>>>> finished
>>>> +     */
>>>> +    __u32 sig_count;
>>>> +    /**
>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>> +     */
>>>> +    __u64 wait_ptr;
>>>> +    /**
>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>>> +     */
>>>> +    __u64 sig_ptr;
>>>> +    /**
>>>> +     * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>>>> +     */
>>>> +    __u64 op_ptr;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_exec_push - EXEC push operation
>>>> + *
>>>> + * This structure represents a single EXEC push operation. UMDs 
>>>> should pass an
>>>> + * array of this structure via struct drm_nouveau_exec's &push_ptr 
>>>> field.
>>>> + */
>>>> +struct drm_nouveau_exec_push {
>>>> +    /**
>>>> +     * @va: the virtual address of the push buffer mapping
>>>> +     */
>>>> +    __u64 va;
>>>> +    /**
>>>> +     * @va_len: the length of the push buffer mapping
>>>> +     */
>>>> +    __u64 va_len;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>>>> + */
>>>> +struct drm_nouveau_exec {
>>>> +    /**
>>>> +     * @channel: the channel to execute the push buffer in
>>>> +     */
>>>> +    __u32 channel;
>>>> +    /**
>>>> +     * @push_count: the number of &drm_nouveau_exec_push ops
>>>> +     */
>>>> +    __u32 push_count;
>>>> +    /**
>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>> +     */
>>>> +    __u32 wait_count;
>>>> +    /**
>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal when 
>>>> finished
>>>> +     */
>>>> +    __u32 sig_count;
>>>> +    /**
>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>> +     */
>>>> +    __u64 wait_ptr;
>>>> +    /**
>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
>>>> +     */
>>>> +    __u64 sig_ptr;
>>>> +    /**
>>>> +     * @push_ptr: pointer to &drm_nouveau_exec_push ops
>>>> +     */
>>>> +    __u64 push_ptr;
>>>> +};
>>>> +
>>>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>>>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>>>   #define DRM_NOUVEAU_NVIF               0x07
>>>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>>>> +#define DRM_NOUVEAU_VM_INIT            0x10
>>>> +#define DRM_NOUVEAU_VM_BIND            0x11
>>>> +#define DRM_NOUVEAU_EXEC               0x12
>>>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>>>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW 
>>>> (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct 
>>>> drm_nouveau_gem_cpu_fini)
>>>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO DRM_IOWR(DRM_COMMAND_BASE + 
>>>> DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>>>   +#define DRM_IOCTL_NOUVEAU_VM_INIT DRM_IOWR(DRM_COMMAND_BASE + 
>>>> DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>>>> +#define DRM_IOCTL_NOUVEAU_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + 
>>>> DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>>>> +#define DRM_IOCTL_NOUVEAU_EXEC DRM_IOWR(DRM_COMMAND_BASE + 
>>>> DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>>>   #if defined(__cplusplus)
>>>>   }
>>>>   #endif
>>>> -- 
>>>> 2.39.0
>>>>
>>>
>>
>
Christian König Jan. 27, 2023, 1:23 p.m. UTC | #8
Am 27.01.23 um 14:12 schrieb Danilo Krummrich:
> On 1/27/23 08:55, Christian König wrote:
>> Am 27.01.23 um 02:26 schrieb Danilo Krummrich:
>>> On 1/27/23 02:05, Matthew Brost wrote:
>>>> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>>>>> This commit provides the interfaces for the new UAPI motivated by the
>>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>>
>>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel 
>>>>> reserved
>>>>>     VA area.
>>>>>
>>>>> 2) Bind and unbind GPU VA space mappings via the new
>>>>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>>
>>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>>>>
>>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>>>>> asynchronous processing with DRM syncobjs as synchronization 
>>>>> mechanism.
>>>>>
>>>>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>>>>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>>>>
>>>>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>>> ---
>>>>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>>>>   include/uapi/drm/nouveau_drm.h    | 216 
>>>>> ++++++++++++++++++++++++++++++
>>>>>   2 files changed, 224 insertions(+)
>>>>>
>>>>> diff --git a/Documentation/gpu/driver-uapi.rst 
>>>>> b/Documentation/gpu/driver-uapi.rst
>>>>> index 4411e6919a3d..9c7ca6e33a68 100644
>>>>> --- a/Documentation/gpu/driver-uapi.rst
>>>>> +++ b/Documentation/gpu/driver-uapi.rst
>>>>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>>>>   =============
>>>>>     .. kernel-doc:: include/uapi/drm/i915_drm.h
>>>>> +
>>>>> +drm/nouveau uAPI
>>>>> +================
>>>>> +
>>>>> +VM_BIND / EXEC uAPI
>>>>> +-------------------
>>>>> +
>>>>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>>>>> diff --git a/include/uapi/drm/nouveau_drm.h 
>>>>> b/include/uapi/drm/nouveau_drm.h
>>>>> index 853a327433d3..f6e7d40201d4 100644
>>>>> --- a/include/uapi/drm/nouveau_drm.h
>>>>> +++ b/include/uapi/drm/nouveau_drm.h
>>>>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>>>>       __u32 handle;
>>>>>   };
>>>>>   +/**
>>>>> + * struct drm_nouveau_sync - sync object
>>>>> + *
>>>>> + * This structure serves as synchronization mechanism for 
>>>>> (potentially)
>>>>> + * asynchronous operations such as EXEC or VM_BIND.
>>>>> + */
>>>>> +struct drm_nouveau_sync {
>>>>> +    /**
>>>>> +     * @flags: the flags for a sync object
>>>>> +     *
>>>>> +     * The first 8 bits are used to determine the type of the 
>>>>> sync object.
>>>>> +     */
>>>>> +    __u32 flags;
>>>>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>>>>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>>>>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>>>>> +    /**
>>>>> +     * @handle: the handle of the sync object
>>>>> +     */
>>>>> +    __u32 handle;
>>>>> +    /**
>>>>> +     * @timeline_value:
>>>>> +     *
>>>>> +     * The timeline point of the sync object in case the syncobj 
>>>>> is of
>>>>> +     * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>>>>> +     */
>>>>> +    __u64 timeline_value;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>>>>> + *
>>>>> + * Used to initialize the GPU's VA space for a user client, 
>>>>> telling the kernel
>>>>> + * which portion of the VA space is managed by the UMD and kernel 
>>>>> respectively.
>>>>> + */
>>>>> +struct drm_nouveau_vm_init {
>>>>> +    /**
>>>>> +     * @unmanaged_addr: start address of the kernel managed VA 
>>>>> space region
>>>>> +     */
>>>>> +    __u64 unmanaged_addr;
>>>>> +    /**
>>>>> +     * @unmanaged_size: size of the kernel managed VA space 
>>>>> region in bytes
>>>>> +     */
>>>>> +    __u64 unmanaged_size;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>>>>> + *
>>>>> + * This structure represents a single VM_BIND operation. UMDs 
>>>>> should pass
>>>>> + * an array of this structure via struct drm_nouveau_vm_bind's 
>>>>> &op_ptr field.
>>>>> + */
>>>>> +struct drm_nouveau_vm_bind_op {
>>>>> +    /**
>>>>> +     * @op: the operation type
>>>>> +     */
>>>>> +    __u32 op;
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>>>>> + *
>>>>> + * The alloc operation is used to reserve a VA space region 
>>>>> within the GPU's VA
>>>>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be 
>>>>> passed to
>>>>> + * instruct the kernel to create sparse mappings for the given 
>>>>> region.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
>>>>
>>>> Do you really need this operation? We have no concept of this in Xe,
>>>> e.g. we can create a VM and the entire address space is managed 
>>>> exactly
>>>> the same.
>>>
>>> The idea for alloc/free is to let UMDs allocate a portion of the VA 
>>> space (which I call a region), basically the same thing Vulkan 
>>> represents with a VKBuffer.
>>
>> If that's mangled into the same component/interface then I can say 
>> from experience that this is a pretty bad idea. We have tried 
>> something similar with radeon and it turned out horrible.
>
> What was the exact constellation in radeon and which problems did 
> arise from it?
>
>>
>> What you want is one component for tracking the VA allocations 
>> (drm_mm based) and a different component/interface for tracking the 
>> VA mappings (probably rb tree based).
>
> That's what the GPUVA manager is doing. There are gpuva_regions which 
> correspond to VA allocations and gpuvas which represent the mappings. 
> Both are tracked separately (currently both with a separate drm_mm, 
> though). However, the GPUVA manager needs to take regions into account 
> when dealing with mappings to make sure the GPUVA manager doesn't 
> propose drivers to merge over region boundaries. Speaking from 
> userspace PoV, the kernel wouldn't merge mappings from different 
> VKBuffer objects even if they're virtually and physically contiguous.

That are two completely different things and shouldn't be handled in a 
single component.

We should probably talk about the design of the GPUVA manager once more 
when this should be applicable to all GPU drivers.

>
> For sparse residency the kernel also needs to know the region 
> boundaries to make sure that it keeps sparse mappings around.

What?

Regards,
Christian.

>
>>
>> amdgpu has even gotten so far that the VA allocations are tracked in 
>> libdrm in userspace
>>
>> Regards,
>> Christian.
>>
>>>
>>> It serves two purposes:
>>>
>>> 1. It gives the kernel (in particular the GPUVA manager) the bounds 
>>> in which it is allowed to merge mappings. E.g. when a user request 
>>> asks for a new mapping and we detect we could merge this mapping 
>>> with an existing one (used in another VKBuffer than the mapping 
>>> request came for) the driver is not allowed to change the page table 
>>> for the existing mapping we want to merge with (assuming that some 
>>> drivers would need to do this in order to merge), because the 
>>> existing mapping could already be in use and by re-mapping it we'd 
>>> potentially cause a fault on the GPU.
>>>
>>> 2. It is used for sparse residency in a way that such an allocated 
>>> VA space region can be flagged as sparse, such that the kernel 
>>> always keeps sparse mappings around for the parts of the region that 
>>> do not contain actual memory backed mappings.
>>>
>>> If for your driver merging is always OK, creating a single huge 
>>> region would do the trick I guess. Otherwise, we could also add an 
>>> option to the GPUVA manager (or a specific region, which could also 
>>> be a single huge one) within which it never merges.
>>>
>>>>
>>>> If this can be removed then the entire concept of regions in the GPUVA
>>>> can be removed too (drop struct drm_gpuva_region). I say this because
>>>> in Xe as I'm porting over to GPUVA the first thing I'm doing after
>>>> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the 
>>>> entire
>>>> address space. To me this seems kinda useless but maybe I'm missing 
>>>> why
>>>> you need this for Nouveau.
>>>>
>>>> Matt
>>>>
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>>>>> + *
>>>>> + * Map a GEM object to the GPU's VA space. The mapping must be 
>>>>> fully enclosed by
>>>>> + * a previously allocated VA space region. If the region is 
>>>>> sparse, existing
>>>>> + * sparse mappings are overwritten.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>>>>> + *
>>>>> + * Unmap an existing mapping in the GPU's VA space. If the region 
>>>>> the mapping
>>>>> + * is located in is a sparse region, new sparse mappings are 
>>>>> created where the
>>>>> + * unmapped (memory backed) mapping was mapped previously.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>>>>> +    /**
>>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind_op
>>>>> +     */
>>>>> +    __u32 flags;
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>>>>> + *
>>>>> + * Indicates that an allocated VA space region should be sparse.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>>>>> +    /**
>>>>> +     * @handle: the handle of the DRM GEM object to map
>>>>> +     */
>>>>> +    __u32 handle;
>>>>> +    /**
>>>>> +     * @addr:
>>>>> +     *
>>>>> +     * the address the VA space region or (memory backed) mapping 
>>>>> should be mapped to
>>>>> +     */
>>>>> +    __u64 addr;
>>>>> +    /**
>>>>> +     * @bo_offset: the offset within the BO backing the mapping
>>>>> +     */
>>>>> +    __u64 bo_offset;
>>>>> +    /**
>>>>> +     * @range: the size of the requested mapping in bytes
>>>>> +     */
>>>>> +    __u64 range;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct drm_nouveau_vm_bind - structure for 
>>>>> DRM_IOCTL_NOUVEAU_VM_BIND
>>>>> + */
>>>>> +struct drm_nouveau_vm_bind {
>>>>> +    /**
>>>>> +     * @op_count: the number of &drm_nouveau_vm_bind_op
>>>>> +     */
>>>>> +    __u32 op_count;
>>>>> +    /**
>>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>>>>> +     */
>>>>> +    __u32 flags;
>>>>> +/**
>>>>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>>>>> + *
>>>>> + * Indicates that the given VM_BIND operation should be executed 
>>>>> asynchronously
>>>>> + * by the kernel.
>>>>> + *
>>>>> + * If this flag is not supplied the kernel executes the 
>>>>> associated operations
>>>>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>>>>> + */
>>>>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>>>>> +    /**
>>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>>> +     */
>>>>> +    __u32 wait_count;
>>>>> +    /**
>>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal 
>>>>> when finished
>>>>> +     */
>>>>> +    __u32 sig_count;
>>>>> +    /**
>>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>>> +     */
>>>>> +    __u64 wait_ptr;
>>>>> +    /**
>>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when 
>>>>> finished
>>>>> +     */
>>>>> +    __u64 sig_ptr;
>>>>> +    /**
>>>>> +     * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>>>>> +     */
>>>>> +    __u64 op_ptr;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct drm_nouveau_exec_push - EXEC push operation
>>>>> + *
>>>>> + * This structure represents a single EXEC push operation. UMDs 
>>>>> should pass an
>>>>> + * array of this structure via struct drm_nouveau_exec's 
>>>>> &push_ptr field.
>>>>> + */
>>>>> +struct drm_nouveau_exec_push {
>>>>> +    /**
>>>>> +     * @va: the virtual address of the push buffer mapping
>>>>> +     */
>>>>> +    __u64 va;
>>>>> +    /**
>>>>> +     * @va_len: the length of the push buffer mapping
>>>>> +     */
>>>>> +    __u64 va_len;
>>>>> +};
>>>>> +
>>>>> +/**
>>>>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>>>>> + */
>>>>> +struct drm_nouveau_exec {
>>>>> +    /**
>>>>> +     * @channel: the channel to execute the push buffer in
>>>>> +     */
>>>>> +    __u32 channel;
>>>>> +    /**
>>>>> +     * @push_count: the number of &drm_nouveau_exec_push ops
>>>>> +     */
>>>>> +    __u32 push_count;
>>>>> +    /**
>>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>>> +     */
>>>>> +    __u32 wait_count;
>>>>> +    /**
>>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal 
>>>>> when finished
>>>>> +     */
>>>>> +    __u32 sig_count;
>>>>> +    /**
>>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>>> +     */
>>>>> +    __u64 wait_ptr;
>>>>> +    /**
>>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when 
>>>>> finished
>>>>> +     */
>>>>> +    __u64 sig_ptr;
>>>>> +    /**
>>>>> +     * @push_ptr: pointer to &drm_nouveau_exec_push ops
>>>>> +     */
>>>>> +    __u64 push_ptr;
>>>>> +};
>>>>> +
>>>>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>>>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>>>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>>>>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>>>>   #define DRM_NOUVEAU_NVIF               0x07
>>>>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>>>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>>>>> +#define DRM_NOUVEAU_VM_INIT            0x10
>>>>> +#define DRM_NOUVEAU_VM_BIND            0x11
>>>>> +#define DRM_NOUVEAU_EXEC               0x12
>>>>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>>>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>>>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>>>>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>>>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW 
>>>>> (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct 
>>>>> drm_nouveau_gem_cpu_fini)
>>>>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO DRM_IOWR(DRM_COMMAND_BASE + 
>>>>> DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>>>>   +#define DRM_IOCTL_NOUVEAU_VM_INIT DRM_IOWR(DRM_COMMAND_BASE + 
>>>>> DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>>>>> +#define DRM_IOCTL_NOUVEAU_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + 
>>>>> DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>>>>> +#define DRM_IOCTL_NOUVEAU_EXEC DRM_IOWR(DRM_COMMAND_BASE + 
>>>>> DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>>>>   #if defined(__cplusplus)
>>>>>   }
>>>>>   #endif
>>>>> -- 
>>>>> 2.39.0
>>>>>
>>>>
>>>
>>
>
Danilo Krummrich Jan. 27, 2023, 2:44 p.m. UTC | #9
On 1/27/23 14:23, Christian König wrote:
> 
> 
> Am 27.01.23 um 14:12 schrieb Danilo Krummrich:
>> On 1/27/23 08:55, Christian König wrote:
>>> Am 27.01.23 um 02:26 schrieb Danilo Krummrich:
>>>> On 1/27/23 02:05, Matthew Brost wrote:
>>>>> On Wed, Jan 18, 2023 at 07:12:47AM +0100, Danilo Krummrich wrote:
>>>>>> This commit provides the interfaces for the new UAPI motivated by the
>>>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>>>
>>>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel 
>>>>>> reserved
>>>>>>     VA area.
>>>>>>
>>>>>> 2) Bind and unbind GPU VA space mappings via the new
>>>>>>     DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>>>
>>>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>>>>>>
>>>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
>>>>>> asynchronous processing with DRM syncobjs as synchronization 
>>>>>> mechanism.
>>>>>>
>>>>>> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
>>>>>> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>>>>>>
>>>>>> Co-authored-by: Dave Airlie <airlied@redhat.com>
>>>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>>>> ---
>>>>>>   Documentation/gpu/driver-uapi.rst |   8 ++
>>>>>>   include/uapi/drm/nouveau_drm.h    | 216 
>>>>>> ++++++++++++++++++++++++++++++
>>>>>>   2 files changed, 224 insertions(+)
>>>>>>
>>>>>> diff --git a/Documentation/gpu/driver-uapi.rst 
>>>>>> b/Documentation/gpu/driver-uapi.rst
>>>>>> index 4411e6919a3d..9c7ca6e33a68 100644
>>>>>> --- a/Documentation/gpu/driver-uapi.rst
>>>>>> +++ b/Documentation/gpu/driver-uapi.rst
>>>>>> @@ -6,3 +6,11 @@ drm/i915 uAPI
>>>>>>   =============
>>>>>>     .. kernel-doc:: include/uapi/drm/i915_drm.h
>>>>>> +
>>>>>> +drm/nouveau uAPI
>>>>>> +================
>>>>>> +
>>>>>> +VM_BIND / EXEC uAPI
>>>>>> +-------------------
>>>>>> +
>>>>>> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
>>>>>> diff --git a/include/uapi/drm/nouveau_drm.h 
>>>>>> b/include/uapi/drm/nouveau_drm.h
>>>>>> index 853a327433d3..f6e7d40201d4 100644
>>>>>> --- a/include/uapi/drm/nouveau_drm.h
>>>>>> +++ b/include/uapi/drm/nouveau_drm.h
>>>>>> @@ -126,6 +126,216 @@ struct drm_nouveau_gem_cpu_fini {
>>>>>>       __u32 handle;
>>>>>>   };
>>>>>>   +/**
>>>>>> + * struct drm_nouveau_sync - sync object
>>>>>> + *
>>>>>> + * This structure serves as synchronization mechanism for 
>>>>>> (potentially)
>>>>>> + * asynchronous operations such as EXEC or VM_BIND.
>>>>>> + */
>>>>>> +struct drm_nouveau_sync {
>>>>>> +    /**
>>>>>> +     * @flags: the flags for a sync object
>>>>>> +     *
>>>>>> +     * The first 8 bits are used to determine the type of the 
>>>>>> sync object.
>>>>>> +     */
>>>>>> +    __u32 flags;
>>>>>> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
>>>>>> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
>>>>>> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
>>>>>> +    /**
>>>>>> +     * @handle: the handle of the sync object
>>>>>> +     */
>>>>>> +    __u32 handle;
>>>>>> +    /**
>>>>>> +     * @timeline_value:
>>>>>> +     *
>>>>>> +     * The timeline point of the sync object in case the syncobj 
>>>>>> is of
>>>>>> +     * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
>>>>>> +     */
>>>>>> +    __u64 timeline_value;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct drm_nouveau_vm_init - GPU VA space init structure
>>>>>> + *
>>>>>> + * Used to initialize the GPU's VA space for a user client, 
>>>>>> telling the kernel
>>>>>> + * which portion of the VA space is managed by the UMD and kernel 
>>>>>> respectively.
>>>>>> + */
>>>>>> +struct drm_nouveau_vm_init {
>>>>>> +    /**
>>>>>> +     * @unmanaged_addr: start address of the kernel managed VA 
>>>>>> space region
>>>>>> +     */
>>>>>> +    __u64 unmanaged_addr;
>>>>>> +    /**
>>>>>> +     * @unmanaged_size: size of the kernel managed VA space 
>>>>>> region in bytes
>>>>>> +     */
>>>>>> +    __u64 unmanaged_size;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
>>>>>> + *
>>>>>> + * This structure represents a single VM_BIND operation. UMDs 
>>>>>> should pass
>>>>>> + * an array of this structure via struct drm_nouveau_vm_bind's 
>>>>>> &op_ptr field.
>>>>>> + */
>>>>>> +struct drm_nouveau_vm_bind_op {
>>>>>> +    /**
>>>>>> +     * @op: the operation type
>>>>>> +     */
>>>>>> +    __u32 op;
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
>>>>>> + *
>>>>>> + * The alloc operation is used to reserve a VA space region 
>>>>>> within the GPU's VA
>>>>>> + * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be 
>>>>>> passed to
>>>>>> + * instruct the kernel to create sparse mappings for the given 
>>>>>> region.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
>>>>>
>>>>> Do you really need this operation? We have no concept of this in Xe,
>>>>> e.g. we can create a VM and the entire address space is managed 
>>>>> exactly
>>>>> the same.
>>>>
>>>> The idea for alloc/free is to let UMDs allocate a portion of the VA 
>>>> space (which I call a region), basically the same thing Vulkan 
>>>> represents with a VKBuffer.
>>>
>>> If that's mangled into the same component/interface then I can say 
>>> from experience that this is a pretty bad idea. We have tried 
>>> something similar with radeon and it turned out horrible.
>>
>> What was the exact constellation in radeon and which problems did 
>> arise from it?
>>
>>>
>>> What you want is one component for tracking the VA allocations 
>>> (drm_mm based) and a different component/interface for tracking the 
>>> VA mappings (probably rb tree based).
>>
>> That's what the GPUVA manager is doing. There are gpuva_regions which 
>> correspond to VA allocations and gpuvas which represent the mappings. 
>> Both are tracked separately (currently both with a separate drm_mm, 
>> though). However, the GPUVA manager needs to take regions into account 
>> when dealing with mappings to make sure the GPUVA manager doesn't 
>> propose drivers to merge over region boundaries. Speaking from 
>> userspace PoV, the kernel wouldn't merge mappings from different 
>> VKBuffer objects even if they're virtually and physically contiguous.
> 
> That are two completely different things and shouldn't be handled in a 
> single component.

They are different things, but they're related in a way that for 
handling the mappings (in particular merging and sparse) the GPUVA 
manager needs to know the VA allocation (or region) boundaries.

I have the feeling there might be a misunderstanding. Userspace is in 
charge to actually allocate a portion of VA space and manage it. The 
GPUVA manager just needs to know about those VA space allocations and 
hence keeps track of them.

The GPUVA manager is not meant to be an allocator in the sense of 
finding and providing a hole for a given request.

Maybe the non-ideal choice of using drm_mm was implying something else.

> 
> We should probably talk about the design of the GPUVA manager once more 
> when this should be applicable to all GPU drivers.

That's what I try to figure out with this RFC, how to make it appicable 
for all GPU drivers, so I'm happy to discuss this. :-)

> 
>>
>> For sparse residency the kernel also needs to know the region 
>> boundaries to make sure that it keeps sparse mappings around.
> 
> What?

When userspace creates a new VKBuffer with the 
VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create sparse 
mappings in order to ensure that using this buffer without any memory 
backed mappings doesn't fault the GPU.

Currently, the implementation does this the following way:

1. Userspace creates a new VKBuffer and hence allocates a portion of the 
VA space for it. It calls into the kernel indicating the new VA space 
region and the fact that the region is sparse.

2. The kernel picks up the region and stores it in the GPUVA manager, 
the driver creates the corresponding sparse mappings / page table entries.

3. Userspace might ask the driver to create a couple of memory backed 
mappings for this particular VA region. The GPUVA manager stores the 
mapping parameters, the driver creates the corresponding page table entries.

4. Userspace might ask to unmap all the memory backed mappings from this 
particular VA region. The GPUVA manager removes the mapping parameters, 
the driver cleans up the corresponding page table entries. However, the 
driver also needs to re-create the sparse mappings, since it's a sparse 
buffer, hence it needs to know the boundaries of the region it needs to 
create the sparse mappings in.

> 
> Regards,
> Christian.
> 
>>
>>>
>>> amdgpu has even gotten so far that the VA allocations are tracked in 
>>> libdrm in userspace
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> It serves two purposes:
>>>>
>>>> 1. It gives the kernel (in particular the GPUVA manager) the bounds 
>>>> in which it is allowed to merge mappings. E.g. when a user request 
>>>> asks for a new mapping and we detect we could merge this mapping 
>>>> with an existing one (used in another VKBuffer than the mapping 
>>>> request came for) the driver is not allowed to change the page table 
>>>> for the existing mapping we want to merge with (assuming that some 
>>>> drivers would need to do this in order to merge), because the 
>>>> existing mapping could already be in use and by re-mapping it we'd 
>>>> potentially cause a fault on the GPU.
>>>>
>>>> 2. It is used for sparse residency in a way that such an allocated 
>>>> VA space region can be flagged as sparse, such that the kernel 
>>>> always keeps sparse mappings around for the parts of the region that 
>>>> do not contain actual memory backed mappings.
>>>>
>>>> If for your driver merging is always OK, creating a single huge 
>>>> region would do the trick I guess. Otherwise, we could also add an 
>>>> option to the GPUVA manager (or a specific region, which could also 
>>>> be a single huge one) within which it never merges.
>>>>
>>>>>
>>>>> If this can be removed then the entire concept of regions in the GPUVA
>>>>> can be removed too (drop struct drm_gpuva_region). I say this because
>>>>> in Xe as I'm porting over to GPUVA the first thing I'm doing after
>>>>> drm_gpuva_manager_init is calling drm_gpuva_region_insert on the 
>>>>> entire
>>>>> address space. To me this seems kinda useless but maybe I'm missing 
>>>>> why
>>>>> you need this for Nouveau.
>>>>>
>>>>> Matt
>>>>>
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
>>>>>> + *
>>>>>> + * Map a GEM object to the GPU's VA space. The mapping must be 
>>>>>> fully enclosed by
>>>>>> + * a previously allocated VA space region. If the region is 
>>>>>> sparse, existing
>>>>>> + * sparse mappings are overwritten.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
>>>>>> + *
>>>>>> + * Unmap an existing mapping in the GPU's VA space. If the region 
>>>>>> the mapping
>>>>>> + * is located in is a sparse region, new sparse mappings are 
>>>>>> created where the
>>>>>> + * unmapped (memory backed) mapping was mapped previously.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
>>>>>> +    /**
>>>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind_op
>>>>>> +     */
>>>>>> +    __u32 flags;
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_SPARSE:
>>>>>> + *
>>>>>> + * Indicates that an allocated VA space region should be sparse.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
>>>>>> +    /**
>>>>>> +     * @handle: the handle of the DRM GEM object to map
>>>>>> +     */
>>>>>> +    __u32 handle;
>>>>>> +    /**
>>>>>> +     * @addr:
>>>>>> +     *
>>>>>> +     * the address the VA space region or (memory backed) mapping 
>>>>>> should be mapped to
>>>>>> +     */
>>>>>> +    __u64 addr;
>>>>>> +    /**
>>>>>> +     * @bo_offset: the offset within the BO backing the mapping
>>>>>> +     */
>>>>>> +    __u64 bo_offset;
>>>>>> +    /**
>>>>>> +     * @range: the size of the requested mapping in bytes
>>>>>> +     */
>>>>>> +    __u64 range;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct drm_nouveau_vm_bind - structure for 
>>>>>> DRM_IOCTL_NOUVEAU_VM_BIND
>>>>>> + */
>>>>>> +struct drm_nouveau_vm_bind {
>>>>>> +    /**
>>>>>> +     * @op_count: the number of &drm_nouveau_vm_bind_op
>>>>>> +     */
>>>>>> +    __u32 op_count;
>>>>>> +    /**
>>>>>> +     * @flags: the flags for a &drm_nouveau_vm_bind ioctl
>>>>>> +     */
>>>>>> +    __u32 flags;
>>>>>> +/**
>>>>>> + * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
>>>>>> + *
>>>>>> + * Indicates that the given VM_BIND operation should be executed 
>>>>>> asynchronously
>>>>>> + * by the kernel.
>>>>>> + *
>>>>>> + * If this flag is not supplied the kernel executes the 
>>>>>> associated operations
>>>>>> + * synchronously and doesn't accept any &drm_nouveau_sync objects.
>>>>>> + */
>>>>>> +#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
>>>>>> +    /**
>>>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>>>> +     */
>>>>>> +    __u32 wait_count;
>>>>>> +    /**
>>>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal 
>>>>>> when finished
>>>>>> +     */
>>>>>> +    __u32 sig_count;
>>>>>> +    /**
>>>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>>>> +     */
>>>>>> +    __u64 wait_ptr;
>>>>>> +    /**
>>>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when 
>>>>>> finished
>>>>>> +     */
>>>>>> +    __u64 sig_ptr;
>>>>>> +    /**
>>>>>> +     * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
>>>>>> +     */
>>>>>> +    __u64 op_ptr;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct drm_nouveau_exec_push - EXEC push operation
>>>>>> + *
>>>>>> + * This structure represents a single EXEC push operation. UMDs 
>>>>>> should pass an
>>>>>> + * array of this structure via struct drm_nouveau_exec's 
>>>>>> &push_ptr field.
>>>>>> + */
>>>>>> +struct drm_nouveau_exec_push {
>>>>>> +    /**
>>>>>> +     * @va: the virtual address of the push buffer mapping
>>>>>> +     */
>>>>>> +    __u64 va;
>>>>>> +    /**
>>>>>> +     * @va_len: the length of the push buffer mapping
>>>>>> +     */
>>>>>> +    __u64 va_len;
>>>>>> +};
>>>>>> +
>>>>>> +/**
>>>>>> + * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
>>>>>> + */
>>>>>> +struct drm_nouveau_exec {
>>>>>> +    /**
>>>>>> +     * @channel: the channel to execute the push buffer in
>>>>>> +     */
>>>>>> +    __u32 channel;
>>>>>> +    /**
>>>>>> +     * @push_count: the number of &drm_nouveau_exec_push ops
>>>>>> +     */
>>>>>> +    __u32 push_count;
>>>>>> +    /**
>>>>>> +     * @wait_count: the number of wait &drm_nouveau_syncs
>>>>>> +     */
>>>>>> +    __u32 wait_count;
>>>>>> +    /**
>>>>>> +     * @sig_count: the number of &drm_nouveau_syncs to signal 
>>>>>> when finished
>>>>>> +     */
>>>>>> +    __u32 sig_count;
>>>>>> +    /**
>>>>>> +     * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
>>>>>> +     */
>>>>>> +    __u64 wait_ptr;
>>>>>> +    /**
>>>>>> +     * @sig_ptr: pointer to &drm_nouveau_syncs to signal when 
>>>>>> finished
>>>>>> +     */
>>>>>> +    __u64 sig_ptr;
>>>>>> +    /**
>>>>>> +     * @push_ptr: pointer to &drm_nouveau_exec_push ops
>>>>>> +     */
>>>>>> +    __u64 push_ptr;
>>>>>> +};
>>>>>> +
>>>>>>   #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
>>>>>>   #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
>>>>>>   #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
>>>>>> @@ -136,6 +346,9 @@ struct drm_nouveau_gem_cpu_fini {
>>>>>>   #define DRM_NOUVEAU_NVIF               0x07
>>>>>>   #define DRM_NOUVEAU_SVM_INIT           0x08
>>>>>>   #define DRM_NOUVEAU_SVM_BIND           0x09
>>>>>> +#define DRM_NOUVEAU_VM_INIT            0x10
>>>>>> +#define DRM_NOUVEAU_VM_BIND            0x11
>>>>>> +#define DRM_NOUVEAU_EXEC               0x12
>>>>>>   #define DRM_NOUVEAU_GEM_NEW            0x40
>>>>>>   #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
>>>>>>   #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
>>>>>> @@ -197,6 +410,9 @@ struct drm_nouveau_svm_bind {
>>>>>>   #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW 
>>>>>> (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct 
>>>>>> drm_nouveau_gem_cpu_fini)
>>>>>>   #define DRM_IOCTL_NOUVEAU_GEM_INFO DRM_IOWR(DRM_COMMAND_BASE + 
>>>>>> DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
>>>>>>   +#define DRM_IOCTL_NOUVEAU_VM_INIT DRM_IOWR(DRM_COMMAND_BASE + 
>>>>>> DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
>>>>>> +#define DRM_IOCTL_NOUVEAU_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + 
>>>>>> DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
>>>>>> +#define DRM_IOCTL_NOUVEAU_EXEC DRM_IOWR(DRM_COMMAND_BASE + 
>>>>>> DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
>>>>>>   #if defined(__cplusplus)
>>>>>>   }
>>>>>>   #endif
>>>>>> -- 
>>>>>> 2.39.0
>>>>>>
>>>>>
>>>>
>>>
>>
>
Christian König Jan. 27, 2023, 3:17 p.m. UTC | #10
Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
> [SNIP]
>>>>
>>>> What you want is one component for tracking the VA allocations 
>>>> (drm_mm based) and a different component/interface for tracking the 
>>>> VA mappings (probably rb tree based).
>>>
>>> That's what the GPUVA manager is doing. There are gpuva_regions 
>>> which correspond to VA allocations and gpuvas which represent the 
>>> mappings. Both are tracked separately (currently both with a 
>>> separate drm_mm, though). However, the GPUVA manager needs to take 
>>> regions into account when dealing with mappings to make sure the 
>>> GPUVA manager doesn't propose drivers to merge over region 
>>> boundaries. Speaking from userspace PoV, the kernel wouldn't merge 
>>> mappings from different VKBuffer objects even if they're virtually 
>>> and physically contiguous.
>>
>> That are two completely different things and shouldn't be handled in 
>> a single component.
>
> They are different things, but they're related in a way that for 
> handling the mappings (in particular merging and sparse) the GPUVA 
> manager needs to know the VA allocation (or region) boundaries.
>
> I have the feeling there might be a misunderstanding. Userspace is in 
> charge to actually allocate a portion of VA space and manage it. The 
> GPUVA manager just needs to know about those VA space allocations and 
> hence keeps track of them.
>
> The GPUVA manager is not meant to be an allocator in the sense of 
> finding and providing a hole for a given request.
>
> Maybe the non-ideal choice of using drm_mm was implying something else.

Uff, well long story short that doesn't even remotely match the 
requirements. This way the GPUVA manager won't be usable for a whole 
bunch of use cases.

What we have are mappings which say X needs to point to Y with this and 
hw dependent flags.

The whole idea of having ranges is not going to fly. Neither with AMD 
GPUs and I strongly think not with Intels XA either.

>> We should probably talk about the design of the GPUVA manager once 
>> more when this should be applicable to all GPU drivers.
>
> That's what I try to figure out with this RFC, how to make it 
> appicable for all GPU drivers, so I'm happy to discuss this. :-)

Yeah, that was really good idea :) That proposal here is really far away 
from the actual requirements.

>>> For sparse residency the kernel also needs to know the region 
>>> boundaries to make sure that it keeps sparse mappings around.
>>
>> What?
>
> When userspace creates a new VKBuffer with the 
> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create 
> sparse mappings in order to ensure that using this buffer without any 
> memory backed mappings doesn't fault the GPU.
>
> Currently, the implementation does this the following way:
>
> 1. Userspace creates a new VKBuffer and hence allocates a portion of 
> the VA space for it. It calls into the kernel indicating the new VA 
> space region and the fact that the region is sparse.
>
> 2. The kernel picks up the region and stores it in the GPUVA manager, 
> the driver creates the corresponding sparse mappings / page table 
> entries.
>
> 3. Userspace might ask the driver to create a couple of memory backed 
> mappings for this particular VA region. The GPUVA manager stores the 
> mapping parameters, the driver creates the corresponding page table 
> entries.
>
> 4. Userspace might ask to unmap all the memory backed mappings from 
> this particular VA region. The GPUVA manager removes the mapping 
> parameters, the driver cleans up the corresponding page table entries. 
> However, the driver also needs to re-create the sparse mappings, since 
> it's a sparse buffer, hence it needs to know the boundaries of the 
> region it needs to create the sparse mappings in.

Again, this is not how things are working. First of all the kernel 
absolutely should *NOT* know about those regions.

What we have inside the kernel is the information what happens if an 
address X is accessed. On AMD HW this can be:

1. Route to the PCIe bus because the mapped BO is stored in system memory.
2. Route to the internal MC because the mapped BO is stored in local memory.
3. Route to other GPUs in the same hive.
4. Route to some doorbell to kick of other work.
...
x. Ignore write, return 0 on reads (this is what is used for sparse 
mappings).
x+1. Trigger a recoverable page fault. This is used for things like SVA.
x+2. Trigger a non-recoverable page fault. This is used for things like 
unmapped regions where access is illegal.

All this is plus some hw specific caching flags.

When Vulkan allocates a sparse VKBuffer what should happen is the following:

1. The Vulkan driver somehow figures out a VA region A..B for the 
buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), but 
essentially is currently driver specific.

2. The kernel gets a request to map the VA range A..B as sparse, meaning 
that it updates the page tables from A..B with the sparse setting.

3. User space asks kernel to map a couple of memory backings at location 
A+1, A+10, A+15 etc....

4. The VKBuffer is de-allocated, userspace asks kernel to update region 
A..B to not map anything (usually triggers a non-recoverable fault).

When you want to unify this between hw drivers I strongly suggest to 
completely start from scratch once more.

First of all don't think about those mappings as VMAs, that won't work 
because VMAs are usually something large. Think of this as individual 
PTEs controlled by the application. similar how COW mappings and struct 
pages are handled inside the kernel.

Then I would start with the VA allocation manager. You could probably 
base that on drm_mm. We handle it differently in amdgpu currently, but I 
think this is something we could change.

Then come up with something close to the amdgpu VM system. I'm pretty 
sure that should work for Nouveau and Intel XA as well. In other words 
you just have a bunch of very very small structures which represents 
mappings and a larger structure which combine all mappings of a specific 
type, e.g. all mappings of a BO or all sparse mappings etc...

Merging of regions is actually not mandatory. We don't do it in amdgpu 
and can live with the additional mappings pretty well. But I think this 
can differ between drivers.

Regards,
Christian.
David Airlie Jan. 27, 2023, 8:25 p.m. UTC | #11
On Sat, Jan 28, 2023 at 1:17 AM Christian König
<christian.koenig@amd.com> wrote:
>
> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
> > [SNIP]
> >>>>
> >>>> What you want is one component for tracking the VA allocations
> >>>> (drm_mm based) and a different component/interface for tracking the
> >>>> VA mappings (probably rb tree based).
> >>>
> >>> That's what the GPUVA manager is doing. There are gpuva_regions
> >>> which correspond to VA allocations and gpuvas which represent the
> >>> mappings. Both are tracked separately (currently both with a
> >>> separate drm_mm, though). However, the GPUVA manager needs to take
> >>> regions into account when dealing with mappings to make sure the
> >>> GPUVA manager doesn't propose drivers to merge over region
> >>> boundaries. Speaking from userspace PoV, the kernel wouldn't merge
> >>> mappings from different VKBuffer objects even if they're virtually
> >>> and physically contiguous.
> >>
> >> That are two completely different things and shouldn't be handled in
> >> a single component.
> >
> > They are different things, but they're related in a way that for
> > handling the mappings (in particular merging and sparse) the GPUVA
> > manager needs to know the VA allocation (or region) boundaries.
> >
> > I have the feeling there might be a misunderstanding. Userspace is in
> > charge to actually allocate a portion of VA space and manage it. The
> > GPUVA manager just needs to know about those VA space allocations and
> > hence keeps track of them.
> >
> > The GPUVA manager is not meant to be an allocator in the sense of
> > finding and providing a hole for a given request.
> >
> > Maybe the non-ideal choice of using drm_mm was implying something else.
>
> Uff, well long story short that doesn't even remotely match the
> requirements. This way the GPUVA manager won't be usable for a whole
> bunch of use cases.
>
> What we have are mappings which say X needs to point to Y with this and
> hw dependent flags.
>
> The whole idea of having ranges is not going to fly. Neither with AMD
> GPUs and I strongly think not with Intels XA either.
>
> >> We should probably talk about the design of the GPUVA manager once
> >> more when this should be applicable to all GPU drivers.
> >
> > That's what I try to figure out with this RFC, how to make it
> > appicable for all GPU drivers, so I'm happy to discuss this. :-)
>
> Yeah, that was really good idea :) That proposal here is really far away
> from the actual requirements.
>
> >>> For sparse residency the kernel also needs to know the region
> >>> boundaries to make sure that it keeps sparse mappings around.
> >>
> >> What?
> >
> > When userspace creates a new VKBuffer with the
> > VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create
> > sparse mappings in order to ensure that using this buffer without any
> > memory backed mappings doesn't fault the GPU.
> >
> > Currently, the implementation does this the following way:
> >
> > 1. Userspace creates a new VKBuffer and hence allocates a portion of
> > the VA space for it. It calls into the kernel indicating the new VA
> > space region and the fact that the region is sparse.
> >
> > 2. The kernel picks up the region and stores it in the GPUVA manager,
> > the driver creates the corresponding sparse mappings / page table
> > entries.
> >
> > 3. Userspace might ask the driver to create a couple of memory backed
> > mappings for this particular VA region. The GPUVA manager stores the
> > mapping parameters, the driver creates the corresponding page table
> > entries.
> >
> > 4. Userspace might ask to unmap all the memory backed mappings from
> > this particular VA region. The GPUVA manager removes the mapping
> > parameters, the driver cleans up the corresponding page table entries.
> > However, the driver also needs to re-create the sparse mappings, since
> > it's a sparse buffer, hence it needs to know the boundaries of the
> > region it needs to create the sparse mappings in.
>
> Again, this is not how things are working. First of all the kernel
> absolutely should *NOT* know about those regions.
>
> What we have inside the kernel is the information what happens if an
> address X is accessed. On AMD HW this can be:
>
> 1. Route to the PCIe bus because the mapped BO is stored in system memory.
> 2. Route to the internal MC because the mapped BO is stored in local memory.
> 3. Route to other GPUs in the same hive.
> 4. Route to some doorbell to kick of other work.
> ...
> x. Ignore write, return 0 on reads (this is what is used for sparse
> mappings).
> x+1. Trigger a recoverable page fault. This is used for things like SVA.
> x+2. Trigger a non-recoverable page fault. This is used for things like
> unmapped regions where access is illegal.
>
> All this is plus some hw specific caching flags.
>
> When Vulkan allocates a sparse VKBuffer what should happen is the following:
>
> 1. The Vulkan driver somehow figures out a VA region A..B for the
> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), but
> essentially is currently driver specific.

There are NO plans to have drm_mm do VA region management, VA region
management will be in userspace in Mesa. Can we just not bring that up again?
This is for GPU VA tracking not management if that makes it easier we
could rename it.

>
> 2. The kernel gets a request to map the VA range A..B as sparse, meaning
> that it updates the page tables from A..B with the sparse setting.
>
> 3. User space asks kernel to map a couple of memory backings at location
> A+1, A+10, A+15 etc....

3.5?

Userspace asks the kernel to unmap A+1 so it can later map something
else in there?

What happens in that case, with a set of queued binds, do you just do
a new sparse mapping for A+1, does userspace decide that?

Dave.

>
> 4. The VKBuffer is de-allocated, userspace asks kernel to update region
> A..B to not map anything (usually triggers a non-recoverable fault).
>
> When you want to unify this between hw drivers I strongly suggest to
> completely start from scratch once more.
>
> First of all don't think about those mappings as VMAs, that won't work
> because VMAs are usually something large. Think of this as individual
> PTEs controlled by the application. similar how COW mappings and struct
> pages are handled inside the kernel.
>
> Then I would start with the VA allocation manager. You could probably
> base that on drm_mm. We handle it differently in amdgpu currently, but I
> think this is something we could change.
>
> Then come up with something close to the amdgpu VM system. I'm pretty
> sure that should work for Nouveau and Intel XA as well. In other words
> you just have a bunch of very very small structures which represents
> mappings and a larger structure which combine all mappings of a specific
> type, e.g. all mappings of a BO or all sparse mappings etc...
>
> Merging of regions is actually not mandatory. We don't do it in amdgpu
> and can live with the additional mappings pretty well. But I think this
> can differ between drivers.
>
> Regards,
> Christian.
>
Danilo Krummrich Jan. 27, 2023, 9:09 p.m. UTC | #12
On 1/27/23 16:17, Christian König wrote:
> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
>> [SNIP]
>>>>>
>>>>> What you want is one component for tracking the VA allocations 
>>>>> (drm_mm based) and a different component/interface for tracking the 
>>>>> VA mappings (probably rb tree based).
>>>>
>>>> That's what the GPUVA manager is doing. There are gpuva_regions 
>>>> which correspond to VA allocations and gpuvas which represent the 
>>>> mappings. Both are tracked separately (currently both with a 
>>>> separate drm_mm, though). However, the GPUVA manager needs to take 
>>>> regions into account when dealing with mappings to make sure the 
>>>> GPUVA manager doesn't propose drivers to merge over region 
>>>> boundaries. Speaking from userspace PoV, the kernel wouldn't merge 
>>>> mappings from different VKBuffer objects even if they're virtually 
>>>> and physically contiguous.
>>>
>>> That are two completely different things and shouldn't be handled in 
>>> a single component.
>>
>> They are different things, but they're related in a way that for 
>> handling the mappings (in particular merging and sparse) the GPUVA 
>> manager needs to know the VA allocation (or region) boundaries.
>>
>> I have the feeling there might be a misunderstanding. Userspace is in 
>> charge to actually allocate a portion of VA space and manage it. The 
>> GPUVA manager just needs to know about those VA space allocations and 
>> hence keeps track of them.
>>
>> The GPUVA manager is not meant to be an allocator in the sense of 
>> finding and providing a hole for a given request.
>>
>> Maybe the non-ideal choice of using drm_mm was implying something else.
> 
> Uff, well long story short that doesn't even remotely match the 
> requirements. This way the GPUVA manager won't be usable for a whole 
> bunch of use cases.
> 
> What we have are mappings which say X needs to point to Y with this and 
> hw dependent flags.
> 
> The whole idea of having ranges is not going to fly. Neither with AMD 
> GPUs and I strongly think not with Intels XA either.

A range in the sense of the GPUVA manager simply represents a VA space 
allocation (which in case of Nouveau is taken in userspace). Userspace 
allocates the portion of VA space and lets the kernel know about it. The 
current implementation needs that for the named reasons. So, I think 
there is no reason why this would work with one GPU, but not with 
another. It's just part of the design choice of the manager.

And I'm absolutely happy to discuss the details of the manager 
implementation though.

> 
>>> We should probably talk about the design of the GPUVA manager once 
>>> more when this should be applicable to all GPU drivers.
>>
>> That's what I try to figure out with this RFC, how to make it 
>> appicable for all GPU drivers, so I'm happy to discuss this. :-)
> 
> Yeah, that was really good idea :) That proposal here is really far away 
> from the actual requirements.
> 

And those are the ones I'm looking for. Do you mind sharing the 
requirements for amdgpu in particular?

>>>> For sparse residency the kernel also needs to know the region 
>>>> boundaries to make sure that it keeps sparse mappings around.
>>>
>>> What?
>>
>> When userspace creates a new VKBuffer with the 
>> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create 
>> sparse mappings in order to ensure that using this buffer without any 
>> memory backed mappings doesn't fault the GPU.
>>
>> Currently, the implementation does this the following way:
>>
>> 1. Userspace creates a new VKBuffer and hence allocates a portion of 
>> the VA space for it. It calls into the kernel indicating the new VA 
>> space region and the fact that the region is sparse.
>>
>> 2. The kernel picks up the region and stores it in the GPUVA manager, 
>> the driver creates the corresponding sparse mappings / page table 
>> entries.
>>
>> 3. Userspace might ask the driver to create a couple of memory backed 
>> mappings for this particular VA region. The GPUVA manager stores the 
>> mapping parameters, the driver creates the corresponding page table 
>> entries.
>>
>> 4. Userspace might ask to unmap all the memory backed mappings from 
>> this particular VA region. The GPUVA manager removes the mapping 
>> parameters, the driver cleans up the corresponding page table entries. 
>> However, the driver also needs to re-create the sparse mappings, since 
>> it's a sparse buffer, hence it needs to know the boundaries of the 
>> region it needs to create the sparse mappings in.
> 
> Again, this is not how things are working. First of all the kernel 
> absolutely should *NOT* know about those regions.
> 
> What we have inside the kernel is the information what happens if an 
> address X is accessed. On AMD HW this can be:
> 
> 1. Route to the PCIe bus because the mapped BO is stored in system memory.
> 2. Route to the internal MC because the mapped BO is stored in local 
> memory.
> 3. Route to other GPUs in the same hive.
> 4. Route to some doorbell to kick of other work.
> ...
> x. Ignore write, return 0 on reads (this is what is used for sparse 
> mappings).
> x+1. Trigger a recoverable page fault. This is used for things like SVA.
> x+2. Trigger a non-recoverable page fault. This is used for things like 
> unmapped regions where access is illegal.
> 
> All this is plus some hw specific caching flags.
> 
> When Vulkan allocates a sparse VKBuffer what should happen is the 
> following:
> 
> 1. The Vulkan driver somehow figures out a VA region A..B for the 
> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), but 
> essentially is currently driver specific.

Right, for Nouveau we have this in userspace as well.

> 
> 2. The kernel gets a request to map the VA range A..B as sparse, meaning 
> that it updates the page tables from A..B with the sparse setting.
> 
> 3. User space asks kernel to map a couple of memory backings at location 
> A+1, A+10, A+15 etc....
> 
> 4. The VKBuffer is de-allocated, userspace asks kernel to update region 
> A..B to not map anything (usually triggers a non-recoverable fault).

Until here this seems to be identical to what I'm doing.

It'd be interesting to know how amdgpu handles everything that 
potentially happens between your 3) and 4). More specifically, how are 
the page tables changed when memory backed mappings are mapped on a 
sparse range? What happens when the memory backed mappings are unmapped, 
but the VKBuffer isn't de-allocated, and hence sparse mappings need to 
be re-deployed?

Let's assume the sparse VKBuffer (and hence the VA space allocation) is 
pretty large. In Nouveau the corresponding PTEs would have a rather huge 
page size to cover this. Now, if small memory backed mappings are mapped 
to this huge sparse buffer, in Nouveau we'd allocate a new PT with a 
corresponding smaller page size overlaying the sparse mappings PTEs.

How would this look like in amdgpu?

> 
> When you want to unify this between hw drivers I strongly suggest to 
> completely start from scratch once more.
> 
> First of all don't think about those mappings as VMAs, that won't work 
> because VMAs are usually something large. Think of this as individual 
> PTEs controlled by the application. similar how COW mappings and struct 
> pages are handled inside the kernel.

Why do you consider tracking single PTEs superior to tracking VMAs? All 
the properties for a page you mentioned above should be equal for the 
entirety of pages of a whole (memory backed) mapping, aren't they?

> 
> Then I would start with the VA allocation manager. You could probably 
> base that on drm_mm. We handle it differently in amdgpu currently, but I 
> think this is something we could change.

It was not my intention to come up with an actual allocator for the VA 
space in the sense of actually finding a free and fitting hole in the VA 
space.

For Nouveau (and XE, I think) we have this in userspace and from what 
you've written previously I thought the same applies for amdgpu?

> 
> Then come up with something close to the amdgpu VM system. I'm pretty 
> sure that should work for Nouveau and Intel XA as well. In other words 
> you just have a bunch of very very small structures which represents 
> mappings and a larger structure which combine all mappings of a specific 
> type, e.g. all mappings of a BO or all sparse mappings etc...

Considering what you wrote above I assume that small structures / 
mappings in this paragraph refer to PTEs.

Immediately, I don't really see how this fine grained resolution of 
single PTEs would help implementing this in Nouveau. Actually, I think 
it would even complicate the handling of PTs, but I would need to think 
about this a bit more.

> 
> Merging of regions is actually not mandatory. We don't do it in amdgpu 
> and can live with the additional mappings pretty well. But I think this 
> can differ between drivers.
> 
> Regards,
> Christian.
>
Danilo Krummrich Jan. 29, 2023, 6:46 p.m. UTC | #13
On 1/27/23 22:09, Danilo Krummrich wrote:
> On 1/27/23 16:17, Christian König wrote:
>> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
>>> [SNIP]
>>>>>>
>>>>>> What you want is one component for tracking the VA allocations 
>>>>>> (drm_mm based) and a different component/interface for tracking 
>>>>>> the VA mappings (probably rb tree based).
>>>>>
>>>>> That's what the GPUVA manager is doing. There are gpuva_regions 
>>>>> which correspond to VA allocations and gpuvas which represent the 
>>>>> mappings. Both are tracked separately (currently both with a 
>>>>> separate drm_mm, though). However, the GPUVA manager needs to take 
>>>>> regions into account when dealing with mappings to make sure the 
>>>>> GPUVA manager doesn't propose drivers to merge over region 
>>>>> boundaries. Speaking from userspace PoV, the kernel wouldn't merge 
>>>>> mappings from different VKBuffer objects even if they're virtually 
>>>>> and physically contiguous.
>>>>
>>>> That are two completely different things and shouldn't be handled in 
>>>> a single component.
>>>
>>> They are different things, but they're related in a way that for 
>>> handling the mappings (in particular merging and sparse) the GPUVA 
>>> manager needs to know the VA allocation (or region) boundaries.
>>>
>>> I have the feeling there might be a misunderstanding. Userspace is in 
>>> charge to actually allocate a portion of VA space and manage it. The 
>>> GPUVA manager just needs to know about those VA space allocations and 
>>> hence keeps track of them.
>>>
>>> The GPUVA manager is not meant to be an allocator in the sense of 
>>> finding and providing a hole for a given request.
>>>
>>> Maybe the non-ideal choice of using drm_mm was implying something else.
>>
>> Uff, well long story short that doesn't even remotely match the 
>> requirements. This way the GPUVA manager won't be usable for a whole 
>> bunch of use cases.
>>
>> What we have are mappings which say X needs to point to Y with this 
>> and hw dependent flags.
>>
>> The whole idea of having ranges is not going to fly. Neither with AMD 
>> GPUs and I strongly think not with Intels XA either.
> 
> A range in the sense of the GPUVA manager simply represents a VA space 
> allocation (which in case of Nouveau is taken in userspace). Userspace 
> allocates the portion of VA space and lets the kernel know about it. The 
> current implementation needs that for the named reasons. So, I think 
> there is no reason why this would work with one GPU, but not with 
> another. It's just part of the design choice of the manager.
> 
> And I'm absolutely happy to discuss the details of the manager 
> implementation though.
> 
>>
>>>> We should probably talk about the design of the GPUVA manager once 
>>>> more when this should be applicable to all GPU drivers.
>>>
>>> That's what I try to figure out with this RFC, how to make it 
>>> appicable for all GPU drivers, so I'm happy to discuss this. :-)
>>
>> Yeah, that was really good idea :) That proposal here is really far 
>> away from the actual requirements.
>>
> 
> And those are the ones I'm looking for. Do you mind sharing the 
> requirements for amdgpu in particular?
> 
>>>>> For sparse residency the kernel also needs to know the region 
>>>>> boundaries to make sure that it keeps sparse mappings around.
>>>>
>>>> What?
>>>
>>> When userspace creates a new VKBuffer with the 
>>> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create 
>>> sparse mappings in order to ensure that using this buffer without any 
>>> memory backed mappings doesn't fault the GPU.
>>>
>>> Currently, the implementation does this the following way:
>>>
>>> 1. Userspace creates a new VKBuffer and hence allocates a portion of 
>>> the VA space for it. It calls into the kernel indicating the new VA 
>>> space region and the fact that the region is sparse.
>>>
>>> 2. The kernel picks up the region and stores it in the GPUVA manager, 
>>> the driver creates the corresponding sparse mappings / page table 
>>> entries.
>>>
>>> 3. Userspace might ask the driver to create a couple of memory backed 
>>> mappings for this particular VA region. The GPUVA manager stores the 
>>> mapping parameters, the driver creates the corresponding page table 
>>> entries.
>>>
>>> 4. Userspace might ask to unmap all the memory backed mappings from 
>>> this particular VA region. The GPUVA manager removes the mapping 
>>> parameters, the driver cleans up the corresponding page table 
>>> entries. However, the driver also needs to re-create the sparse 
>>> mappings, since it's a sparse buffer, hence it needs to know the 
>>> boundaries of the region it needs to create the sparse mappings in.
>>
>> Again, this is not how things are working. First of all the kernel 
>> absolutely should *NOT* know about those regions.
>>
>> What we have inside the kernel is the information what happens if an 
>> address X is accessed. On AMD HW this can be:
>>
>> 1. Route to the PCIe bus because the mapped BO is stored in system 
>> memory.
>> 2. Route to the internal MC because the mapped BO is stored in local 
>> memory.
>> 3. Route to other GPUs in the same hive.
>> 4. Route to some doorbell to kick of other work.
>> ...
>> x. Ignore write, return 0 on reads (this is what is used for sparse 
>> mappings).
>> x+1. Trigger a recoverable page fault. This is used for things like SVA.
>> x+2. Trigger a non-recoverable page fault. This is used for things 
>> like unmapped regions where access is illegal.
>>
>> All this is plus some hw specific caching flags.
>>
>> When Vulkan allocates a sparse VKBuffer what should happen is the 
>> following:
>>
>> 1. The Vulkan driver somehow figures out a VA region A..B for the 
>> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), 
>> but essentially is currently driver specific.
> 
> Right, for Nouveau we have this in userspace as well.
> 
>>
>> 2. The kernel gets a request to map the VA range A..B as sparse, 
>> meaning that it updates the page tables from A..B with the sparse 
>> setting.
>>
>> 3. User space asks kernel to map a couple of memory backings at 
>> location A+1, A+10, A+15 etc....
>>
>> 4. The VKBuffer is de-allocated, userspace asks kernel to update 
>> region A..B to not map anything (usually triggers a non-recoverable 
>> fault).
> 
> Until here this seems to be identical to what I'm doing.
> 
> It'd be interesting to know how amdgpu handles everything that 
> potentially happens between your 3) and 4). More specifically, how are 
> the page tables changed when memory backed mappings are mapped on a 
> sparse range? What happens when the memory backed mappings are unmapped, 
> but the VKBuffer isn't de-allocated, and hence sparse mappings need to 
> be re-deployed?
> 
> Let's assume the sparse VKBuffer (and hence the VA space allocation) is 
> pretty large. In Nouveau the corresponding PTEs would have a rather huge 
> page size to cover this. Now, if small memory backed mappings are mapped 
> to this huge sparse buffer, in Nouveau we'd allocate a new PT with a 
> corresponding smaller page size overlaying the sparse mappings PTEs.
> 
> How would this look like in amdgpu?
> 
>>
>> When you want to unify this between hw drivers I strongly suggest to 
>> completely start from scratch once more.
>>

I just took some time digging into amdgpu and, surprisingly, aside from 
the gpuva_regions it seems like amdgpu basically does exactly the same 
as I do in the GPU VA manager. As explained, those region boundaries are 
needed for merging only and, depending on the driver, might be useful 
for sparse mappings.

For drivers that don't intend to merge at all and (somehow) are capable 
of dealing with sparse regions without knowing the sparse region's 
boundaries, it'd be easy to make those gpuva_regions optional.

>> First of all don't think about those mappings as VMAs, that won't work 
>> because VMAs are usually something large. Think of this as individual 
>> PTEs controlled by the application. similar how COW mappings and 
>> struct pages are handled inside the kernel.
> 
> Why do you consider tracking single PTEs superior to tracking VMAs? All 
> the properties for a page you mentioned above should be equal for the 
> entirety of pages of a whole (memory backed) mapping, aren't they?
> 
>>
>> Then I would start with the VA allocation manager. You could probably 
>> base that on drm_mm. We handle it differently in amdgpu currently, but 
>> I think this is something we could change.
> 
> It was not my intention to come up with an actual allocator for the VA 
> space in the sense of actually finding a free and fitting hole in the VA 
> space.
> 
> For Nouveau (and XE, I think) we have this in userspace and from what 
> you've written previously I thought the same applies for amdgpu?
> 
>>
>> Then come up with something close to the amdgpu VM system. I'm pretty 
>> sure that should work for Nouveau and Intel XA as well. In other words 
>> you just have a bunch of very very small structures which represents 
>> mappings and a larger structure which combine all mappings of a 
>> specific type, e.g. all mappings of a BO or all sparse mappings etc...
> 
> Considering what you wrote above I assume that small structures / 
> mappings in this paragraph refer to PTEs.
> 
> Immediately, I don't really see how this fine grained resolution of 
> single PTEs would help implementing this in Nouveau. Actually, I think 
> it would even complicate the handling of PTs, but I would need to think 
> about this a bit more.
> 
>>
>> Merging of regions is actually not mandatory. We don't do it in amdgpu 
>> and can live with the additional mappings pretty well. But I think 
>> this can differ between drivers.
>>
>> Regards,
>> Christian.
>>
Christian König Jan. 30, 2023, 12:58 p.m. UTC | #14
Am 27.01.23 um 21:25 schrieb David Airlie:
> [SNIP]
>> What we have inside the kernel is the information what happens if an
>> address X is accessed. On AMD HW this can be:
>>
>> 1. Route to the PCIe bus because the mapped BO is stored in system memory.
>> 2. Route to the internal MC because the mapped BO is stored in local memory.
>> 3. Route to other GPUs in the same hive.
>> 4. Route to some doorbell to kick of other work.
>> ...
>> x. Ignore write, return 0 on reads (this is what is used for sparse
>> mappings).
>> x+1. Trigger a recoverable page fault. This is used for things like SVA.
>> x+2. Trigger a non-recoverable page fault. This is used for things like
>> unmapped regions where access is illegal.
>>
>> All this is plus some hw specific caching flags.
>>
>> When Vulkan allocates a sparse VKBuffer what should happen is the following:
>>
>> 1. The Vulkan driver somehow figures out a VA region A..B for the
>> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), but
>> essentially is currently driver specific.
> There are NO plans to have drm_mm do VA region management, VA region
> management will be in userspace in Mesa. Can we just not bring that up again?

If we are talking about Mesa drivers then yes that should work because 
they can then implement all the hw specific quirks you need for VA 
allocation. If the VA allocation should be hw independent then we have a 
major problem here.

At least on AMD hw we have four different address spaces and even if you 
know of hand from which one you want to allocate you need to share your 
address space between Vulkan, VA-API and potentially even things like 
ROCm/OpenCL.

If we don't properly do that then the AMD user space tools for debugging 
and profiling (RMV, UMR etc...) won't work any more.

> This is for GPU VA tracking not management if that makes it easier we
> could rename it.
>
>> 2. The kernel gets a request to map the VA range A..B as sparse, meaning
>> that it updates the page tables from A..B with the sparse setting.
>>
>> 3. User space asks kernel to map a couple of memory backings at location
>> A+1, A+10, A+15 etc....
> 3.5?
>
> Userspace asks the kernel to unmap A+1 so it can later map something
> else in there?
>
> What happens in that case, with a set of queued binds, do you just do
> a new sparse mapping for A+1, does userspace decide that?

Yes, exactly that. Essentially there are no unmap operation from the 
kernel pov.

You just tell the kernel what should happen when the hw tries to resolve 
address X.

This what can happen can potentially be resolve to some buffer memory, 
ignored for sparse binding or generate a fault. This is stuff which is 
most likely common to all drivers.

But then at least on newer AMD hardware we also have things like raise a 
debug trap on access, wait forever until a debugger tells you to 
continue.....

It would be great if we could have the common stuff for a VA update 
IOCTL common for all drivers, e.g. in/out fences, range description 
(start, offset, end....), GEM handle in a standardized structure while 
still be able to handle all that hw specific stuff as well.

Christian.

>
> Dave.
>
>> 4. The VKBuffer is de-allocated, userspace asks kernel to update region
>> A..B to not map anything (usually triggers a non-recoverable fault).
>>
>> When you want to unify this between hw drivers I strongly suggest to
>> completely start from scratch once more.
>>
>> First of all don't think about those mappings as VMAs, that won't work
>> because VMAs are usually something large. Think of this as individual
>> PTEs controlled by the application. similar how COW mappings and struct
>> pages are handled inside the kernel.
>>
>> Then I would start with the VA allocation manager. You could probably
>> base that on drm_mm. We handle it differently in amdgpu currently, but I
>> think this is something we could change.
>>
>> Then come up with something close to the amdgpu VM system. I'm pretty
>> sure that should work for Nouveau and Intel XA as well. In other words
>> you just have a bunch of very very small structures which represents
>> mappings and a larger structure which combine all mappings of a specific
>> type, e.g. all mappings of a BO or all sparse mappings etc...
>>
>> Merging of regions is actually not mandatory. We don't do it in amdgpu
>> and can live with the additional mappings pretty well. But I think this
>> can differ between drivers.
>>
>> Regards,
>> Christian.
>>
Christian König Jan. 30, 2023, 1:02 p.m. UTC | #15
Am 29.01.23 um 19:46 schrieb Danilo Krummrich:
> On 1/27/23 22:09, Danilo Krummrich wrote:
>> On 1/27/23 16:17, Christian König wrote:
>>> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
>>>> [SNIP]
>>>>>>>
>>>>>>> What you want is one component for tracking the VA allocations 
>>>>>>> (drm_mm based) and a different component/interface for tracking 
>>>>>>> the VA mappings (probably rb tree based).
>>>>>>
>>>>>> That's what the GPUVA manager is doing. There are gpuva_regions 
>>>>>> which correspond to VA allocations and gpuvas which represent the 
>>>>>> mappings. Both are tracked separately (currently both with a 
>>>>>> separate drm_mm, though). However, the GPUVA manager needs to 
>>>>>> take regions into account when dealing with mappings to make sure 
>>>>>> the GPUVA manager doesn't propose drivers to merge over region 
>>>>>> boundaries. Speaking from userspace PoV, the kernel wouldn't 
>>>>>> merge mappings from different VKBuffer objects even if they're 
>>>>>> virtually and physically contiguous.
>>>>>
>>>>> That are two completely different things and shouldn't be handled 
>>>>> in a single component.
>>>>
>>>> They are different things, but they're related in a way that for 
>>>> handling the mappings (in particular merging and sparse) the GPUVA 
>>>> manager needs to know the VA allocation (or region) boundaries.
>>>>
>>>> I have the feeling there might be a misunderstanding. Userspace is 
>>>> in charge to actually allocate a portion of VA space and manage it. 
>>>> The GPUVA manager just needs to know about those VA space 
>>>> allocations and hence keeps track of them.
>>>>
>>>> The GPUVA manager is not meant to be an allocator in the sense of 
>>>> finding and providing a hole for a given request.
>>>>
>>>> Maybe the non-ideal choice of using drm_mm was implying something 
>>>> else.
>>>
>>> Uff, well long story short that doesn't even remotely match the 
>>> requirements. This way the GPUVA manager won't be usable for a whole 
>>> bunch of use cases.
>>>
>>> What we have are mappings which say X needs to point to Y with this 
>>> and hw dependent flags.
>>>
>>> The whole idea of having ranges is not going to fly. Neither with 
>>> AMD GPUs and I strongly think not with Intels XA either.
>>
>> A range in the sense of the GPUVA manager simply represents a VA 
>> space allocation (which in case of Nouveau is taken in userspace). 
>> Userspace allocates the portion of VA space and lets the kernel know 
>> about it. The current implementation needs that for the named 
>> reasons. So, I think there is no reason why this would work with one 
>> GPU, but not with another. It's just part of the design choice of the 
>> manager.
>>
>> And I'm absolutely happy to discuss the details of the manager 
>> implementation though.
>>
>>>
>>>>> We should probably talk about the design of the GPUVA manager once 
>>>>> more when this should be applicable to all GPU drivers.
>>>>
>>>> That's what I try to figure out with this RFC, how to make it 
>>>> appicable for all GPU drivers, so I'm happy to discuss this. :-)
>>>
>>> Yeah, that was really good idea :) That proposal here is really far 
>>> away from the actual requirements.
>>>
>>
>> And those are the ones I'm looking for. Do you mind sharing the 
>> requirements for amdgpu in particular?
>>
>>>>>> For sparse residency the kernel also needs to know the region 
>>>>>> boundaries to make sure that it keeps sparse mappings around.
>>>>>
>>>>> What?
>>>>
>>>> When userspace creates a new VKBuffer with the 
>>>> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create 
>>>> sparse mappings in order to ensure that using this buffer without 
>>>> any memory backed mappings doesn't fault the GPU.
>>>>
>>>> Currently, the implementation does this the following way:
>>>>
>>>> 1. Userspace creates a new VKBuffer and hence allocates a portion 
>>>> of the VA space for it. It calls into the kernel indicating the new 
>>>> VA space region and the fact that the region is sparse.
>>>>
>>>> 2. The kernel picks up the region and stores it in the GPUVA 
>>>> manager, the driver creates the corresponding sparse mappings / 
>>>> page table entries.
>>>>
>>>> 3. Userspace might ask the driver to create a couple of memory 
>>>> backed mappings for this particular VA region. The GPUVA manager 
>>>> stores the mapping parameters, the driver creates the corresponding 
>>>> page table entries.
>>>>
>>>> 4. Userspace might ask to unmap all the memory backed mappings from 
>>>> this particular VA region. The GPUVA manager removes the mapping 
>>>> parameters, the driver cleans up the corresponding page table 
>>>> entries. However, the driver also needs to re-create the sparse 
>>>> mappings, since it's a sparse buffer, hence it needs to know the 
>>>> boundaries of the region it needs to create the sparse mappings in.
>>>
>>> Again, this is not how things are working. First of all the kernel 
>>> absolutely should *NOT* know about those regions.
>>>
>>> What we have inside the kernel is the information what happens if an 
>>> address X is accessed. On AMD HW this can be:
>>>
>>> 1. Route to the PCIe bus because the mapped BO is stored in system 
>>> memory.
>>> 2. Route to the internal MC because the mapped BO is stored in local 
>>> memory.
>>> 3. Route to other GPUs in the same hive.
>>> 4. Route to some doorbell to kick of other work.
>>> ...
>>> x. Ignore write, return 0 on reads (this is what is used for sparse 
>>> mappings).
>>> x+1. Trigger a recoverable page fault. This is used for things like 
>>> SVA.
>>> x+2. Trigger a non-recoverable page fault. This is used for things 
>>> like unmapped regions where access is illegal.
>>>
>>> All this is plus some hw specific caching flags.
>>>
>>> When Vulkan allocates a sparse VKBuffer what should happen is the 
>>> following:
>>>
>>> 1. The Vulkan driver somehow figures out a VA region A..B for the 
>>> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), 
>>> but essentially is currently driver specific.
>>
>> Right, for Nouveau we have this in userspace as well.
>>
>>>
>>> 2. The kernel gets a request to map the VA range A..B as sparse, 
>>> meaning that it updates the page tables from A..B with the sparse 
>>> setting.
>>>
>>> 3. User space asks kernel to map a couple of memory backings at 
>>> location A+1, A+10, A+15 etc....
>>>
>>> 4. The VKBuffer is de-allocated, userspace asks kernel to update 
>>> region A..B to not map anything (usually triggers a non-recoverable 
>>> fault).
>>
>> Until here this seems to be identical to what I'm doing.
>>
>> It'd be interesting to know how amdgpu handles everything that 
>> potentially happens between your 3) and 4). More specifically, how 
>> are the page tables changed when memory backed mappings are mapped on 
>> a sparse range? What happens when the memory backed mappings are 
>> unmapped, but the VKBuffer isn't de-allocated, and hence sparse 
>> mappings need to be re-deployed?
>>
>> Let's assume the sparse VKBuffer (and hence the VA space allocation) 
>> is pretty large. In Nouveau the corresponding PTEs would have a 
>> rather huge page size to cover this. Now, if small memory backed 
>> mappings are mapped to this huge sparse buffer, in Nouveau we'd 
>> allocate a new PT with a corresponding smaller page size overlaying 
>> the sparse mappings PTEs.
>>
>> How would this look like in amdgpu?
>>
>>>
>>> When you want to unify this between hw drivers I strongly suggest to 
>>> completely start from scratch once more.
>>>
>
> I just took some time digging into amdgpu and, surprisingly, aside 
> from the gpuva_regions it seems like amdgpu basically does exactly the 
> same as I do in the GPU VA manager. As explained, those region 
> boundaries are needed for merging only and, depending on the driver, 
> might be useful for sparse mappings.
>
> For drivers that don't intend to merge at all and (somehow) are 
> capable of dealing with sparse regions without knowing the sparse 
> region's boundaries, it'd be easy to make those gpuva_regions optional.

Yeah, but this then defeats the approach of having the same hw 
independent interface/implementation for all drivers.

Let me ask the other way around how does the hw implementation of a 
sparse mapping looks like for NVidia based hardware?

For newer AMD hw its a flag in the page tables, for older hw its a 
register where you can specify ranges A..B. We don't really support the 
later with AMDGPU any more, but from this interface I would guess you 
have the second variant, right?

Christian.

>
>>> First of all don't think about those mappings as VMAs, that won't 
>>> work because VMAs are usually something large. Think of this as 
>>> individual PTEs controlled by the application. similar how COW 
>>> mappings and struct pages are handled inside the kernel.
>>
>> Why do you consider tracking single PTEs superior to tracking VMAs? 
>> All the properties for a page you mentioned above should be equal for 
>> the entirety of pages of a whole (memory backed) mapping, aren't they?
>>
>>>
>>> Then I would start with the VA allocation manager. You could 
>>> probably base that on drm_mm. We handle it differently in amdgpu 
>>> currently, but I think this is something we could change.
>>
>> It was not my intention to come up with an actual allocator for the 
>> VA space in the sense of actually finding a free and fitting hole in 
>> the VA space.
>>
>> For Nouveau (and XE, I think) we have this in userspace and from what 
>> you've written previously I thought the same applies for amdgpu?
>>
>>>
>>> Then come up with something close to the amdgpu VM system. I'm 
>>> pretty sure that should work for Nouveau and Intel XA as well. In 
>>> other words you just have a bunch of very very small structures 
>>> which represents mappings and a larger structure which combine all 
>>> mappings of a specific type, e.g. all mappings of a BO or all sparse 
>>> mappings etc...
>>
>> Considering what you wrote above I assume that small structures / 
>> mappings in this paragraph refer to PTEs.
>>
>> Immediately, I don't really see how this fine grained resolution of 
>> single PTEs would help implementing this in Nouveau. Actually, I 
>> think it would even complicate the handling of PTs, but I would need 
>> to think about this a bit more.
>>
>>>
>>> Merging of regions is actually not mandatory. We don't do it in 
>>> amdgpu and can live with the additional mappings pretty well. But I 
>>> think this can differ between drivers.
>>>
>>> Regards,
>>> Christian.
>>>
>
Danilo Krummrich Jan. 30, 2023, 11:38 p.m. UTC | #16
On 1/30/23 14:02, Christian König wrote:
> Am 29.01.23 um 19:46 schrieb Danilo Krummrich:
>> On 1/27/23 22:09, Danilo Krummrich wrote:
>>> On 1/27/23 16:17, Christian König wrote:
>>>> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
>>>>> [SNIP]
>>>>>>>>
>>>>>>>> What you want is one component for tracking the VA allocations 
>>>>>>>> (drm_mm based) and a different component/interface for tracking 
>>>>>>>> the VA mappings (probably rb tree based).
>>>>>>>
>>>>>>> That's what the GPUVA manager is doing. There are gpuva_regions 
>>>>>>> which correspond to VA allocations and gpuvas which represent the 
>>>>>>> mappings. Both are tracked separately (currently both with a 
>>>>>>> separate drm_mm, though). However, the GPUVA manager needs to 
>>>>>>> take regions into account when dealing with mappings to make sure 
>>>>>>> the GPUVA manager doesn't propose drivers to merge over region 
>>>>>>> boundaries. Speaking from userspace PoV, the kernel wouldn't 
>>>>>>> merge mappings from different VKBuffer objects even if they're 
>>>>>>> virtually and physically contiguous.
>>>>>>
>>>>>> That are two completely different things and shouldn't be handled 
>>>>>> in a single component.
>>>>>
>>>>> They are different things, but they're related in a way that for 
>>>>> handling the mappings (in particular merging and sparse) the GPUVA 
>>>>> manager needs to know the VA allocation (or region) boundaries.
>>>>>
>>>>> I have the feeling there might be a misunderstanding. Userspace is 
>>>>> in charge to actually allocate a portion of VA space and manage it. 
>>>>> The GPUVA manager just needs to know about those VA space 
>>>>> allocations and hence keeps track of them.
>>>>>
>>>>> The GPUVA manager is not meant to be an allocator in the sense of 
>>>>> finding and providing a hole for a given request.
>>>>>
>>>>> Maybe the non-ideal choice of using drm_mm was implying something 
>>>>> else.
>>>>
>>>> Uff, well long story short that doesn't even remotely match the 
>>>> requirements. This way the GPUVA manager won't be usable for a whole 
>>>> bunch of use cases.
>>>>
>>>> What we have are mappings which say X needs to point to Y with this 
>>>> and hw dependent flags.
>>>>
>>>> The whole idea of having ranges is not going to fly. Neither with 
>>>> AMD GPUs and I strongly think not with Intels XA either.
>>>
>>> A range in the sense of the GPUVA manager simply represents a VA 
>>> space allocation (which in case of Nouveau is taken in userspace). 
>>> Userspace allocates the portion of VA space and lets the kernel know 
>>> about it. The current implementation needs that for the named 
>>> reasons. So, I think there is no reason why this would work with one 
>>> GPU, but not with another. It's just part of the design choice of the 
>>> manager.
>>>
>>> And I'm absolutely happy to discuss the details of the manager 
>>> implementation though.
>>>
>>>>
>>>>>> We should probably talk about the design of the GPUVA manager once 
>>>>>> more when this should be applicable to all GPU drivers.
>>>>>
>>>>> That's what I try to figure out with this RFC, how to make it 
>>>>> appicable for all GPU drivers, so I'm happy to discuss this. :-)
>>>>
>>>> Yeah, that was really good idea :) That proposal here is really far 
>>>> away from the actual requirements.
>>>>
>>>
>>> And those are the ones I'm looking for. Do you mind sharing the 
>>> requirements for amdgpu in particular?
>>>
>>>>>>> For sparse residency the kernel also needs to know the region 
>>>>>>> boundaries to make sure that it keeps sparse mappings around.
>>>>>>
>>>>>> What?
>>>>>
>>>>> When userspace creates a new VKBuffer with the 
>>>>> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create 
>>>>> sparse mappings in order to ensure that using this buffer without 
>>>>> any memory backed mappings doesn't fault the GPU.
>>>>>
>>>>> Currently, the implementation does this the following way:
>>>>>
>>>>> 1. Userspace creates a new VKBuffer and hence allocates a portion 
>>>>> of the VA space for it. It calls into the kernel indicating the new 
>>>>> VA space region and the fact that the region is sparse.
>>>>>
>>>>> 2. The kernel picks up the region and stores it in the GPUVA 
>>>>> manager, the driver creates the corresponding sparse mappings / 
>>>>> page table entries.
>>>>>
>>>>> 3. Userspace might ask the driver to create a couple of memory 
>>>>> backed mappings for this particular VA region. The GPUVA manager 
>>>>> stores the mapping parameters, the driver creates the corresponding 
>>>>> page table entries.
>>>>>
>>>>> 4. Userspace might ask to unmap all the memory backed mappings from 
>>>>> this particular VA region. The GPUVA manager removes the mapping 
>>>>> parameters, the driver cleans up the corresponding page table 
>>>>> entries. However, the driver also needs to re-create the sparse 
>>>>> mappings, since it's a sparse buffer, hence it needs to know the 
>>>>> boundaries of the region it needs to create the sparse mappings in.
>>>>
>>>> Again, this is not how things are working. First of all the kernel 
>>>> absolutely should *NOT* know about those regions.
>>>>
>>>> What we have inside the kernel is the information what happens if an 
>>>> address X is accessed. On AMD HW this can be:
>>>>
>>>> 1. Route to the PCIe bus because the mapped BO is stored in system 
>>>> memory.
>>>> 2. Route to the internal MC because the mapped BO is stored in local 
>>>> memory.
>>>> 3. Route to other GPUs in the same hive.
>>>> 4. Route to some doorbell to kick of other work.
>>>> ...
>>>> x. Ignore write, return 0 on reads (this is what is used for sparse 
>>>> mappings).
>>>> x+1. Trigger a recoverable page fault. This is used for things like 
>>>> SVA.
>>>> x+2. Trigger a non-recoverable page fault. This is used for things 
>>>> like unmapped regions where access is illegal.
>>>>
>>>> All this is plus some hw specific caching flags.
>>>>
>>>> When Vulkan allocates a sparse VKBuffer what should happen is the 
>>>> following:
>>>>
>>>> 1. The Vulkan driver somehow figures out a VA region A..B for the 
>>>> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm), 
>>>> but essentially is currently driver specific.
>>>
>>> Right, for Nouveau we have this in userspace as well.
>>>
>>>>
>>>> 2. The kernel gets a request to map the VA range A..B as sparse, 
>>>> meaning that it updates the page tables from A..B with the sparse 
>>>> setting.
>>>>
>>>> 3. User space asks kernel to map a couple of memory backings at 
>>>> location A+1, A+10, A+15 etc....
>>>>
>>>> 4. The VKBuffer is de-allocated, userspace asks kernel to update 
>>>> region A..B to not map anything (usually triggers a non-recoverable 
>>>> fault).
>>>
>>> Until here this seems to be identical to what I'm doing.
>>>
>>> It'd be interesting to know how amdgpu handles everything that 
>>> potentially happens between your 3) and 4). More specifically, how 
>>> are the page tables changed when memory backed mappings are mapped on 
>>> a sparse range? What happens when the memory backed mappings are 
>>> unmapped, but the VKBuffer isn't de-allocated, and hence sparse 
>>> mappings need to be re-deployed?
>>>
>>> Let's assume the sparse VKBuffer (and hence the VA space allocation) 
>>> is pretty large. In Nouveau the corresponding PTEs would have a 
>>> rather huge page size to cover this. Now, if small memory backed 
>>> mappings are mapped to this huge sparse buffer, in Nouveau we'd 
>>> allocate a new PT with a corresponding smaller page size overlaying 
>>> the sparse mappings PTEs.
>>>
>>> How would this look like in amdgpu?
>>>
>>>>
>>>> When you want to unify this between hw drivers I strongly suggest to 
>>>> completely start from scratch once more.
>>>>
>>
>> I just took some time digging into amdgpu and, surprisingly, aside 
>> from the gpuva_regions it seems like amdgpu basically does exactly the 
>> same as I do in the GPU VA manager. As explained, those region 
>> boundaries are needed for merging only and, depending on the driver, 
>> might be useful for sparse mappings.
>>
>> For drivers that don't intend to merge at all and (somehow) are 
>> capable of dealing with sparse regions without knowing the sparse 
>> region's boundaries, it'd be easy to make those gpuva_regions optional.
> 
> Yeah, but this then defeats the approach of having the same hw 
> independent interface/implementation for all drivers.

That's probably a question of interpretation and I'd rather see it as an 
optional feature. Probably 80% to 90% of the code is for tracking 
mappings, generating split / merge steps on bind / unbind and connect 
the mappings to GEM objects. This would be the same for all the drivers 
and some might opt-in for using the feature of additionally tracking 
regions on top and other won't.

> 
> Let me ask the other way around how does the hw implementation of a 
> sparse mapping looks like for NVidia based hardware?
> 
> For newer AMD hw its a flag in the page tables, for older hw its a 
> register where you can specify ranges A..B. We don't really support the 
> later with AMDGPU any more, but from this interface I would guess you 
> have the second variant, right?

No, it's a flag in the PTEs as well.

However, for a rather huge sparse region the sparse PTEs might have a 
different (larger) page size than the PTEs of the (smaller) memory 
backed mappings. Hence, it might be that when trying to map a small 
memory backed mapping within a huge sparse region, we can *not* just 
change the sparse PTE to point to actual memory, but rather create a new 
PT with a smaller page size kind of overlaying the page table containing 
the sparse PTEs with a greater page size.

In such a situation, tracking the whole (sparse) region (representing 
the whole VA allocation) separately comes in handy.

And of course, as mentioned, tracking regions gives us the bounds for 
merging, which e.g. might be useful to pick a greater page size for 
merged mappings. It might also keep the amount of mappings to track down 
by a little bit, however, this would probably only be relevant if we 
actually have quite a few to merge.

> 
> Christian.
> 
>>
>>>> First of all don't think about those mappings as VMAs, that won't 
>>>> work because VMAs are usually something large. Think of this as 
>>>> individual PTEs controlled by the application. similar how COW 
>>>> mappings and struct pages are handled inside the kernel.
>>>
>>> Why do you consider tracking single PTEs superior to tracking VMAs? 
>>> All the properties for a page you mentioned above should be equal for 
>>> the entirety of pages of a whole (memory backed) mapping, aren't they?
>>>
>>>>
>>>> Then I would start with the VA allocation manager. You could 
>>>> probably base that on drm_mm. We handle it differently in amdgpu 
>>>> currently, but I think this is something we could change.
>>>
>>> It was not my intention to come up with an actual allocator for the 
>>> VA space in the sense of actually finding a free and fitting hole in 
>>> the VA space.
>>>
>>> For Nouveau (and XE, I think) we have this in userspace and from what 
>>> you've written previously I thought the same applies for amdgpu?
>>>
>>>>
>>>> Then come up with something close to the amdgpu VM system. I'm 
>>>> pretty sure that should work for Nouveau and Intel XA as well. In 
>>>> other words you just have a bunch of very very small structures 
>>>> which represents mappings and a larger structure which combine all 
>>>> mappings of a specific type, e.g. all mappings of a BO or all sparse 
>>>> mappings etc...
>>>
>>> Considering what you wrote above I assume that small structures / 
>>> mappings in this paragraph refer to PTEs.
>>>
>>> Immediately, I don't really see how this fine grained resolution of 
>>> single PTEs would help implementing this in Nouveau. Actually, I 
>>> think it would even complicate the handling of PTs, but I would need 
>>> to think about this a bit more.
>>>
>>>>
>>>> Merging of regions is actually not mandatory. We don't do it in 
>>>> amdgpu and can live with the additional mappings pretty well. But I 
>>>> think this can differ between drivers.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>
>
Dave Airlie Feb. 1, 2023, 8:10 a.m. UTC | #17
On Mon, 30 Jan 2023 at 23:02, Christian König <christian.koenig@amd.com> wrote:
>
> Am 29.01.23 um 19:46 schrieb Danilo Krummrich:
> > On 1/27/23 22:09, Danilo Krummrich wrote:
> >> On 1/27/23 16:17, Christian König wrote:
> >>> Am 27.01.23 um 15:44 schrieb Danilo Krummrich:
> >>>> [SNIP]
> >>>>>>>
> >>>>>>> What you want is one component for tracking the VA allocations
> >>>>>>> (drm_mm based) and a different component/interface for tracking
> >>>>>>> the VA mappings (probably rb tree based).
> >>>>>>
> >>>>>> That's what the GPUVA manager is doing. There are gpuva_regions
> >>>>>> which correspond to VA allocations and gpuvas which represent the
> >>>>>> mappings. Both are tracked separately (currently both with a
> >>>>>> separate drm_mm, though). However, the GPUVA manager needs to
> >>>>>> take regions into account when dealing with mappings to make sure
> >>>>>> the GPUVA manager doesn't propose drivers to merge over region
> >>>>>> boundaries. Speaking from userspace PoV, the kernel wouldn't
> >>>>>> merge mappings from different VKBuffer objects even if they're
> >>>>>> virtually and physically contiguous.
> >>>>>
> >>>>> That are two completely different things and shouldn't be handled
> >>>>> in a single component.
> >>>>
> >>>> They are different things, but they're related in a way that for
> >>>> handling the mappings (in particular merging and sparse) the GPUVA
> >>>> manager needs to know the VA allocation (or region) boundaries.
> >>>>
> >>>> I have the feeling there might be a misunderstanding. Userspace is
> >>>> in charge to actually allocate a portion of VA space and manage it.
> >>>> The GPUVA manager just needs to know about those VA space
> >>>> allocations and hence keeps track of them.
> >>>>
> >>>> The GPUVA manager is not meant to be an allocator in the sense of
> >>>> finding and providing a hole for a given request.
> >>>>
> >>>> Maybe the non-ideal choice of using drm_mm was implying something
> >>>> else.
> >>>
> >>> Uff, well long story short that doesn't even remotely match the
> >>> requirements. This way the GPUVA manager won't be usable for a whole
> >>> bunch of use cases.
> >>>
> >>> What we have are mappings which say X needs to point to Y with this
> >>> and hw dependent flags.
> >>>
> >>> The whole idea of having ranges is not going to fly. Neither with
> >>> AMD GPUs and I strongly think not with Intels XA either.
> >>
> >> A range in the sense of the GPUVA manager simply represents a VA
> >> space allocation (which in case of Nouveau is taken in userspace).
> >> Userspace allocates the portion of VA space and lets the kernel know
> >> about it. The current implementation needs that for the named
> >> reasons. So, I think there is no reason why this would work with one
> >> GPU, but not with another. It's just part of the design choice of the
> >> manager.
> >>
> >> And I'm absolutely happy to discuss the details of the manager
> >> implementation though.
> >>
> >>>
> >>>>> We should probably talk about the design of the GPUVA manager once
> >>>>> more when this should be applicable to all GPU drivers.
> >>>>
> >>>> That's what I try to figure out with this RFC, how to make it
> >>>> appicable for all GPU drivers, so I'm happy to discuss this. :-)
> >>>
> >>> Yeah, that was really good idea :) That proposal here is really far
> >>> away from the actual requirements.
> >>>
> >>
> >> And those are the ones I'm looking for. Do you mind sharing the
> >> requirements for amdgpu in particular?
> >>
> >>>>>> For sparse residency the kernel also needs to know the region
> >>>>>> boundaries to make sure that it keeps sparse mappings around.
> >>>>>
> >>>>> What?
> >>>>
> >>>> When userspace creates a new VKBuffer with the
> >>>> VK_BUFFER_CREATE_SPARSE_BINDING_BIT the kernel may need to create
> >>>> sparse mappings in order to ensure that using this buffer without
> >>>> any memory backed mappings doesn't fault the GPU.
> >>>>
> >>>> Currently, the implementation does this the following way:
> >>>>
> >>>> 1. Userspace creates a new VKBuffer and hence allocates a portion
> >>>> of the VA space for it. It calls into the kernel indicating the new
> >>>> VA space region and the fact that the region is sparse.
> >>>>
> >>>> 2. The kernel picks up the region and stores it in the GPUVA
> >>>> manager, the driver creates the corresponding sparse mappings /
> >>>> page table entries.
> >>>>
> >>>> 3. Userspace might ask the driver to create a couple of memory
> >>>> backed mappings for this particular VA region. The GPUVA manager
> >>>> stores the mapping parameters, the driver creates the corresponding
> >>>> page table entries.
> >>>>
> >>>> 4. Userspace might ask to unmap all the memory backed mappings from
> >>>> this particular VA region. The GPUVA manager removes the mapping
> >>>> parameters, the driver cleans up the corresponding page table
> >>>> entries. However, the driver also needs to re-create the sparse
> >>>> mappings, since it's a sparse buffer, hence it needs to know the
> >>>> boundaries of the region it needs to create the sparse mappings in.
> >>>
> >>> Again, this is not how things are working. First of all the kernel
> >>> absolutely should *NOT* know about those regions.
> >>>
> >>> What we have inside the kernel is the information what happens if an
> >>> address X is accessed. On AMD HW this can be:
> >>>
> >>> 1. Route to the PCIe bus because the mapped BO is stored in system
> >>> memory.
> >>> 2. Route to the internal MC because the mapped BO is stored in local
> >>> memory.
> >>> 3. Route to other GPUs in the same hive.
> >>> 4. Route to some doorbell to kick of other work.
> >>> ...
> >>> x. Ignore write, return 0 on reads (this is what is used for sparse
> >>> mappings).
> >>> x+1. Trigger a recoverable page fault. This is used for things like
> >>> SVA.
> >>> x+2. Trigger a non-recoverable page fault. This is used for things
> >>> like unmapped regions where access is illegal.
> >>>
> >>> All this is plus some hw specific caching flags.
> >>>
> >>> When Vulkan allocates a sparse VKBuffer what should happen is the
> >>> following:
> >>>
> >>> 1. The Vulkan driver somehow figures out a VA region A..B for the
> >>> buffer. This can be in userspace (libdrm_amdgpu) or kernel (drm_mm),
> >>> but essentially is currently driver specific.
> >>
> >> Right, for Nouveau we have this in userspace as well.
> >>
> >>>
> >>> 2. The kernel gets a request to map the VA range A..B as sparse,
> >>> meaning that it updates the page tables from A..B with the sparse
> >>> setting.
> >>>
> >>> 3. User space asks kernel to map a couple of memory backings at
> >>> location A+1, A+10, A+15 etc....
> >>>
> >>> 4. The VKBuffer is de-allocated, userspace asks kernel to update
> >>> region A..B to not map anything (usually triggers a non-recoverable
> >>> fault).
> >>
> >> Until here this seems to be identical to what I'm doing.
> >>
> >> It'd be interesting to know how amdgpu handles everything that
> >> potentially happens between your 3) and 4). More specifically, how
> >> are the page tables changed when memory backed mappings are mapped on
> >> a sparse range? What happens when the memory backed mappings are
> >> unmapped, but the VKBuffer isn't de-allocated, and hence sparse
> >> mappings need to be re-deployed?
> >>
> >> Let's assume the sparse VKBuffer (and hence the VA space allocation)
> >> is pretty large. In Nouveau the corresponding PTEs would have a
> >> rather huge page size to cover this. Now, if small memory backed
> >> mappings are mapped to this huge sparse buffer, in Nouveau we'd
> >> allocate a new PT with a corresponding smaller page size overlaying
> >> the sparse mappings PTEs.
> >>
> >> How would this look like in amdgpu?
> >>
> >>>
> >>> When you want to unify this between hw drivers I strongly suggest to
> >>> completely start from scratch once more.
> >>>
> >
> > I just took some time digging into amdgpu and, surprisingly, aside
> > from the gpuva_regions it seems like amdgpu basically does exactly the
> > same as I do in the GPU VA manager. As explained, those region
> > boundaries are needed for merging only and, depending on the driver,
> > might be useful for sparse mappings.
> >
> > For drivers that don't intend to merge at all and (somehow) are
> > capable of dealing with sparse regions without knowing the sparse
> > region's boundaries, it'd be easy to make those gpuva_regions optional.
>
> Yeah, but this then defeats the approach of having the same hw
> independent interface/implementation for all drivers.

I think you are running a few steps ahead here. The plan isn't to have
an independent interface, it's to provide a set of routines and
tracking that will be consistent across drivers, so that all drivers
once using them will operate in mostly the same fashion with respect
to GPU VA tracking and VA/BO lifetimes. Already in the tree we have
amdgpu and freedreno which I think end up operating slightly different
around lifetimes. I'd like to save future driver writers the effort of
dealing with those decisions and this should drive their user api
design so to enable vulkan sparse bindings.

Now if merging is a feature that makes sense to one driver maybe it
makes sense to all, however there may be reasons amdgpu gets away
without merging that other drivers might not benefit from, there might
also be a benefit to amdgpu from merging that you haven't looked at
yet, so I think we could leave merging as an optional extra driver
knob here. The userspace API should operate the same, it would just be
the gpu pagetables that would end up different sizes.

Dave.
Christian König Feb. 2, 2023, 11:53 a.m. UTC | #18
Am 01.02.23 um 09:10 schrieb Dave Airlie:
> [SNIP]
>>> For drivers that don't intend to merge at all and (somehow) are
>>> capable of dealing with sparse regions without knowing the sparse
>>> region's boundaries, it'd be easy to make those gpuva_regions optional.
>> Yeah, but this then defeats the approach of having the same hw
>> independent interface/implementation for all drivers.
> I think you are running a few steps ahead here. The plan isn't to have
> an independent interface, it's to provide a set of routines and
> tracking that will be consistent across drivers, so that all drivers
> once using them will operate in mostly the same fashion with respect
> to GPU VA tracking and VA/BO lifetimes. Already in the tree we have
> amdgpu and freedreno which I think end up operating slightly different
> around lifetimes. I'd like to save future driver writers the effort of
> dealing with those decisions and this should drive their user api
> design so to enable vulkan sparse bindings.

Ok in this case I'm pretty sure this is *NOT* a good idea.

See this means that we define the UAPI implicitly by saying to drivers 
to use a common framework for their VM implementation which then results 
in behavior A,B,C,D....

If a driver strides away from this common framework because it has 
different requirements based on how his hw work you certainly get 
different behavior again (and you have tons of hw specific requirements 
in here).

What we should do instead if we want to have some common handling among 
drivers (which I totally agree on makes sense) then we should define the 
UAPI explicitly.

For example we could have a DRM_IOCTL_GPU_VM which takes both driver 
independent as well as driver dependent information and then has the 
documented behavior:
a) VAs do (or don't) vanish automatically when the GEM handle is closed.
b) GEM BOs do (or don't) get an additional reference for each VM they 
are used in.
c) Can handle some common use cases driver independent (BO mappings, 
readonly, writeonly, sparse etc...).
d) Has a well defined behavior when the operation is executed async. 
E.g. in/out fences.
e) Can still handle hw specific stuff like (for example) trap on access 
etc....
...

Especially d is what Bas and I have pretty much already created a 
prototype for the amdgpu specific IOCTL for, but essentially this is 
completely driver independent and actually the more complex stuff. 
Compared to that common lifetime of BOs is just nice to have.

I strongly think we should concentrate on getting this right as well.

> Now if merging is a feature that makes sense to one driver maybe it
> makes sense to all, however there may be reasons amdgpu gets away
> without merging that other drivers might not benefit from, there might
> also be a benefit to amdgpu from merging that you haven't looked at
> yet, so I think we could leave merging as an optional extra driver
> knob here. The userspace API should operate the same, it would just be
> the gpu pagetables that would end up different sizes.

Yeah, agree completely. The point is that we should not have complexity 
inside the kernel which is not necessarily needed in the kernel.

So merging or not is something we have gone back and forth for amdgpu, 
one the one hand it reduces the memory footprint of the housekeeping 
overhead on the other hand it makes the handling more complex, error 
prone and use a few more CPU cycles.

For amdgpu merging is mostly beneficial when you can get rid of a whole 
page tables layer in the hierarchy, but for this you need to merge at 
least 2MiB or 1GiB together. And since that case doesn't happen that 
often we stopped doing it.

But for my understanding why you need the ranges for the merging? Isn't 
it sufficient to check that the mappings have the same type, flags, BO, 
whatever backing them?

Regards,
Christian.


>
> Dave.
Danilo Krummrich Feb. 2, 2023, 6:31 p.m. UTC | #19
On 2/2/23 12:53, Christian König wrote:
> Am 01.02.23 um 09:10 schrieb Dave Airlie:
>> [SNIP]
>>>> For drivers that don't intend to merge at all and (somehow) are
>>>> capable of dealing with sparse regions without knowing the sparse
>>>> region's boundaries, it'd be easy to make those gpuva_regions optional.
>>> Yeah, but this then defeats the approach of having the same hw
>>> independent interface/implementation for all drivers.
>> I think you are running a few steps ahead here. The plan isn't to have
>> an independent interface, it's to provide a set of routines and
>> tracking that will be consistent across drivers, so that all drivers
>> once using them will operate in mostly the same fashion with respect
>> to GPU VA tracking and VA/BO lifetimes. Already in the tree we have
>> amdgpu and freedreno which I think end up operating slightly different
>> around lifetimes. I'd like to save future driver writers the effort of
>> dealing with those decisions and this should drive their user api
>> design so to enable vulkan sparse bindings.
> 
> Ok in this case I'm pretty sure this is *NOT* a good idea.
> 
> See this means that we define the UAPI implicitly by saying to drivers 
> to use a common framework for their VM implementation which then results 
> in behavior A,B,C,D....
> 
> If a driver strides away from this common framework because it has 
> different requirements based on how his hw work you certainly get 
> different behavior again (and you have tons of hw specific requirements 
> in here).
> 
> What we should do instead if we want to have some common handling among 
> drivers (which I totally agree on makes sense) then we should define the 
> UAPI explicitly.

By asking that I don't want to say I'm against this idea, I'm just 
wondering how it becomes easier to deal with "tons of hw specific 
requirements" by generalizing things even more?

What makes us think that we do a better job in considering all hw 
specific requirements with a unified UAPI than with a more lightweight 
generic component for tracking VA mappings?

Also, wouldn't we need something like the GPUVA manager as part of a 
unified UAPI?

> 
> For example we could have a DRM_IOCTL_GPU_VM which takes both driver 
> independent as well as driver dependent information and then has the 
> documented behavior:
> a) VAs do (or don't) vanish automatically when the GEM handle is closed.
> b) GEM BOs do (or don't) get an additional reference for each VM they 
> are used in.
> c) Can handle some common use cases driver independent (BO mappings, 
> readonly, writeonly, sparse etc...).
> d) Has a well defined behavior when the operation is executed async. 
> E.g. in/out fences.
> e) Can still handle hw specific stuff like (for example) trap on access 
> etc....
> ...
> 
> Especially d is what Bas and I have pretty much already created a 
> prototype for the amdgpu specific IOCTL for, but essentially this is 
> completely driver independent and actually the more complex stuff. 
> Compared to that common lifetime of BOs is just nice to have.
> 
> I strongly think we should concentrate on getting this right as well.
> 
>> Now if merging is a feature that makes sense to one driver maybe it
>> makes sense to all, however there may be reasons amdgpu gets away
>> without merging that other drivers might not benefit from, there might
>> also be a benefit to amdgpu from merging that you haven't looked at
>> yet, so I think we could leave merging as an optional extra driver
>> knob here. The userspace API should operate the same, it would just be
>> the gpu pagetables that would end up different sizes.
> 
> Yeah, agree completely. The point is that we should not have complexity 
> inside the kernel which is not necessarily needed in the kernel.
> 
> So merging or not is something we have gone back and forth for amdgpu, 
> one the one hand it reduces the memory footprint of the housekeeping 
> overhead on the other hand it makes the handling more complex, error 
> prone and use a few more CPU cycles.
> 
> For amdgpu merging is mostly beneficial when you can get rid of a whole 
> page tables layer in the hierarchy, but for this you need to merge at 
> least 2MiB or 1GiB together. And since that case doesn't happen that 
> often we stopped doing it.
> 
> But for my understanding why you need the ranges for the merging? Isn't 
> it sufficient to check that the mappings have the same type, flags, BO, 
> whatever backing them?

Not entirely. Let's assume userspace creates two virtually contiguous 
buffers (VKBuffer) A and B. Userspace could bind a BO with BO offset 0 
to A (binding 1) and afterwards bind the same BO with BO offset 
length(A) to B (binding 2), maybe unlikely but AFAIK not illegal.

If we don't know about the bounds of A and B in the kernel, we detect 
that both bindings are virtually and physically contiguous and we merge 
them.

In the best case this was simply useless, because we'll need to split 
them anyway later on when A or B is destroyed, but in the worst case we 
could fault the GPU, e.g. if merging leads to a change of the page 
tables that are backing binding 1, but buffer A is already in use by 
userspace.

In Nouveau, I think we could also get rid of regions and do something 
driver specific for the handling of the dual page tables, which I want 
to use for sparse regions *and* just don't merge (at least for now). But 
exactly for the sake of not limiting drivers in their HW specifics I 
thought it'd be great if merging is supported in case it makes sense for 
a specific HW, especially given the fact that memory sizes are increasing.

> 
> Regards,
> Christian.
> 
> 
>>
>> Dave.
>
Christian König Feb. 6, 2023, 9:48 a.m. UTC | #20
Am 02.02.23 um 19:31 schrieb Danilo Krummrich:
> On 2/2/23 12:53, Christian König wrote:
>> Am 01.02.23 um 09:10 schrieb Dave Airlie:
>>> [SNIP]
>>>>> For drivers that don't intend to merge at all and (somehow) are
>>>>> capable of dealing with sparse regions without knowing the sparse
>>>>> region's boundaries, it'd be easy to make those gpuva_regions 
>>>>> optional.
>>>> Yeah, but this then defeats the approach of having the same hw
>>>> independent interface/implementation for all drivers.
>>> I think you are running a few steps ahead here. The plan isn't to have
>>> an independent interface, it's to provide a set of routines and
>>> tracking that will be consistent across drivers, so that all drivers
>>> once using them will operate in mostly the same fashion with respect
>>> to GPU VA tracking and VA/BO lifetimes. Already in the tree we have
>>> amdgpu and freedreno which I think end up operating slightly different
>>> around lifetimes. I'd like to save future driver writers the effort of
>>> dealing with those decisions and this should drive their user api
>>> design so to enable vulkan sparse bindings.
>>
>> Ok in this case I'm pretty sure this is *NOT* a good idea.
>>
>> See this means that we define the UAPI implicitly by saying to 
>> drivers to use a common framework for their VM implementation which 
>> then results in behavior A,B,C,D....
>>
>> If a driver strides away from this common framework because it has 
>> different requirements based on how his hw work you certainly get 
>> different behavior again (and you have tons of hw specific 
>> requirements in here).
>>
>> What we should do instead if we want to have some common handling 
>> among drivers (which I totally agree on makes sense) then we should 
>> define the UAPI explicitly.
>
> By asking that I don't want to say I'm against this idea, I'm just 
> wondering how it becomes easier to deal with "tons of hw specific 
> requirements" by generalizing things even more?

I'm already maintaining two different GPU VM solutions in the GPU 
drivers in the kernel, radeon and amdgpu. The hw they driver is 
identical, just the UAPI is different. And only because of the different 
UAPI they can't have the same VM backend implementation.

The hw stuff is completely abstract able. That's just stuff you need to 
consider when defining the structures you pass around.

But a messed up UAPI is sometimes impossible to fix because of backward 
compatibility.

We learned that the hard way with radeon and mostly fixed it by coming 
up with a completely new implementation for amdgpu.

> What makes us think that we do a better job in considering all hw 
> specific requirements with a unified UAPI than with a more lightweight 
> generic component for tracking VA mappings?

Because this defines the UAPI implicitly and that's seldom a good idea.

As I said before tracking is the easy part of the job. Defining this 
generic component helps a little bit writing new drivers, but it leaves 
way to much room for speculations on the UAPI.

> Also, wouldn't we need something like the GPUVA manager as part of a 
> unified UAPI?

Not necessarily. We can write components to help drivers implement the 
UAPI, but this isn't mandatory.

>
>>
>> For example we could have a DRM_IOCTL_GPU_VM which takes both driver 
>> independent as well as driver dependent information and then has the 
>> documented behavior:
>> a) VAs do (or don't) vanish automatically when the GEM handle is closed.
>> b) GEM BOs do (or don't) get an additional reference for each VM they 
>> are used in.
>> c) Can handle some common use cases driver independent (BO mappings, 
>> readonly, writeonly, sparse etc...).
>> d) Has a well defined behavior when the operation is executed async. 
>> E.g. in/out fences.
>> e) Can still handle hw specific stuff like (for example) trap on 
>> access etc....
>> ...
>>
>> Especially d is what Bas and I have pretty much already created a 
>> prototype for the amdgpu specific IOCTL for, but essentially this is 
>> completely driver independent and actually the more complex stuff. 
>> Compared to that common lifetime of BOs is just nice to have.
>>
>> I strongly think we should concentrate on getting this right as well.
>>
>>> Now if merging is a feature that makes sense to one driver maybe it
>>> makes sense to all, however there may be reasons amdgpu gets away
>>> without merging that other drivers might not benefit from, there might
>>> also be a benefit to amdgpu from merging that you haven't looked at
>>> yet, so I think we could leave merging as an optional extra driver
>>> knob here. The userspace API should operate the same, it would just be
>>> the gpu pagetables that would end up different sizes.
>>
>> Yeah, agree completely. The point is that we should not have 
>> complexity inside the kernel which is not necessarily needed in the 
>> kernel.
>>
>> So merging or not is something we have gone back and forth for 
>> amdgpu, one the one hand it reduces the memory footprint of the 
>> housekeeping overhead on the other hand it makes the handling more 
>> complex, error prone and use a few more CPU cycles.
>>
>> For amdgpu merging is mostly beneficial when you can get rid of a 
>> whole page tables layer in the hierarchy, but for this you need to 
>> merge at least 2MiB or 1GiB together. And since that case doesn't 
>> happen that often we stopped doing it.
>>
>> But for my understanding why you need the ranges for the merging? 
>> Isn't it sufficient to check that the mappings have the same type, 
>> flags, BO, whatever backing them?
>
> Not entirely. Let's assume userspace creates two virtually contiguous 
> buffers (VKBuffer) A and B. Userspace could bind a BO with BO offset 0 
> to A (binding 1) and afterwards bind the same BO with BO offset 
> length(A) to B (binding 2), maybe unlikely but AFAIK not illegal.
>
> If we don't know about the bounds of A and B in the kernel, we detect 
> that both bindings are virtually and physically contiguous and we 
> merge them.

Well as far as I can see this is actually legal and desirable.

>
> In the best case this was simply useless, because we'll need to split 
> them anyway later on when A or B is destroyed, but in the worst case 
> we could fault the GPU, e.g. if merging leads to a change of the page 
> tables that are backing binding 1, but buffer A is already in use by 
> userspace.

WOW wait a second, regions absolutely don't help you with that anyway.

You need to keep track which mappings are used or otherwise any 
modification could lead to problems.

In other words when the GPU already uses A you *must* have a fence on 
the page tables backing A to prevent their destruction.

>
> In Nouveau, I think we could also get rid of regions and do something 
> driver specific for the handling of the dual page tables, which I want 
> to use for sparse regions *and* just don't merge (at least for now). 
> But exactly for the sake of not limiting drivers in their HW specifics 
> I thought it'd be great if merging is supported in case it makes sense 
> for a specific HW, especially given the fact that memory sizes are 
> increasing.

What do you mean with that?

If you want your page tables to be modifiable while the GPU is using 
them (which is basically a standard requirement from sparse bindings in 
Vulkan) you need double housekeeping anyway.

Those regions strongly sound like you are pushing stuff which should be 
handled in userspace inside the kernel.

Regards,
Christian.

>
>
>>
>> Regards,
>> Christian.
>>
>>
>>>
>>> Dave.
>>
>
Danilo Krummrich Feb. 6, 2023, 1:27 p.m. UTC | #21
On 2/6/23 10:48, Christian König wrote:
> Am 02.02.23 um 19:31 schrieb Danilo Krummrich:
>> On 2/2/23 12:53, Christian König wrote:
>>> Am 01.02.23 um 09:10 schrieb Dave Airlie:
>>>> [SNIP]
>>>>>> For drivers that don't intend to merge at all and (somehow) are
>>>>>> capable of dealing with sparse regions without knowing the sparse
>>>>>> region's boundaries, it'd be easy to make those gpuva_regions 
>>>>>> optional.
>>>>> Yeah, but this then defeats the approach of having the same hw
>>>>> independent interface/implementation for all drivers.
>>>> I think you are running a few steps ahead here. The plan isn't to have
>>>> an independent interface, it's to provide a set of routines and
>>>> tracking that will be consistent across drivers, so that all drivers
>>>> once using them will operate in mostly the same fashion with respect
>>>> to GPU VA tracking and VA/BO lifetimes. Already in the tree we have
>>>> amdgpu and freedreno which I think end up operating slightly different
>>>> around lifetimes. I'd like to save future driver writers the effort of
>>>> dealing with those decisions and this should drive their user api
>>>> design so to enable vulkan sparse bindings.
>>>
>>> Ok in this case I'm pretty sure this is *NOT* a good idea.
>>>
>>> See this means that we define the UAPI implicitly by saying to 
>>> drivers to use a common framework for their VM implementation which 
>>> then results in behavior A,B,C,D....
>>>
>>> If a driver strides away from this common framework because it has 
>>> different requirements based on how his hw work you certainly get 
>>> different behavior again (and you have tons of hw specific 
>>> requirements in here).
>>>
>>> What we should do instead if we want to have some common handling 
>>> among drivers (which I totally agree on makes sense) then we should 
>>> define the UAPI explicitly.
>>
>> By asking that I don't want to say I'm against this idea, I'm just 
>> wondering how it becomes easier to deal with "tons of hw specific 
>> requirements" by generalizing things even more?
> 
> I'm already maintaining two different GPU VM solutions in the GPU 
> drivers in the kernel, radeon and amdgpu. The hw they driver is 
> identical, just the UAPI is different. And only because of the different 
> UAPI they can't have the same VM backend implementation.
> 
> The hw stuff is completely abstract able. That's just stuff you need to 
> consider when defining the structures you pass around.

Wouldn't we need to have strict limitations on that, such that HW 
specific structures / fields are not allowed to break the semantics of 
the UAPI? Because otherwise we wouldn't be able to attach generalized 
components to the unified UAPI which ultimately would be the whole 
purpose. So, if this consideration is correct, I'd still see a risk of 
drivers striding away from it because of their requirements. Again, I 
think a unified UAPI is a good idea, but it sounds more difficult to me 
than this last paragraph implies.

> 
> But a messed up UAPI is sometimes impossible to fix because of backward 
> compatibility.
> 
> We learned that the hard way with radeon and mostly fixed it by coming 
> up with a completely new implementation for amdgpu.
> 
>> What makes us think that we do a better job in considering all hw 
>> specific requirements with a unified UAPI than with a more lightweight 
>> generic component for tracking VA mappings?
> 
> Because this defines the UAPI implicitly and that's seldom a good idea.
> 
> As I said before tracking is the easy part of the job. Defining this 
> generic component helps a little bit writing new drivers, but it leaves 
> way to much room for speculations on the UAPI.
> 

Trying to move forward, I agree that a unified UAPI would improve the 
situation regarding the problems you mentioned and the examples you have 
given.

However, not having the GPUVA manager wouldn't give us a unified UAPI 
either. And as long as it delivers a generic component to solve a 
problem while not making the overall situation worse or preventing us 
from reaching this desirable goal of having a unified UAPI I tend to 
think it's fine to have such a component.

>> Also, wouldn't we need something like the GPUVA manager as part of a 
>> unified UAPI?
> 
> Not necessarily. We can write components to help drivers implement the 
> UAPI, but this isn't mandatory.

Well, yes, not necessarily. However, as mentioned above, wouldn't it be 
a major goal of a unified UAPI to be able to attach generic components 
to it?

> 
>>
>>>
>>> For example we could have a DRM_IOCTL_GPU_VM which takes both driver 
>>> independent as well as driver dependent information and then has the 
>>> documented behavior:
>>> a) VAs do (or don't) vanish automatically when the GEM handle is closed.
>>> b) GEM BOs do (or don't) get an additional reference for each VM they 
>>> are used in.
>>> c) Can handle some common use cases driver independent (BO mappings, 
>>> readonly, writeonly, sparse etc...).
>>> d) Has a well defined behavior when the operation is executed async. 
>>> E.g. in/out fences.
>>> e) Can still handle hw specific stuff like (for example) trap on 
>>> access etc....
>>> ...
>>>
>>> Especially d is what Bas and I have pretty much already created a 
>>> prototype for the amdgpu specific IOCTL for, but essentially this is 
>>> completely driver independent and actually the more complex stuff. 
>>> Compared to that common lifetime of BOs is just nice to have.
>>>
>>> I strongly think we should concentrate on getting this right as well.
>>>
>>>> Now if merging is a feature that makes sense to one driver maybe it
>>>> makes sense to all, however there may be reasons amdgpu gets away
>>>> without merging that other drivers might not benefit from, there might
>>>> also be a benefit to amdgpu from merging that you haven't looked at
>>>> yet, so I think we could leave merging as an optional extra driver
>>>> knob here. The userspace API should operate the same, it would just be
>>>> the gpu pagetables that would end up different sizes.
>>>
>>> Yeah, agree completely. The point is that we should not have 
>>> complexity inside the kernel which is not necessarily needed in the 
>>> kernel.
>>>
>>> So merging or not is something we have gone back and forth for 
>>> amdgpu, one the one hand it reduces the memory footprint of the 
>>> housekeeping overhead on the other hand it makes the handling more 
>>> complex, error prone and use a few more CPU cycles.
>>>
>>> For amdgpu merging is mostly beneficial when you can get rid of a 
>>> whole page tables layer in the hierarchy, but for this you need to 
>>> merge at least 2MiB or 1GiB together. And since that case doesn't 
>>> happen that often we stopped doing it.
>>>
>>> But for my understanding why you need the ranges for the merging? 
>>> Isn't it sufficient to check that the mappings have the same type, 
>>> flags, BO, whatever backing them?
>>
>> Not entirely. Let's assume userspace creates two virtually contiguous 
>> buffers (VKBuffer) A and B. Userspace could bind a BO with BO offset 0 
>> to A (binding 1) and afterwards bind the same BO with BO offset 
>> length(A) to B (binding 2), maybe unlikely but AFAIK not illegal.
>>
>> If we don't know about the bounds of A and B in the kernel, we detect 
>> that both bindings are virtually and physically contiguous and we 
>> merge them.
> 
> Well as far as I can see this is actually legal and desirable.

Legal, not sure, may depend on the semantics of the UAPI. (More on that 
below your next paragraph.)

Desirable, I don't think so. Since those mappings are associated with 
different VKBuffers they get split up later on anyway, hence why bother 
merging?

> 
>>
>> In the best case this was simply useless, because we'll need to split 
>> them anyway later on when A or B is destroyed, but in the worst case 
>> we could fault the GPU, e.g. if merging leads to a change of the page 
>> tables that are backing binding 1, but buffer A is already in use by 
>> userspace.
> 
> WOW wait a second, regions absolutely don't help you with that anyway.
> 
> You need to keep track which mappings are used or otherwise any 
> modification could lead to problems.
> 
> In other words when the GPU already uses A you *must* have a fence on 
> the page tables backing A to prevent their destruction.
> 

As mentioned above, I'm not entirely sure about that and it might just 
depend on the semantics of the UAPI.

My understanding is that userspace is fully responsible on the parts of 
the GPU VA space it owns. This means that userspace needs to take care 
to *not* ask the kernel to modify mappings that are in use currently. 
Hence, the kernel is in charge to not modify mappings it set up on 
behalf of userspace unless userspace explicitly asks the kernel to do so.

If those are valid preconditions, and based on them we want to support 
merging, the kernel must know about the VA space allocations (or 
VKBuffers in userspace terminology) to make sure it never merges across 
their boundaries, which might not make much sense anyway.

>>
>> In Nouveau, I think we could also get rid of regions and do something 
>> driver specific for the handling of the dual page tables, which I want 
>> to use for sparse regions *and* just don't merge (at least for now). 
>> But exactly for the sake of not limiting drivers in their HW specifics 
>> I thought it'd be great if merging is supported in case it makes sense 
>> for a specific HW, especially given the fact that memory sizes are 
>> increasing.
> 
> What do you mean with that?
> 
> If you want your page tables to be modifiable while the GPU is using 
> them (which is basically a standard requirement from sparse bindings in 
> Vulkan) you need double housekeeping anyway.
> 
> Those regions strongly sound like you are pushing stuff which should be 
> handled in userspace inside the kernel.

1. userspace allocates a new VKBuffer with the sparse bit set (0x0 - 
0x800000)

2. kernel creates a new region structure with the range 0x800000 and 
creates a new PT (A) with 4 PTEs with the sparse flag set (page shift is 21)

3. userspace requests a memory backed mapping at 0x200000 with size 0x2000

4. kernel creates a new mapping structure with base address 0x200000 and 
range 0x2000 and creates a new PT (B) with 2 PTEs (page shift is 12) 
"overlaying" PT A

5. userspace crashes unexpectedly for some reason

6. kernel needs to clean things up, iterates the list of mappings and 
unmaps them (PT B is freed); kernel iterates all regions and removes 
them (PT A is freed)

> 
> Regards,
> Christian.
> 
>>
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>
>>>>
>>>> Dave.
>>>
>>
>
Christian König Feb. 6, 2023, 4:14 p.m. UTC | #22
Concentrating this discussion on a very big misunderstanding first.

Am 06.02.23 um 14:27 schrieb Danilo Krummrich:
> [SNIP]
> My understanding is that userspace is fully responsible on the parts 
> of the GPU VA space it owns. This means that userspace needs to take 
> care to *not* ask the kernel to modify mappings that are in use currently.

This is a completely wrong assumption! Take a look at what games like 
Forza Horizzon are doing.

Basically that game allocates a very big sparse area and fills it with 
pages from BOs while shaders are accessing it. And yes, as far as I know 
this is completely valid behavior.

So you need to be able to handle this case anyway and the approach with 
the regions won't help you at all preventing that.

Regards,
Christian.
Danilo Krummrich Feb. 6, 2023, 6:20 p.m. UTC | #23
On 2/6/23 17:14, Christian König wrote:
> Concentrating this discussion on a very big misunderstanding first.
> 
> Am 06.02.23 um 14:27 schrieb Danilo Krummrich:
>> [SNIP]
>> My understanding is that userspace is fully responsible on the parts 
>> of the GPU VA space it owns. This means that userspace needs to take 
>> care to *not* ask the kernel to modify mappings that are in use 
>> currently.
> 
> This is a completely wrong assumption! Take a look at what games like 
> Forza Horizzon are doing.
> 
> Basically that game allocates a very big sparse area and fills it with 
> pages from BOs while shaders are accessing it. And yes, as far as I know 
> this is completely valid behavior.

I also think this is valid behavior. That's not the problem I'm trying 
to describe. In this case userspace modifies the VA space 
*intentionally* while shaders are accessing it, because it knows that 
the shaders can deal with reading 0s.

Just to have it all in place, the example I gave was:
  - two virtually contiguous buffers A and B
  - binding 1 mapped to A with BO offset 0
  - binding 2 mapped to B with BO offset length(A)

What I did not mention both A and B aren't sparse buffers in this 
example, although it probably doesn't matter too much.

Since the conditions to do so are given, we merge binding 1 and binding 
2 right at the time when binding 2 is requested. To do so a driver might 
unmap binding 1 for a very short period of time (e.g. to (re-)map the 
freshly merged binding with a different page size if possible).

 From userspace perspective buffer A is ready to use before applying 
binding 2 to buffer B, hence it would be illegal to touch binding 1 
again when userspace asks the kernel to map binding 2 to buffer B.

Besides that I think there is no point in merging between buffers anyway 
because we'd end up splitting such a merged mapping anyway later on when 
one of the two buffers is destroyed.

Also, I think the same applies to sparse buffers as well, a mapping 
within A isn't expected to be re-mapped just because something is mapped 
to B.

However, in this context I start wondering if re-mapping in the context 
of merge and split is allowed at all, even within the same sparse buffer 
(and even with a separate page table for sparse mappings as described in 
my last mail; shaders would never fault).

> 
> So you need to be able to handle this case anyway and the approach with 
> the regions won't help you at all preventing that.
> 
> Regards,
> Christian.
>
Christian König Feb. 7, 2023, 9:35 a.m. UTC | #24
Am 06.02.23 um 19:20 schrieb Danilo Krummrich:
> On 2/6/23 17:14, Christian König wrote:
>> Concentrating this discussion on a very big misunderstanding first.
>>
>> Am 06.02.23 um 14:27 schrieb Danilo Krummrich:
>>> [SNIP]
>>> My understanding is that userspace is fully responsible on the parts 
>>> of the GPU VA space it owns. This means that userspace needs to take 
>>> care to *not* ask the kernel to modify mappings that are in use 
>>> currently.
>>
>> This is a completely wrong assumption! Take a look at what games like 
>> Forza Horizzon are doing.
>>
>> Basically that game allocates a very big sparse area and fills it 
>> with pages from BOs while shaders are accessing it. And yes, as far 
>> as I know this is completely valid behavior.
>
> I also think this is valid behavior. That's not the problem I'm trying 
> to describe. In this case userspace modifies the VA space 
> *intentionally* while shaders are accessing it, because it knows that 
> the shaders can deal with reading 0s.

No, it's perfectly valid for userspace to modify the VA space even if 
shaders are not supposed to deal with reading 0s.

>
>
> Just to have it all in place, the example I gave was:
>  - two virtually contiguous buffers A and B
>  - binding 1 mapped to A with BO offset 0
>  - binding 2 mapped to B with BO offset length(A)
>
> What I did not mention both A and B aren't sparse buffers in this 
> example, although it probably doesn't matter too much.
>
> Since the conditions to do so are given, we merge binding 1 and 
> binding 2 right at the time when binding 2 is requested. To do so a 
> driver might unmap binding 1 for a very short period of time (e.g. to 
> (re-)map the freshly merged binding with a different page size if 
> possible).

Nope, that's not correct handling.

>
> From userspace perspective buffer A is ready to use before applying 
> binding 2 to buffer B, hence it would be illegal to touch binding 1 
> again when userspace asks the kernel to map binding 2 to buffer B.
>
> Besides that I think there is no point in merging between buffers 
> anyway because we'd end up splitting such a merged mapping anyway 
> later on when one of the two buffers is destroyed.
>
> Also, I think the same applies to sparse buffers as well, a mapping 
> within A isn't expected to be re-mapped just because something is 
> mapped to B.
>
> However, in this context I start wondering if re-mapping in the 
> context of merge and split is allowed at all, even within the same 
> sparse buffer (and even with a separate page table for sparse mappings 
> as described in my last mail; shaders would never fault).

See, your assumption is that userspace/applications don't modify the VA 
space intentionally while the GPU is accessing it is just bluntly 
speaking incorrect.

When you have a VA address which is mapped to buffer A and accessed by 
some GPU shaders it is perfectly valid for the application to say "map 
it again to the same buffer A".

It is also perfectly valid for an application to re-map this region to a 
different buffer B, it's just not defined when the access then transits 
from A to B. (AFAIK this is currently worked on in a new specification).

So when your page table updates result in the shader to intermediately 
get 0s in return, because you change the underlying mapping you simply 
have some implementation bug in Nouveau.

I don't know how Nvidia hw handles this, and yes it's quite complicated 
on AMD hw as well because our TLBs are not really made for this use 
case, but I'm 100% sure that this is possible since it is still part of 
some of the specifications (mostly Vulkan I think).

To sum it up as far as I can see by giving the regions to the kernel is 
not something you would want for Nouveau either.

Regards,
Christian.


>
>>
>> So you need to be able to handle this case anyway and the approach 
>> with the regions won't help you at all preventing that.
>>
>> Regards,
>> Christian.
>>
>
Danilo Krummrich Feb. 7, 2023, 10:50 a.m. UTC | #25
On 2/7/23 10:35, Christian König wrote:
> Am 06.02.23 um 19:20 schrieb Danilo Krummrich:
>> On 2/6/23 17:14, Christian König wrote:
>>> Concentrating this discussion on a very big misunderstanding first.
>>>
>>> Am 06.02.23 um 14:27 schrieb Danilo Krummrich:
>>>> [SNIP]
>>>> My understanding is that userspace is fully responsible on the parts 
>>>> of the GPU VA space it owns. This means that userspace needs to take 
>>>> care to *not* ask the kernel to modify mappings that are in use 
>>>> currently.
>>>
>>> This is a completely wrong assumption! Take a look at what games like 
>>> Forza Horizzon are doing.
>>>
>>> Basically that game allocates a very big sparse area and fills it 
>>> with pages from BOs while shaders are accessing it. And yes, as far 
>>> as I know this is completely valid behavior.
>>
>> I also think this is valid behavior. That's not the problem I'm trying 
>> to describe. In this case userspace modifies the VA space 
>> *intentionally* while shaders are accessing it, because it knows that 
>> the shaders can deal with reading 0s.
> 
> No, it's perfectly valid for userspace to modify the VA space even if 
> shaders are not supposed to deal with reading 0s.
> 
>>
>>
>> Just to have it all in place, the example I gave was:
>>  - two virtually contiguous buffers A and B
>>  - binding 1 mapped to A with BO offset 0
>>  - binding 2 mapped to B with BO offset length(A)
>>
>> What I did not mention both A and B aren't sparse buffers in this 
>> example, although it probably doesn't matter too much.
>>
>> Since the conditions to do so are given, we merge binding 1 and 
>> binding 2 right at the time when binding 2 is requested. To do so a 
>> driver might unmap binding 1 for a very short period of time (e.g. to 
>> (re-)map the freshly merged binding with a different page size if 
>> possible).
> 
> Nope, that's not correct handling.

I agree, and that's exactly what I'm trying to say. However, I start 
noticing that this is not correct if it happens within the same buffer 
as well.

> 
>>
>> From userspace perspective buffer A is ready to use before applying 
>> binding 2 to buffer B, hence it would be illegal to touch binding 1 
>> again when userspace asks the kernel to map binding 2 to buffer B.
>>
>> Besides that I think there is no point in merging between buffers 
>> anyway because we'd end up splitting such a merged mapping anyway 
>> later on when one of the two buffers is destroyed.
>>
>> Also, I think the same applies to sparse buffers as well, a mapping 
>> within A isn't expected to be re-mapped just because something is 
>> mapped to B.
>>
>> However, in this context I start wondering if re-mapping in the 
>> context of merge and split is allowed at all, even within the same 
>> sparse buffer (and even with a separate page table for sparse mappings 
>> as described in my last mail; shaders would never fault).
> 
> See, your assumption is that userspace/applications don't modify the VA 
> space intentionally while the GPU is accessing it is just bluntly 
> speaking incorrect.
> 

I don't assume that. The opposite is the case. My assumption is that 
it's always OK for userspace to intentionally modify the VA space.

However, I also assumed that if userspace asks for e.g. a new mapping 
within a certain buffer it is OK for the kernel to apply further changes 
(e.g. re-organize PTs to split or merge) to the VA space of which 
userspace isn't aware of. At least as long as they happen within the 
bounds of this particular buffer, but not for other buffers.

I think the reasoning I had in mind was that I thought if userspace asks 
for any modification of a given portion of the VA space (that is a 
VKBuffer) userspace must assume that until this modification (e.g. 
re-organization of PTs) is complete reading 0s intermediately may 
happen. This seems to be clearly wrong.

> When you have a VA address which is mapped to buffer A and accessed by 
> some GPU shaders it is perfectly valid for the application to say "map 
> it again to the same buffer A".
> 
> It is also perfectly valid for an application to re-map this region to a 
> different buffer B, it's just not defined when the access then transits 
> from A to B. (AFAIK this is currently worked on in a new specification).
> 
> So when your page table updates result in the shader to intermediately 
> get 0s in return, because you change the underlying mapping you simply 
> have some implementation bug in Nouveau.

Luckily that's not the case (anymore).

> 
> I don't know how Nvidia hw handles this, and yes it's quite complicated 
> on AMD hw as well because our TLBs are not really made for this use 
> case, but I'm 100% sure that this is possible since it is still part of 
> some of the specifications (mostly Vulkan I think).
> 
> To sum it up as far as I can see by giving the regions to the kernel is 
> not something you would want for Nouveau either.

If, as it turns out, it's also not allowed to do what I described above 
within the same VKBuffer, I agree the bounds aren't needed for merging.

However, I still don't see why we would want to merge over buffer 
boundaries, because ultimately we'll end up splitting such a merged 
mapping later on anyway once one of the buffers is destroyed.

Also, as explained in one of the previous mails in nouveau we can have 
separate PTs for sparse mappings with large page sizes and separate PTs 
for memory backed mappings with smaller page sizes overlaying them. 
Hence, I need to track a single sparse mapping per buffer spanning the 
whole buffer (which I do with a region) and the actual memory backed 
mappings within the same range.

Now, this might or might not be unique for Nvidia hardware. If nouveau 
would be the only potential user, plus we don't care about potentially 
merging mappings over buffer boundaries and hence producing foreseeable 
splits of those merged mappings, we could get rid of regions entirely.

> 
> Regards,
> Christian.
> 
> 
>>
>>>
>>> So you need to be able to handle this case anyway and the approach 
>>> with the regions won't help you at all preventing that.
>>>
>>> Regards,
>>> Christian.
>>>
>>
>
Christian König Feb. 10, 2023, 11:50 a.m. UTC | #26
Am 07.02.23 um 11:50 schrieb Danilo Krummrich:
> On 2/7/23 10:35, Christian König wrote:
[SNIP]
>>>
>>> Just to have it all in place, the example I gave was:
>>>  - two virtually contiguous buffers A and B
>>>  - binding 1 mapped to A with BO offset 0
>>>  - binding 2 mapped to B with BO offset length(A)
>>>
>>> What I did not mention both A and B aren't sparse buffers in this 
>>> example, although it probably doesn't matter too much.
>>>
>>> Since the conditions to do so are given, we merge binding 1 and 
>>> binding 2 right at the time when binding 2 is requested. To do so a 
>>> driver might unmap binding 1 for a very short period of time (e.g. 
>>> to (re-)map the freshly merged binding with a different page size if 
>>> possible).
>>
>> Nope, that's not correct handling.
>
> I agree, and that's exactly what I'm trying to say. However, I start 
> noticing that this is not correct if it happens within the same buffer 
> as well.

Yes, exactly that's my point.

>
>>
>>>
>>> From userspace perspective buffer A is ready to use before applying 
>>> binding 2 to buffer B, hence it would be illegal to touch binding 1 
>>> again when userspace asks the kernel to map binding 2 to buffer B.
>>>
>>> Besides that I think there is no point in merging between buffers 
>>> anyway because we'd end up splitting such a merged mapping anyway 
>>> later on when one of the two buffers is destroyed.
>>>
>>> Also, I think the same applies to sparse buffers as well, a mapping 
>>> within A isn't expected to be re-mapped just because something is 
>>> mapped to B.
>>>
>>> However, in this context I start wondering if re-mapping in the 
>>> context of merge and split is allowed at all, even within the same 
>>> sparse buffer (and even with a separate page table for sparse 
>>> mappings as described in my last mail; shaders would never fault).
>>
>> See, your assumption is that userspace/applications don't modify the 
>> VA space intentionally while the GPU is accessing it is just bluntly 
>> speaking incorrect.
>>
>
> I don't assume that. The opposite is the case. My assumption is that 
> it's always OK for userspace to intentionally modify the VA space.
>
> However, I also assumed that if userspace asks for e.g. a new mapping 
> within a certain buffer it is OK for the kernel to apply further 
> changes (e.g. re-organize PTs to split or merge) to the VA space of 
> which userspace isn't aware of. At least as long as they happen within 
> the bounds of this particular buffer, but not for other buffers.

Well when this somehow affects shaders which accesses other parts of the 
buffer at the same time then that won't work.

> I think the reasoning I had in mind was that I thought if userspace 
> asks for any modification of a given portion of the VA space (that is 
> a VKBuffer) userspace must assume that until this modification (e.g. 
> re-organization of PTs) is complete reading 0s intermediately may 
> happen. This seems to be clearly wrong.
>
>> When you have a VA address which is mapped to buffer A and accessed 
>> by some GPU shaders it is perfectly valid for the application to say 
>> "map it again to the same buffer A".
>>
>> It is also perfectly valid for an application to re-map this region 
>> to a different buffer B, it's just not defined when the access then 
>> transits from A to B. (AFAIK this is currently worked on in a new 
>> specification).
>>
>> So when your page table updates result in the shader to 
>> intermediately get 0s in return, because you change the underlying 
>> mapping you simply have some implementation bug in Nouveau.
>
> Luckily that's not the case (anymore).
>
>>
>> I don't know how Nvidia hw handles this, and yes it's quite 
>> complicated on AMD hw as well because our TLBs are not really made 
>> for this use case, but I'm 100% sure that this is possible since it 
>> is still part of some of the specifications (mostly Vulkan I think).
>>
>> To sum it up as far as I can see by giving the regions to the kernel 
>> is not something you would want for Nouveau either.
>
> If, as it turns out, it's also not allowed to do what I described 
> above within the same VKBuffer, I agree the bounds aren't needed for 
> merging.
>
> However, I still don't see why we would want to merge over buffer 
> boundaries, because ultimately we'll end up splitting such a merged 
> mapping later on anyway once one of the buffers is destroyed.

Well the key point is all approaches have some pros and cons.

If we merge and decide to only do that inside certain boundaries then 
those boundaries needs to be provided and checked against. This burns 
quite some CPU cycles

If we just merge what we can we might have extra page table updates 
which cost time and could result in undesired side effects.

If we don't merge at all we have additional housekeeping for the 
mappings and maybe hw restrictions.

> Also, as explained in one of the previous mails in nouveau we can have 
> separate PTs for sparse mappings with large page sizes and separate 
> PTs for memory backed mappings with smaller page sizes overlaying 
> them. Hence, I need to track a single sparse mapping per buffer 
> spanning the whole buffer (which I do with a region) and the actual 
> memory backed mappings within the same range.
>
> Now, this might or might not be unique for Nvidia hardware. If nouveau 
> would be the only potential user, plus we don't care about potentially 
> merging mappings over buffer boundaries and hence producing 
> foreseeable splits of those merged mappings, we could get rid of 
> regions entirely.

This sounds similar to what AMD hw used to have up until gfx8 (I think), 
basically sparse resources where defined through a separate mechanism to 
the address resolution of the page tables. I won't rule out that other 
hardware has similar approaches.

On the other hand when you have separate page tables for address 
translation and sparse handling then why not instantiate two separate VM 
manager instances for them?

Regards,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>
>>>
>>>>
>>>> So you need to be able to handle this case anyway and the approach 
>>>> with the regions won't help you at all preventing that.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>
>>
>
Danilo Krummrich Feb. 10, 2023, 12:47 p.m. UTC | #27
On 2/10/23 12:50, Christian König wrote:
> 
> 
> Am 07.02.23 um 11:50 schrieb Danilo Krummrich:
>> On 2/7/23 10:35, Christian König wrote:

<snip>

>> However, I still don't see why we would want to merge over buffer 
>> boundaries, because ultimately we'll end up splitting such a merged 
>> mapping later on anyway once one of the buffers is destroyed.
> 
> Well the key point is all approaches have some pros and cons.
> 
> If we merge and decide to only do that inside certain boundaries then 
> those boundaries needs to be provided and checked against. This burns 
> quite some CPU cycles
> 
> If we just merge what we can we might have extra page table updates 
> which cost time and could result in undesired side effects.
> 
> If we don't merge at all we have additional housekeeping for the 
> mappings and maybe hw restrictions.

Absolutely agree, hence I think it would be beneficial to leave the 
decision to the driver which approach to pick.

For instance, if a driver needs to keep track of these bounds anyway 
because it needs to track separate page tables for sparse regions, there 
is no additional overhead, but the nice effect of being able not avoid 
unnecessary merges and subsequent splits.

> 
>> Also, as explained in one of the previous mails in nouveau we can have 
>> separate PTs for sparse mappings with large page sizes and separate 
>> PTs for memory backed mappings with smaller page sizes overlaying 
>> them. Hence, I need to track a single sparse mapping per buffer 
>> spanning the whole buffer (which I do with a region) and the actual 
>> memory backed mappings within the same range.
>>
>> Now, this might or might not be unique for Nvidia hardware. If nouveau 
>> would be the only potential user, plus we don't care about potentially 
>> merging mappings over buffer boundaries and hence producing 
>> foreseeable splits of those merged mappings, we could get rid of 
>> regions entirely.
> 
> This sounds similar to what AMD hw used to have up until gfx8 (I think), 
> basically sparse resources where defined through a separate mechanism to 
> the address resolution of the page tables. I won't rule out that other 
> hardware has similar approaches.
> 
> On the other hand when you have separate page tables for address 
> translation and sparse handling then why not instantiate two separate VM 
> manager instances for them?

As mentioned above, for some drivers there could be a synergy between 
keeping track of those separate page tables and using these boundaries 
for merge decisions.

Also, having a separate manager instance would lead to have less 
lightweight nodes for sparse regions, since we'd also carry the fields 
needed for memory backed mappings. Furthermore there wouldn't be a 
"generic relationship" between the nodes of the two separate manager 
instances, like a mapping node has a pointer to the region it resides 
in. This may be useful to e.g. do some sanity checks, unmap all mappings 
of a given region, etc.

Of course drivers could code this relationship within the driver 
specific structures around the mapping nodes, but I think it would be 
nice to generalize that and have this built-in.

> 
> Regards,
> Christian.
> 
>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>
>>>>
>>>>>
>>>>> So you need to be able to handle this case anyway and the approach 
>>>>> with the regions won't help you at all preventing that.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
index 4411e6919a3d..9c7ca6e33a68 100644
--- a/Documentation/gpu/driver-uapi.rst
+++ b/Documentation/gpu/driver-uapi.rst
@@ -6,3 +6,11 @@  drm/i915 uAPI
 =============
 
 .. kernel-doc:: include/uapi/drm/i915_drm.h
+
+drm/nouveau uAPI
+================
+
+VM_BIND / EXEC uAPI
+-------------------
+
+.. kernel-doc:: include/uapi/drm/nouveau_drm.h
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index 853a327433d3..f6e7d40201d4 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -126,6 +126,216 @@  struct drm_nouveau_gem_cpu_fini {
 	__u32 handle;
 };
 
+/**
+ * struct drm_nouveau_sync - sync object
+ *
+ * This structure serves as synchronization mechanism for (potentially)
+ * asynchronous operations such as EXEC or VM_BIND.
+ */
+struct drm_nouveau_sync {
+	/**
+	 * @flags: the flags for a sync object
+	 *
+	 * The first 8 bits are used to determine the type of the sync object.
+	 */
+	__u32 flags;
+#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
+#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
+#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
+	/**
+	 * @handle: the handle of the sync object
+	 */
+	__u32 handle;
+	/**
+	 * @timeline_value:
+	 *
+	 * The timeline point of the sync object in case the syncobj is of
+	 * type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
+	 */
+	__u64 timeline_value;
+};
+
+/**
+ * struct drm_nouveau_vm_init - GPU VA space init structure
+ *
+ * Used to initialize the GPU's VA space for a user client, telling the kernel
+ * which portion of the VA space is managed by the UMD and kernel respectively.
+ */
+struct drm_nouveau_vm_init {
+	/**
+	 * @unmanaged_addr: start address of the kernel managed VA space region
+	 */
+	__u64 unmanaged_addr;
+	/**
+	 * @unmanaged_size: size of the kernel managed VA space region in bytes
+	 */
+	__u64 unmanaged_size;
+};
+
+/**
+ * struct drm_nouveau_vm_bind_op - VM_BIND operation
+ *
+ * This structure represents a single VM_BIND operation. UMDs should pass
+ * an array of this structure via struct drm_nouveau_vm_bind's &op_ptr field.
+ */
+struct drm_nouveau_vm_bind_op {
+	/**
+	 * @op: the operation type
+	 */
+	__u32 op;
+/**
+ * @DRM_NOUVEAU_VM_BIND_OP_ALLOC:
+ *
+ * The alloc operation is used to reserve a VA space region within the GPU's VA
+ * space. Optionally, the &DRM_NOUVEAU_VM_BIND_SPARSE flag can be passed to
+ * instruct the kernel to create sparse mappings for the given region.
+ */
+#define DRM_NOUVEAU_VM_BIND_OP_ALLOC 0x0
+/**
+ * @DRM_NOUVEAU_VM_BIND_OP_FREE: Free a reserved VA space region.
+ */
+#define DRM_NOUVEAU_VM_BIND_OP_FREE 0x1
+/**
+ * @DRM_NOUVEAU_VM_BIND_OP_MAP:
+ *
+ * Map a GEM object to the GPU's VA space. The mapping must be fully enclosed by
+ * a previously allocated VA space region. If the region is sparse, existing
+ * sparse mappings are overwritten.
+ */
+#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x2
+/**
+ * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
+ *
+ * Unmap an existing mapping in the GPU's VA space. If the region the mapping
+ * is located in is a sparse region, new sparse mappings are created where the
+ * unmapped (memory backed) mapping was mapped previously.
+ */
+#define DRM_NOUVEAU_VM_BIND_OP_UNMAP 0x3
+	/**
+	 * @flags: the flags for a &drm_nouveau_vm_bind_op
+	 */
+	__u32 flags;
+/**
+ * @DRM_NOUVEAU_VM_BIND_SPARSE:
+ *
+ * Indicates that an allocated VA space region should be sparse.
+ */
+#define DRM_NOUVEAU_VM_BIND_SPARSE (1 << 8)
+	/**
+	 * @handle: the handle of the DRM GEM object to map
+	 */
+	__u32 handle;
+	/**
+	 * @addr:
+	 *
+	 * the address the VA space region or (memory backed) mapping should be mapped to
+	 */
+	__u64 addr;
+	/**
+	 * @bo_offset: the offset within the BO backing the mapping
+	 */
+	__u64 bo_offset;
+	/**
+	 * @range: the size of the requested mapping in bytes
+	 */
+	__u64 range;
+};
+
+/**
+ * struct drm_nouveau_vm_bind - structure for DRM_IOCTL_NOUVEAU_VM_BIND
+ */
+struct drm_nouveau_vm_bind {
+	/**
+	 * @op_count: the number of &drm_nouveau_vm_bind_op
+	 */
+	__u32 op_count;
+	/**
+	 * @flags: the flags for a &drm_nouveau_vm_bind ioctl
+	 */
+	__u32 flags;
+/**
+ * @DRM_NOUVEAU_VM_BIND_RUN_ASYNC:
+ *
+ * Indicates that the given VM_BIND operation should be executed asynchronously
+ * by the kernel.
+ *
+ * If this flag is not supplied the kernel executes the associated operations
+ * synchronously and doesn't accept any &drm_nouveau_sync objects.
+ */
+#define DRM_NOUVEAU_VM_BIND_RUN_ASYNC 0x1
+	/**
+	 * @wait_count: the number of wait &drm_nouveau_syncs
+	 */
+	__u32 wait_count;
+	/**
+	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
+	 */
+	__u32 sig_count;
+	/**
+	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
+	 */
+	__u64 wait_ptr;
+	/**
+	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
+	 */
+	__u64 sig_ptr;
+	/**
+	 * @op_ptr: pointer to the &drm_nouveau_vm_bind_ops to execute
+	 */
+	__u64 op_ptr;
+};
+
+/**
+ * struct drm_nouveau_exec_push - EXEC push operation
+ *
+ * This structure represents a single EXEC push operation. UMDs should pass an
+ * array of this structure via struct drm_nouveau_exec's &push_ptr field.
+ */
+struct drm_nouveau_exec_push {
+	/**
+	 * @va: the virtual address of the push buffer mapping
+	 */
+	__u64 va;
+	/**
+	 * @va_len: the length of the push buffer mapping
+	 */
+	__u64 va_len;
+};
+
+/**
+ * struct drm_nouveau_exec - structure for DRM_IOCTL_NOUVEAU_EXEC
+ */
+struct drm_nouveau_exec {
+	/**
+	 * @channel: the channel to execute the push buffer in
+	 */
+	__u32 channel;
+	/**
+	 * @push_count: the number of &drm_nouveau_exec_push ops
+	 */
+	__u32 push_count;
+	/**
+	 * @wait_count: the number of wait &drm_nouveau_syncs
+	 */
+	__u32 wait_count;
+	/**
+	 * @sig_count: the number of &drm_nouveau_syncs to signal when finished
+	 */
+	__u32 sig_count;
+	/**
+	 * @wait_ptr: pointer to &drm_nouveau_syncs to wait for
+	 */
+	__u64 wait_ptr;
+	/**
+	 * @sig_ptr: pointer to &drm_nouveau_syncs to signal when finished
+	 */
+	__u64 sig_ptr;
+	/**
+	 * @push_ptr: pointer to &drm_nouveau_exec_push ops
+	 */
+	__u64 push_ptr;
+};
+
 #define DRM_NOUVEAU_GETPARAM           0x00 /* deprecated */
 #define DRM_NOUVEAU_SETPARAM           0x01 /* deprecated */
 #define DRM_NOUVEAU_CHANNEL_ALLOC      0x02 /* deprecated */
@@ -136,6 +346,9 @@  struct drm_nouveau_gem_cpu_fini {
 #define DRM_NOUVEAU_NVIF               0x07
 #define DRM_NOUVEAU_SVM_INIT           0x08
 #define DRM_NOUVEAU_SVM_BIND           0x09
+#define DRM_NOUVEAU_VM_INIT            0x10
+#define DRM_NOUVEAU_VM_BIND            0x11
+#define DRM_NOUVEAU_EXEC               0x12
 #define DRM_NOUVEAU_GEM_NEW            0x40
 #define DRM_NOUVEAU_GEM_PUSHBUF        0x41
 #define DRM_NOUVEAU_GEM_CPU_PREP       0x42
@@ -197,6 +410,9 @@  struct drm_nouveau_svm_bind {
 #define DRM_IOCTL_NOUVEAU_GEM_CPU_FINI       DRM_IOW (DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_CPU_FINI, struct drm_nouveau_gem_cpu_fini)
 #define DRM_IOCTL_NOUVEAU_GEM_INFO           DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_GEM_INFO, struct drm_nouveau_gem_info)
 
+#define DRM_IOCTL_NOUVEAU_VM_INIT            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init)
+#define DRM_IOCTL_NOUVEAU_VM_BIND            DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind)
+#define DRM_IOCTL_NOUVEAU_EXEC               DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec)
 #if defined(__cplusplus)
 }
 #endif