[6/6] drm/xe/xe_vm: Implement xe_vm_get_property_ioctl

Message ID 20250226225557.133076-7-jonathan.cavitt@intel.com (mailing list archive)
State New
Series drm/xe/xe_vm: Implement xe_vm_get_property_ioctl

Commit Message

Jonathan Cavitt Feb. 26, 2025, 10:55 p.m. UTC
Add support for userspace to get various properties from a specified VM.
The currently supported properties are:

- The number of engine resets the VM has observed
- The number of exec queue bans the VM has observed, up to the last 50
  relevant ones, and how many of those were caused by faults.

The latter request also includes information on the exec queue bans,
such as the ID of the banned exec queue, whether the ban was caused by a
pagefault or not, and the address and address type of the associated
fault (if one exists).

Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Suggested-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c |   2 +
 drivers/gpu/drm/xe/xe_vm.c     | 106 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_vm.h     |   2 +
 include/uapi/drm/xe_drm.h      |  73 +++++++++++++++++++++++
 4 files changed, 183 insertions(+)

Comments

kernel test robot Feb. 27, 2025, 2:36 a.m. UTC | #1
Hi Jonathan,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250226]
[cannot apply to linus/master v6.14-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Cavitt/drm-xe-xe_gt_pagefault-Migrate-lookup_vma-to-xe_vm-h/20250227-070008
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250226225557.133076-7-jonathan.cavitt%40intel.com
patch subject: [PATCH 6/6] drm/xe/xe_vm: Implement xe_vm_get_property_ioctl
config: s390-randconfig-001-20250227 (https://download.01.org/0day-ci/archive/20250227/202502271029.67aYhWm6-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250227/202502271029.67aYhWm6-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502271029.67aYhWm6-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/xe/xe_vm.c:3288:2: error: use of undeclared identifier 'vma'; did you mean 'vm'?
    3288 |         vma = lookup_vma(vm, pf->page_addr);
         |         ^~~
         |         vm
   drivers/gpu/drm/xe/xe_vm.c:3283:56: note: 'vm' declared here
    3283 | xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
         |                                                        ^
>> drivers/gpu/drm/xe/xe_vm.c:3288:8: error: call to undeclared function 'lookup_vma'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    3288 |         vma = lookup_vma(vm, pf->page_addr);
         |               ^
>> drivers/gpu/drm/xe/xe_vm.c:3288:6: error: incompatible integer to pointer conversion assigning to 'struct xe_vm *' from 'int' [-Wint-conversion]
    3288 |         vma = lookup_vma(vm, pf->page_addr);
         |             ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_vm.c:3289:7: error: use of undeclared identifier 'vma'; did you mean 'vm'?
    3289 |         if (!vma)
         |              ^~~
         |              vm
   drivers/gpu/drm/xe/xe_vm.c:3283:56: note: 'vm' declared here
    3283 | xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
         |                                                        ^
   drivers/gpu/drm/xe/xe_vm.c:3291:23: error: use of undeclared identifier 'vma'; did you mean 'vm'?
    3291 |         if (xe_vma_read_only(vma) && pf->access_type != XE_PAGEFAULT_ACCESS_TYPE_READ)
         |                              ^~~
         |                              vm
   drivers/gpu/drm/xe/xe_vm.c:3283:56: note: 'vm' declared here
    3283 | xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
         |                                                        ^
   5 errors generated.


vim +3288 drivers/gpu/drm/xe/xe_vm.c

  3281	
  3282	static enum drm_xe_fault_address_type
  3283	xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
  3284	{
  3285		if (!pf)
  3286			return 0;
  3287	
> 3288		vma = lookup_vma(vm, pf->page_addr);
  3289		if (!vma)
  3290			return DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT;
  3291		if (xe_vma_read_only(vma) && pf->access_type != XE_PAGEFAULT_ACCESS_TYPE_READ)
  3292			return DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT;
  3293		return 0;
  3294	}
  3295
kernel test robot Feb. 27, 2025, 5:14 a.m. UTC | #2
Hi Jonathan,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250226]
[cannot apply to linus/master v6.14-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Cavitt/drm-xe-xe_gt_pagefault-Migrate-lookup_vma-to-xe_vm-h/20250227-070008
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250226225557.133076-7-jonathan.cavitt%40intel.com
patch subject: [PATCH 6/6] drm/xe/xe_vm: Implement xe_vm_get_property_ioctl
config: i386-buildonly-randconfig-002-20250227 (https://download.01.org/0day-ci/archive/20250227/202502271226.qHKoHhA5-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250227/202502271226.qHKoHhA5-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502271226.qHKoHhA5-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/gpu/drm/xe/xe_vm.c: In function 'xe_pagefault_access_type_to_address_type':
>> drivers/gpu/drm/xe/xe_vm.c:3288:9: error: 'vma' undeclared (first use in this function); did you mean 'vm'?
    3288 |         vma = lookup_vma(vm, pf->page_addr);
         |         ^~~
         |         vm
   drivers/gpu/drm/xe/xe_vm.c:3288:9: note: each undeclared identifier is reported only once for each function it appears in
>> drivers/gpu/drm/xe/xe_vm.c:3288:15: error: implicit declaration of function 'lookup_vma' [-Werror=implicit-function-declaration]
    3288 |         vma = lookup_vma(vm, pf->page_addr);
         |               ^~~~~~~~~~
   cc1: some warnings being treated as errors


vim +3288 drivers/gpu/drm/xe/xe_vm.c

  3281	
  3282	static enum drm_xe_fault_address_type
  3283	xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
  3284	{
  3285		if (!pf)
  3286			return 0;
  3287	
> 3288		vma = lookup_vma(vm, pf->page_addr);
  3289		if (!vma)
  3290			return DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT;
  3291		if (xe_vma_read_only(vma) && pf->access_type != XE_PAGEFAULT_ACCESS_TYPE_READ)
  3292			return DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT;
  3293		return 0;
  3294	}
  3295
Matthew Brost Feb. 27, 2025, 8:25 a.m. UTC | #3
On Wed, Feb 26, 2025 at 10:55:56PM +0000, Jonathan Cavitt wrote:
> Add support for userspace to get various properties from a specified VM.
> The currently supported properties are:
> 
> - The number of engine resets the VM has observed
> - The number of exec queue bans the VM has observed, up to the last 50
>   relevant ones, and how many of those were caused by faults.
> 
> The latter request also includes information on the exec queue bans,
> such as the ID of the banned exec queue, whether the ban was caused by a
> pagefault or not, and the address and address type of the associated
> fault (if one exists).
> 

> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> Suggested-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c |   2 +
>  drivers/gpu/drm/xe/xe_vm.c     | 106 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_vm.h     |   2 +
>  include/uapi/drm/xe_drm.h      |  73 +++++++++++++++++++++++
>  4 files changed, 183 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 9454b51f7ad8..3a509a69062c 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -193,6 +193,8 @@ static const struct drm_ioctl_desc xe_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl,
>  			  DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(XE_OBSERVATION, xe_observation_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(XE_VM_GET_PROPERTY, xe_vm_get_property_ioctl,
> +			  DRM_RENDER_ALLOW),
>  };
>  
>  static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 3e88652670e6..047908eb9ff7 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3258,6 +3258,112 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	return err;
>  }
>  
> +static u32 xe_vm_get_property_size(struct xe_vm *vm, u32 property)
> +{
> +	u32 size = -EINVAL;
> +
> +	switch (property) {
> +	case DRM_XE_VM_GET_PROPERTY_FAULTS:
> +		spin_lock(&vm->bans.lock);
> +		size = vm->bans.len * sizeof(struct drm_xe_ban);
> +		spin_unlock(&vm->bans.lock);
> +		size += sizeof(struct drm_xe_faults);
> +		break;
> +	case DRM_XE_VM_GET_PROPERTY_NUM_RESETS:
> +		size = sizeof(u64);
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	return size;
> +}
> +
> +static enum drm_xe_fault_address_type
> +xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
> +{
> +	if (!pf)
> +		return 0;
> +
> +	vma = lookup_vma(vm, pf->page_addr);

The VMA state is not stable; for example, it can change between the time
of the fault and the time this IOCTL is called. Therefore, we need an
intermediate structure fully populated at fault time, which is then
converted to user output upon IOCTL.
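
A minimal sketch of that idea, assuming a hypothetical xe_vm_fault_snapshot
populated from the pagefault handler while the VMA lookup is still
meaningful (all names here are illustrative, not existing Xe API):

struct xe_vm_fault_snapshot {
	struct list_head list;	/* linked off the VM's fault list */
	u64 page_addr;		/* faulting address, copied from the pf */
	u32 address_type;	/* resolved now, not at IOCTL time */
	u8 fault_level;
};

/* Called while the fault is being serviced and the VMA state is coherent. */
static void xe_vm_capture_fault(struct xe_vm *vm, struct xe_pagefault *pf)
{
	struct xe_vm_fault_snapshot *snap = kzalloc(sizeof(*snap), GFP_KERNEL);

	if (!snap)
		return;

	snap->page_addr = pf->page_addr;
	snap->fault_level = pf->fault_level;
	/* Resolve via lookup_vma() here, while the lookup is still valid */
	snap->address_type = xe_pagefault_access_type_to_address_type(vm, pf);

	spin_lock(&vm->bans.lock);
	list_add_tail(&snap->list, &vm->bans.list);
	spin_unlock(&vm->bans.lock);
}

The IOCTL would then only walk the snapshot list and never touch live VMA
state.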

> +	if (!vma)
> +		return DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT;
> +	if (xe_vma_read_only(vma) && pf->access_type != XE_PAGEFAULT_ACCESS_TYPE_READ)
> +		return DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT;
> +	return 0;
> +}
> +
> +int xe_vm_get_property_ioctl(struct drm_device *drm, void *data,
> +			     struct drm_file *file)
> +{
> +	struct xe_device *xe = to_xe_device(drm);
> +	struct xe_file *xef = to_xe_file(file);
> +	struct drm_xe_vm_get_property *args = data;
> +	struct xe_vm *vm;
> +	u32 size;
> +
> +	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> +		return -EINVAL;
> +
> +	vm = xe_vm_lookup(xef, args->vm_id);
> +	if (XE_IOCTL_DBG(xe, !vm))
> +		return -ENOENT;
> +
> +	size = xe_vm_get_property_size(vm, args->property);

For both this size calculation and the population if statements below,
I'd lean toward using vfunc tables for the implementations based on the
property index. Maybe overkill, though—not a strongly held opinion.
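
Something like this, as a sketch (the per-property helpers are hypothetical
names):

struct xe_vm_property_ops {
	ssize_t (*size)(struct xe_vm *vm);
	int (*copy)(struct xe_vm *vm, void __user *ptr, u32 size);
};

static const struct xe_vm_property_ops property_ops[] = {
	[DRM_XE_VM_GET_PROPERTY_FAULTS] = {
		.size = faults_size,
		.copy = faults_copy,
	},
	[DRM_XE_VM_GET_PROPERTY_NUM_RESETS] = {
		.size = num_resets_size,
		.copy = num_resets_copy,
	},
};

with the IOCTL reduced to a bounds check on args->property followed by two
indirect calls, and a ssize_t size callback so error codes fit.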

> +	if (size < 0) {
> +		return size;
> +	} else if (!args->size) {
> +		args->size = size;
> +		return 0;
> +	} else if (args->size != size) {
> +		return -EINVAL;
> +	}
> +
> +	if (args->property == DRM_XE_VM_GET_PROPERTY_FAULTS) {
> +		struct drm_xe_faults __user *usr_ptr = u64_to_user_ptr(args->data);
> +		struct drm_xe_faults fault_list;
> +		struct drm_xe_ban *ban;
> +		struct xe_exec_queue_ban_entry *entry;
> +		int i = 0;
> +
> +		if (copy_from_user(&fault_list, usr_ptr, size))
> +			return -EFAULT;
> +
> +		fault_list.num_faults = 0;
> +
> +		spin_lock(&vm->bans.lock);
> +		list_for_each_entry(entry, &vm->bans.list, list) {
> +			struct xe_pagefault *pf = entry->pf;
> +
> +			ban = &fault_list.list[i++];
> +			ban->exec_queue_id = entry->exec_queue_id;
> +			ban->faulted = !!pf ? 1 : 0;
> +			ban->address = pf ? pf->page_addr : 0;
> +			ban->address_type = xe_pagefault_access_type_to_address_type(vm, pf);
> +			ban->address_type = pf ? pf->fault_type : 0;
> +			fault_list.num_faults += ban->faulted;
> +		}
> +		spin_unlock(&vm->bans.lock);
> +
> +		fault_list.num_bans = i;
> +
> +		if (copy_to_user(usr_ptr, &fault_list, size))
> +			return -EFAULT;
> +
> +	} else if (args->property == DRM_XE_VM_GET_PROPERTY_NUM_RESETS) {
> +		u64 __user *usr_ptr = u64_to_user_ptr(args->data);
> +		u64 num_resets = atomic_read(&vm->reset_count);
> +
> +		if (copy_to_user(usr_ptr, &num_resets, size))
> +			return -EFAULT;
> +
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
>  /**
>   * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
>   * @vm: VM to bind the BO to
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index 78dbc5d57cd3..84653539d8db 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -184,6 +184,8 @@ int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
>  			struct drm_file *file);
>  int xe_vm_bind_ioctl(struct drm_device *dev, void *data,
>  		     struct drm_file *file);
> +int xe_vm_get_property_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file);
>  
>  void xe_vm_close_and_put(struct xe_vm *vm);
>  
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index 76a462fae05f..00328d8a15dd 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -81,6 +81,7 @@ extern "C" {
>   *  - &DRM_IOCTL_XE_EXEC
>   *  - &DRM_IOCTL_XE_WAIT_USER_FENCE
>   *  - &DRM_IOCTL_XE_OBSERVATION
> + *  - &DRM_IOCTL_XE_VM_GET_BANS

s/DRM_IOCTL_XE_VM_GET_BANS/DRM_IOCTL_XE_VM_GET_PROPERTY

>   */
>  
>  /*
> @@ -102,6 +103,7 @@ extern "C" {
>  #define DRM_XE_EXEC			0x09
>  #define DRM_XE_WAIT_USER_FENCE		0x0a
>  #define DRM_XE_OBSERVATION		0x0b
> +#define DRM_XE_VM_GET_PROPERTY		0x0c
>  
>  /* Must be kept compact -- no holes */
>  
> @@ -117,6 +119,7 @@ extern "C" {
>  #define DRM_IOCTL_XE_EXEC			DRM_IOW(DRM_COMMAND_BASE + DRM_XE_EXEC, struct drm_xe_exec)
>  #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
>  #define DRM_IOCTL_XE_OBSERVATION		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OBSERVATION, struct drm_xe_observation_param)
> +#define DRM_IOCTL_XE_VM_GET_PROPERTY		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_GET_PROPERTY, struct drm_xe_vm_get_property)

s/DRM_IOW/DRM_IOWR as we write back the size.

For reference: DRM_IOW does not copy back the modified IOCTL structure
(e.g., struct drm_xe_vm_get_property), while DRM_IOWR does.
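
i.e., assuming the rest of the patch is unchanged, the define would become:

#define DRM_IOCTL_XE_VM_GET_PROPERTY		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_VM_GET_PROPERTY, struct drm_xe_vm_get_property)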

>  
>  /**
>   * DOC: Xe IOCTL Extensions
> @@ -1166,6 +1169,76 @@ struct drm_xe_vm_bind {
>  	__u64 reserved[2];
>  };
>  
> +/** Types of fault address */
> +enum drm_xe_fault_address_type {
> +	DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT,
> +	DRM_XE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT,
> +	DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT,
> +};

No enum uAPI in header files—use defines instead. Enums can change at
the compiler's discretion, while defines cannot.
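
For example, the enum above would become:

#define DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT		0
#define DRM_XE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT	1
#define DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT	2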

> +
> +struct drm_xe_ban {
> +	/** @exec_queue_id: ID of banned exec queue */
> +	__u32 exec_queue_id;

I don't think we can reliably associate a page fault with an
exec_queue_id at the moment, given my above statement about having to
capture all state at the time of the page fault. Maybe we could with
some tricks between the page fault and the IOMMU CAT error G2H?
Regardless, let's ask the UMD we are targeting [1] if this information
would be helpful. It would seemingly have to be vendor-specific
information, not part of the generic Vk information.

Additionally, it might be good to ask what other vendor-specific
information, if any, we'd need here based on what the current page fault
interface supports.

[1] https://registry.khronos.org/vulkan/specs/latest/man/html/VK_EXT_device_fault.html

> +	/** @faulted: Whether or not the ban has an associated pagefault.  0 is no, 1 is yes */
> +	__u32 faulted;
> +	/** @address: Address of the fault, if relevant */
> +	__u64 address;
> +	/** @address_type: enum drm_xe_fault_address_type, if relevant */
> +	__u32 address_type;

We likely need a fault_size field to support VkDeviceSize
addressPrecision; as defined here [2]. I believe we can extract this
information from pagefault.fault_level.

[2] https://registry.khronos.org/vulkan/specs/latest/man/html/VkDeviceFaultAddressInfoEXT.html
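
Assuming fault_level encodes the page-table level at which the fault was
reported (0 = 4K PTE, 1 = 2M PDE, 2 = 1G PDPE), a first-pass mapping could
look like this (hypothetical helper, not existing code):

static u64 xe_pagefault_fault_size(u8 fault_level)
{
	return SZ_4K << (9 * fault_level);
}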

> +	/** @pad: MBZ */
> +	__u32 pad;
> +	/** @reserved: MBZ */
> +	__u64 reserved[3];
> +};
> +
> +struct drm_xe_faults {
> +	/** @num_faults: Number of faults observed on the VM */
> +	__u32 num_faults;
> +	/** @num_bans: Number of bans observed on the VM */
> +	__u32 num_bans;

I don't think num_bans and num_faults really provide any benefit for
supporting [1]. The requirement for [1] is device faults—nothing more.
With that in mind, I'd lean toward an array of a single structure
(returned in drm_xe_vm_get_property.data, number of faults can be
inferred from the returned size) reporting all faults, with each entry
containing all the fault information. If another use case arises for
reporting all banned queues, we can add a property for that.
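
Concretely, that could be as simple as (illustrative only):

struct drm_xe_fault {
	__u64 address;
	__u32 address_type;
	__u32 pad;
	__u64 reserved[2];
};

returned in drm_xe_vm_get_property.data as a bare array, with the count
implied by size / sizeof(struct drm_xe_fault).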

> +	/** @reserved: MBZ */
> +	__u64 reserved[2];
> +	/** @list: Dynamic sized array of drm_xe_ban bans */
> +	struct drm_xe_ban list[];

list[0] would be the preferred way.

> +};
> +
> +/**
> + * struct drm_xe_vm_get_property - Input of &DRM_IOCTL_XE_VM_GET_PROPERTY
> + *
> + * The user provides a VM ID and a property to query to this ioctl,
> + * and the ioctl returns the size of the return value.  Calling the
> + * ioctl again with memory reserved for the data will save the
> + * requested property data to the data pointer.
> + *
> + * The valid properties are:
> + *  - %DRM_XE_VM_GET_PROPERTY_FAULTS : Property is a drm_xe_faults struct of dynamic size
> + *  - %DRM_XE_VM_GET_PROPERTY_NUM_RESETS: Property is a scalar

We need to consider where the number of resets requirement is coming
from. As far as I know, the Vk extension [1] we are targeting does not
need this information. I'm unsure about the compute UMD requirements at
the moment, so let's focus on supporting the Vk extension first.

Any uAPI must also have a UMD component, so focusing on one issue at a
time makes sense.

> + */
> +struct drm_xe_vm_get_property {
> +	/** @extensions: Pointer to the first extension struct, if any */
> +	__u64 extensions;
> +
> +	/** @vm_id: The ID of the VM to query the properties of */
> +	__u32 vm_id;
> +
> +#define DRM_XE_VM_GET_PROPERTY_FAULTS		0
> +#define DRM_XE_VM_GET_PROPERTY_NUM_RESETS	1
> +	/** @property: The property to get */
> +	__u32 property;
> +
> +	/** @size: Size of returned property @data */
> +	__u32 size;
> +
> +	/** @pad: MBZ */
> +	__u32 pad;
> +
> +	/** @reserved: MBZ */
> +	__u64 reserved[2];

I'd put the reserved bits at the end.

> +
> +	/** @data: Pointer storing return data */
> +	__u64 data;

union {
	__u64 data;
	__u64 ptr;
};

We would simply return 'data' for properties that fit in a u64 and
perform the size=0, return size process for properties that require a
user allocation.

This may not be relevant at the moment if we drop
DRM_XE_VM_GET_PROPERTY_NUM_RESET.
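
From userspace, the size=0 process would look roughly like this (error
handling elided; it also depends on the DRM_IOWR fix above so that
args.size is actually copied back):

struct drm_xe_vm_get_property args = {
	.vm_id = vm_id,
	.property = DRM_XE_VM_GET_PROPERTY_FAULTS,
	/* .size is 0 on the first call: ask the kernel for the size */
};

ioctl(fd, DRM_IOCTL_XE_VM_GET_PROPERTY, &args);	/* kernel sets args.size */

void *buf = calloc(1, args.size);
args.data = (__u64)(uintptr_t)buf;
ioctl(fd, DRM_IOCTL_XE_VM_GET_PROPERTY, &args);	/* kernel fills buf */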

Matt

> +};
> +
>  /**
>   * struct drm_xe_exec_queue_create - Input of &DRM_IOCTL_XE_EXEC_QUEUE_CREATE
>   *
> -- 
> 2.43.0
>
Jonathan Cavitt Feb. 27, 2025, 4:51 p.m. UTC | #4
Some responses below.  If I skip over anything, just assume that I'm taking the request
into consideration and that it will be fixed for version 2 of this patch series.

-----Original Message-----
From: Brost, Matthew <matthew.brost@intel.com> 
Sent: Thursday, February 27, 2025 12:25 AM
To: Cavitt, Jonathan <jonathan.cavitt@intel.com>
Cc: intel-xe@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; joonas.lahtinen@linux.intel.com; Zhang, Jianxun <jianxun.zhang@intel.com>; dri-devel@lists.freedesktop.org
Subject: Re: [PATCH 6/6] drm/xe/xe_vm: Implement xe_vm_get_property_ioctl
> 
> On Wed, Feb 26, 2025 at 10:55:56PM +0000, Jonathan Cavitt wrote:
> > Add support for userspace to get various properties from a specified VM.
> > The currently supported properties are:
> > 
> > - The number of engine resets the VM has observed
> > - The number of exec queue bans the VM has observed, up to the last 50
> >   relevant ones, and how many of those were caused by faults.
> > 
> > The latter request also includes information on the exec queue bans,
> > such as the ID of the banned exec queue, whether the ban was caused by a
> > pagefault or not, and the address and address type of the associated
> > fault (if one exists).
> > 
> 
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > Suggested-by: Matthew Brost <matthew.brost@intel.com>
> > ---
[...]
> 
> > +
> > +struct drm_xe_ban {
> > +	/** @exec_queue_id: ID of banned exec queue */
> > +	__u32 exec_queue_id;
> 
> I don't think we can reliably associate a page fault with an
> exec_queue_id at the moment, given my above statement about having to
> capture all state at the time of the page fault. Maybe we could with
> some tricks between the page fault and the IOMMU CAT error G2H?
> Regardless, let's ask the UMD we are targeting [1] if this information
> would be helpful. It would seemingly have to be vendor-specific
> information, not part of the generic Vk information.
> 
> Additionally, it might be good to ask what other vendor-specific
> information, if any, we'd need here based on what the current page fault
> interface supports.
> 
> [1] https://registry.khronos.org/vulkan/specs/latest/man/html/VK_EXT_device_fault.html

The original request was something along the lines of having a mirror of the
DRM_IOCTL_I915_GET_RESET_STATS on XeKMD.  Those reset stats contain
information on the "context" ID, which maps to the exec queue ID on XeKMD.

Even if we can't reasonably blame a pagefault on a particular exec queue, in
order to match the request correctly, this information needs to be returned.

The I915 reset stats also contain information on the number of observed engine
resets, so that needs to be returned as well.

@joonas.lahtinen@linux.intel.com can provide more details.  Or maybe
Tomasz Mistat.

> 
> > +	/** @faulted: Whether or not the ban has an associated pagefault.  0 is no, 1 is yes */
> > +	__u32 faulted;
> > +	/** @address: Address of the fault, if relevant */
> > +	__u64 address;
> > +	/** @address_type: enum drm_xe_fault_address_type, if relevant */
> > +	__u32 address_type;
> 
> We likely need a fault_size field to support VkDeviceSize
> addressPrecision; as defined here [2]. I believe we can extract this
> information from pagefault.fault_level.
> 
> [2] https://registry.khronos.org/vulkan/specs/latest/man/html/VkDeviceFaultAddressInfoEXT.html

I can add this field as a prototype, though it will always return SZ_4K until we
can have a longer discussion on how to map between the fault_level and the
fault_size.

> 
> > +	/** @pad: MBZ */
> > +	__u32 pad;
> > +	/** @reserved: MBZ */
> > +	__u64 reserved[3];
> > +};
> > +
> > +struct drm_xe_faults {
> > +	/** @num_faults: Number of faults observed on the VM */
> > +	__u32 num_faults;
> > +	/** @num_bans: Number of bans observed on the VM */
> > +	__u32 num_bans;
> 
> I don't think num_bans and num_faults really provide any benefit for
> supporting [1]. The requirement for [1] is device faults-nothing more.
> With that in mind, I'd lean toward an array of a single structure
> (returned in drm_xe_vm_get_property.data, number of faults can be
> inferred from the returned size) reporting all faults, with each entry
> containing all the fault information. If another use case arises for
> reporting all banned queues, we can add a property for that.

I'm fairly certain the full ban list was directly requested, but I can break
it into a third query at least.

Also, the abstraction is done here because that's how copy_from_user
has historically been used.  I'd rather not experiment with trying to
copy_from_user a structure array and bungling it, but I guess I can give
it a try at least...
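
For what it's worth, the usual pattern sidesteps the flexible-array copy by
building the whole reply in a kernel buffer and issuing a single
copy_to_user (sketch; locking and fill logic elided):

struct drm_xe_faults *faults;
size_t alloc = struct_size(faults, list, num_bans);

faults = kvzalloc(alloc, GFP_KERNEL);
if (!faults)
	return -ENOMEM;

/* fill faults->num_bans and faults->list[] under vm->bans.lock */

if (copy_to_user(u64_to_user_ptr(args->data), faults, alloc)) {
	kvfree(faults);
	return -EFAULT;
}
kvfree(faults);

This also avoids writing past the on-stack struct drm_xe_faults in the
current version, whose flexible list[] member has no storage there.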

> 
> > +	/** @reserved: MBZ */
> > +	__u64 reserved[2];
> > +	/** @list: Dynamic sized array of drm_xe_ban bans */
> > +	struct drm_xe_ban list[];
> 
> > list[0] would be the preferred way.

That is not how dynamic arrays are handled for
struct drm_xe_query_engines,
struct drm_xe_query_mem_regions,
struct drm_xe_query_config,
struct drm_xe_query_gt_list,
struct drm_xe_query_topology_mask,
struct drm_xe_oa_unit,
or
struct drm_xe_query_oa_units

> 
> > +};
> > +
> > +/**
> > + * struct drm_xe_vm_get_property - Input of &DRM_IOCTL_XE_VM_GET_PROPERTY
> > + *
> > + * The user provides a VM ID and a property to query to this ioctl,
> > + * and the ioctl returns the size of the return value.  Calling the
> > + * ioctl again with memory reserved for the data will save the
> > + * requested property data to the data pointer.
> > + *
> > + * The valid properties are:
> > + *  - %DRM_XE_VM_GET_PROPERTY_FAULTS : Property is a drm_xe_faults struct of dynamic size
> > + *  - %DRM_XE_VM_GET_PROPERTY_NUM_RESETS: Property is a scalar
> 
> We need to consider where the number of resets requirement is coming
> from. As far as I know, the Vk extension [1] we are targeting does not
> need this information. I'm unsure about the compute UMD requirements at
> the moment, so let's focus on supporting the Vk extension first.
> 
> Any uAPI must also have a UMD component, so focusing on one issue at a
> time makes sense.

See first reply.
-Jonathan Cavitt

> 
> > + */
> > +struct drm_xe_vm_get_property {
> > +	/** @extensions: Pointer to the first extension struct, if any */
> > +	__u64 extensions;
> > +
> > +	/** @vm_id: The ID of the VM to query the properties of */
> > +	__u32 vm_id;
> > +
> > +#define DRM_XE_VM_GET_PROPERTY_FAULTS		0
> > +#define DRM_XE_VM_GET_PROPERTY_NUM_RESETS	1
> > +	/** @property: The property to get */
> > +	__u32 property;
> > +
> > +	/** @size: Size of returned property @data */
> > +	__u32 size;
> > +
> > +	/** @pad: MBZ */
> > +	__u32 pad;
> > +
> > +	/** @reserved: MBZ */
> > +	__u64 reserved[2];
> 
> I'd put the reserved bits at the end.
> 
> > +
> > +	/** @data: Pointer storing return data */
> > +	__u64 data;
> 
> union {
> 	__u64 data;
> 	__u64 ptr;
> };
> 
> We would simply return 'data' for properties that fit in a u64 and
> perform the size=0, return size process for properties that require a
> user allocation.
> 
> This may not be relevant at the moment if we drop
> DRM_XE_VM_GET_PROPERTY_NUM_RESET.
> 
> Matt
> 
> > +};
> > +
> >  /**
> >   * struct drm_xe_exec_queue_create - Input of &DRM_IOCTL_XE_EXEC_QUEUE_CREATE
> >   *
> > -- 
> > 2.43.0
> > 
>
Matthew Brost Feb. 28, 2025, 3:34 a.m. UTC | #5
On Thu, Feb 27, 2025 at 09:51:15AM -0700, Cavitt, Jonathan wrote:
> Some responses below.  If I skip over anything, just assume that I'm taking the request
> into consideration and that it will be fixed for version 2 of this patch series.
> 
> -----Original Message-----
> From: Brost, Matthew <matthew.brost@intel.com> 
> Sent: Thursday, February 27, 2025 12:25 AM
> To: Cavitt, Jonathan <jonathan.cavitt@intel.com>
> Cc: intel-xe@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; joonas.lahtinen@linux.intel.com; Zhang, Jianxun <jianxun.zhang@intel.com>; dri-devel@lists.freedesktop.org
> Subject: Re: [PATCH 6/6] drm/xe/xe_vm: Implement xe_vm_get_property_ioctl
> > 
> > On Wed, Feb 26, 2025 at 10:55:56PM +0000, Jonathan Cavitt wrote:
> > > Add support for userspace to get various properties from a specified VM.
> > > The currently supported properties are:
> > > 
> > > - The number of engine resets the VM has observed
> > > - The number of exec queue bans the VM has observed, up to the last 50
> > >   relevant ones, and how many of those were caused by faults.
> > > 
> > > The latter request also includes information on the exec queue bans,
> > > such as the ID of the banned exec queue, whether the ban was caused by a
> > > pagefault or not, and the address and address type of the associated
> > > fault (if one exists).
> > > 
> > 
> > > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > > Suggested-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> [...]
> > 
> > > +
> > > +struct drm_xe_ban {
> > > +	/** @exec_queue_id: ID of banned exec queue */
> > > +	__u32 exec_queue_id;
> > 
> > I don't think we can reliably associate a page fault with an
> > exec_queue_id at the moment, given my above statement about having to
> > capture all state at the time of the page fault. Maybe we could with
> > some tricks between the page fault and the IOMMU CAT error G2H?
> > Regardless, let's ask the UMD we are targeting [1] if this information
> > would be helpful. It would seemingly have to be vendor-specific
> > information, not part of the generic Vk information.
> > 
> > Additionally, it might be good to ask what other vendor-specific
> > information, if any, we'd need here based on what the current page fault
> > interface supports.
> > 
> > [1] https://registry.khronos.org/vulkan/specs/latest/man/html/VK_EXT_device_fault.html
> 
> The original request was something along the lines of having a mirror of the
> DRM_IOCTL_I915_GET_RESET_STATS on XeKMD.  Those reset stats contain
> information on the "context" ID, which maps to the exec queue ID on XeKMD.
> 
> Even if we can't reasonably blame a pagefault on a particular exec queue, in
> order to match the request correctly, this information needs to be returned.
> 
> The I915 reset stats also contain information on the number of observed engine
> resets, so that needs to be returned as well.
> 
> @joonas.lahtinen@linux.intel.com can provide more details.  Or maybe
> Tomasz Mistat.
> 

You haven't really answered my question here, or below where you refer
back to this reply. We need a UMD use case posted with any uAPI changes
before merging them. I know the above Vk extension is going to be
implemented on top of this series, but it is very unclear where the
number of resets requirement / UMD use case is coming from, which makes
it impossible to review.

Again, I suggest focusing on the Vk use case first, or talking to our UMD
partners to figure out exactly why something similar to
DRM_IOCTL_I915_GET_RESET_STATS is required in Xe. I have made similar
comments on VLK-69424.

Matt

> > 
> > > +	/** @faulted: Whether or not the ban has an associated pagefault.  0 is no, 1 is yes */
> > > +	__u32 faulted;
> > > +	/** @address: Address of the fault, if relevant */
> > > +	__u64 address;
> > > +	/** @address_type: enum drm_xe_fault_address_type, if relevant */
> > > +	__u32 address_type;
> > 
> > We likely need a fault_size field to support VkDeviceSize
> > addressPrecision; as defined here [2]. I believe we can extract this
> > information from pagefault.fault_level.
> > 
> > [2] https://registry.khronos.org/vulkan/specs/latest/man/html/VkDeviceFaultAddressInfoEXT.html
> 
> I can add this field as a prototype, though it will always return SZ_4K until we
> can have a longer discussion on how to map between the fault_level and the
> fault_size.
> 
> > 
> > > +	/** @pad: MBZ */
> > > +	__u32 pad;
> > > +	/** @reserved: MBZ */
> > > +	__u64 reserved[3];
> > > +};
> > > +
> > > +struct drm_xe_faults {
> > > +	/** @num_faults: Number of faults observed on the VM */
> > > +	__u32 num_faults;
> > > +	/** @num_bans: Number of bans observed on the VM */
> > > +	__u32 num_bans;
> > 
> > I don't think num_bans and num_faults really provide any benefit for
> > supporting [1]. The requirement for [1] is device faults-nothing more.
> > With that in mind, I'd lean toward an array of a single structure
> > (returned in drm_xe_vm_get_property.data, number of faults can be
> > inferred from the returned size) reporting all faults, with each entry
> > containing all the fault information. If another use case arises for
> > reporting all banned queues, we can add a property for that.
> 
> I'm fairly certain the full ban list was directly requested, but I can break
> it into a third query at least.
> 
> Also, the abstraction is done here because that's how copy_from_user
> has historically been used.  I'd rather not experiment with trying to
> copy_from_user a structure array and bungling it, but I guess I can give
> it a try at least...
> 
> > 
> > > +	/** @reserved: MBZ */
> > > +	__u64 reserved[2];
> > > +	/** @list: Dynamic sized array of drm_xe_ban bans */
> > > +	struct drm_xe_ban list[];
> > 
> > > list[0] would be the preferred way.
> 
> That is not how dynamic arrays are handled for
> struct drm_xe_query_engines,
> struct drm_xe_query_mem_regions,
> struct drm_xe_query_config,
> struct drm_xe_query_gt_list,
> struct drm_xe_query_topology_mask,
> struct drm_xe_oa_unit,
> or
> struct drm_xe_query_oa_units
> 
> > 
> > > +};
> > > +
> > > +/**
> > > + * struct drm_xe_vm_get_property - Input of &DRM_IOCTL_XE_VM_GET_PROPERTY
> > > + *
> > > + * The user provides a VM ID and a property to query to this ioctl,
> > > + * and the ioctl returns the size of the return value.  Calling the
> > > + * ioctl again with memory reserved for the data will save the
> > > + * requested property data to the data pointer.
> > > + *
> > > + * The valid properties are:
> > > + *  - %DRM_XE_VM_GET_PROPERTY_FAULTS : Property is a drm_xe_faults struct of dynamic size
> > > + *  - %DRM_XE_VM_GET_PROPERTY_NUM_RESETS: Property is a scalar
> > 
> > We need to consider where the number of resets requirement is coming
> > from. As far as I know, the Vk extension [1] we are targeting does not
> > need this information. I'm unsure about the compute UMD requirements at
> > the moment, so let's focus on supporting the Vk extension first.
> > 
> > Any uAPI must also have a UMD component, so focusing on one issue at a
> > time makes sense.
> 
> See first reply.
> -Jonathan Cavitt
> 
> > 
> > > + */
> > > +struct drm_xe_vm_get_property {
> > > +	/** @extensions: Pointer to the first extension struct, if any */
> > > +	__u64 extensions;
> > > +
> > > +	/** @vm_id: The ID of the VM to query the properties of */
> > > +	__u32 vm_id;
> > > +
> > > +#define DRM_XE_VM_GET_PROPERTY_FAULTS		0
> > > +#define DRM_XE_VM_GET_PROPERTY_NUM_RESETS	1
> > > +	/** @property: The property to get */
> > > +	__u32 property;
> > > +
> > > +	/** @size: Size of returned property @data */
> > > +	__u32 size;
> > > +
> > > +	/** @pad: MBZ */
> > > +	__u32 pad;
> > > +
> > > +	/** @reserved: MBZ */
> > > +	__u64 reserved[2];
> > 
> > I'd put the reserved bits at the end.
> > 
> > > +
> > > +	/** @data: Pointer storing return data */
> > > +	__u64 data;
> > 
> > union {
> > 	__u64 data;
> > 	__u64 ptr;
> > };
> > 
> > We would simply return 'data' for properties that fit in a u64 and
> > perform the size=0, return size process for properties that require a
> > user allocation.
> > 
> > This may not be relevant at the moment if we drop
> > DRM_XE_VM_GET_PROPERTY_NUM_RESET.
> > 
> > Matt
> > 
> > > +};
> > > +
> > >  /**
> > >   * struct drm_xe_exec_queue_create - Input of &DRM_IOCTL_XE_EXEC_QUEUE_CREATE
> > >   *
> > > -- 
> > > 2.43.0
> > > 
> >
Patch

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 9454b51f7ad8..3a509a69062c 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -193,6 +193,8 @@  static const struct drm_ioctl_desc xe_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl,
 			  DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(XE_OBSERVATION, xe_observation_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(XE_VM_GET_PROPERTY, xe_vm_get_property_ioctl,
+			  DRM_RENDER_ALLOW),
 };
 
 static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 3e88652670e6..047908eb9ff7 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3258,6 +3258,112 @@  int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	return err;
 }
 
+static u32 xe_vm_get_property_size(struct xe_vm *vm, u32 property)
+{
+	u32 size = -EINVAL;
+
+	switch (property) {
+	case DRM_XE_VM_GET_PROPERTY_FAULTS:
+		spin_lock(&vm->bans.lock);
+		size = vm->bans.len * sizeof(struct drm_xe_ban);
+		spin_unlock(&vm->bans.lock);
+		size += sizeof(struct drm_xe_faults);
+		break;
+	case DRM_XE_VM_GET_PROPERTY_NUM_RESETS:
+		size = sizeof(u64);
+		break;
+	default:
+		break;
+	}
+
+	return size;
+}
+
+static enum drm_xe_fault_address_type
+xe_pagefault_access_type_to_address_type(struct xe_vm *vm, struct xe_pagefault *pf)
+{
+	if (!pf)
+		return 0;
+
+	vma = lookup_vma(vm, pf->page_addr);
+	if (!vma)
+		return DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT;
+	if (xe_vma_read_only(vma) && pf->access_type != XE_PAGEFAULT_ACCESS_TYPE_READ)
+		return DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT;
+	return 0;
+}
+
+int xe_vm_get_property_ioctl(struct drm_device *drm, void *data,
+			     struct drm_file *file)
+{
+	struct xe_device *xe = to_xe_device(drm);
+	struct xe_file *xef = to_xe_file(file);
+	struct drm_xe_vm_get_property *args = data;
+	struct xe_vm *vm;
+	u32 size;
+
+	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
+		return -EINVAL;
+
+	vm = xe_vm_lookup(xef, args->vm_id);
+	if (XE_IOCTL_DBG(xe, !vm))
+		return -ENOENT;
+
+	size = xe_vm_get_property_size(vm, args->property);
+	if (size < 0) {
+		return size;
+	} else if (!args->size) {
+		args->size = size;
+		return 0;
+	} else if (args->size != size) {
+		return -EINVAL;
+	}
+
+	if (args->property == DRM_XE_VM_GET_PROPERTY_FAULTS) {
+		struct drm_xe_faults __user *usr_ptr = u64_to_user_ptr(args->data);
+		struct drm_xe_faults fault_list;
+		struct drm_xe_ban *ban;
+		struct xe_exec_queue_ban_entry *entry;
+		int i = 0;
+
+		if (copy_from_user(&fault_list, usr_ptr, size))
+			return -EFAULT;
+
+		fault_list.num_faults = 0;
+
+		spin_lock(&vm->bans.lock);
+		list_for_each_entry(entry, &vm->bans.list, list) {
+			struct xe_pagefault *pf = entry->pf;
+
+			ban = &fault_list.list[i++];
+			ban->exec_queue_id = entry->exec_queue_id;
+			ban->faulted = !!pf ? 1 : 0;
+			ban->address = pf ? pf->page_addr : 0;
+			ban->address_type = xe_pagefault_access_type_to_address_type(vm, pf);
+			ban->address_type = pf ? pf->fault_type : 0;
+			fault_list.num_faults += ban->faulted;
+		}
+		spin_unlock(&vm->bans.lock);
+
+		fault_list.num_bans = i;
+
+		if (copy_to_user(usr_ptr, &fault_list, size))
+			return -EFAULT;
+
+	} else if (args->property == DRM_XE_VM_GET_PROPERTY_NUM_RESETS) {
+		u64 __user *usr_ptr = u64_to_user_ptr(args->data);
+		u64 num_resets = atomic_read(&vm->reset_count);
+
+		if (copy_to_user(usr_ptr, &num_resets, size))
+			return -EFAULT;
+
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
  * @vm: VM to bind the BO to
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 78dbc5d57cd3..84653539d8db 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -184,6 +184,8 @@  int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file);
 int xe_vm_bind_ioctl(struct drm_device *dev, void *data,
 		     struct drm_file *file);
+int xe_vm_get_property_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file);
 
 void xe_vm_close_and_put(struct xe_vm *vm);
 
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 76a462fae05f..00328d8a15dd 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -81,6 +81,7 @@  extern "C" {
  *  - &DRM_IOCTL_XE_EXEC
  *  - &DRM_IOCTL_XE_WAIT_USER_FENCE
  *  - &DRM_IOCTL_XE_OBSERVATION
+ *  - &DRM_IOCTL_XE_VM_GET_BANS
  */
 
 /*
@@ -102,6 +103,7 @@  extern "C" {
 #define DRM_XE_EXEC			0x09
 #define DRM_XE_WAIT_USER_FENCE		0x0a
 #define DRM_XE_OBSERVATION		0x0b
+#define DRM_XE_VM_GET_PROPERTY		0x0c
 
 /* Must be kept compact -- no holes */
 
@@ -117,6 +119,7 @@  extern "C" {
 #define DRM_IOCTL_XE_EXEC			DRM_IOW(DRM_COMMAND_BASE + DRM_XE_EXEC, struct drm_xe_exec)
 #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
 #define DRM_IOCTL_XE_OBSERVATION		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OBSERVATION, struct drm_xe_observation_param)
+#define DRM_IOCTL_XE_VM_GET_PROPERTY		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_GET_PROPERTY, struct drm_xe_vm_get_property)
 
 /**
  * DOC: Xe IOCTL Extensions
@@ -1166,6 +1169,76 @@  struct drm_xe_vm_bind {
 	__u64 reserved[2];
 };
 
+/** Types of fault address */
+enum drm_xe_fault_address_type {
+	DRM_XE_FAULT_ADDRESS_TYPE_NONE_EXT,
+	DRM_XE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT,
+	DRM_XE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT,
+};
+
+struct drm_xe_ban {
+	/** @exec_queue_id: ID of banned exec queue */
+	__u32 exec_queue_id;
+	/** @faulted: Whether or not the ban has an associated pagefault.  0 is no, 1 is yes */
+	__u32 faulted;
+	/** @address: Address of the fault, if relevant */
+	__u64 address;
+	/** @address_type: enum drm_xe_fault_address_type, if relevant */
+	__u32 address_type;
+	/** @pad: MBZ */
+	__u32 pad;
+	/** @reserved: MBZ */
+	__u64 reserved[3];
+};
+
+struct drm_xe_faults {
+	/** @num_faults: Number of faults observed on the VM */
+	__u32 num_faults;
+	/** @num_bans: Number of bans observed on the VM */
+	__u32 num_bans;
+	/** @reserved: MBZ */
+	__u64 reserved[2];
+	/** @list: Dynamic sized array of drm_xe_ban bans */
+	struct drm_xe_ban list[];
+};
+
+/**
+ * struct drm_xe_vm_get_property - Input of &DRM_IOCTL_XE_VM_GET_PROPERTY
+ *
+ * The user provides a VM ID and a property to query to this ioctl,
+ * and the ioctl returns the size of the return value.  Calling the
+ * ioctl again with memory reserved for the data will save the
+ * requested property data to the data pointer.
+ *
+ * The valid properties are:
+ *  - %DRM_XE_VM_GET_PROPERTY_FAULTS : Property is a drm_xe_faults struct of dynamic size
+ *  - %DRM_XE_VM_GET_PROPERTY_NUM_RESETS: Property is a scalar
+ */
+struct drm_xe_vm_get_property {
+	/** @extensions: Pointer to the first extension struct, if any */
+	__u64 extensions;
+
+	/** @vm_id: The ID of the VM to query the properties of */
+	__u32 vm_id;
+
+#define DRM_XE_VM_GET_PROPERTY_FAULTS		0
+#define DRM_XE_VM_GET_PROPERTY_NUM_RESETS	1
+	/** @property: The property to get */
+	__u32 property;
+
+	/** @size: Size of returned property @data */
+	__u32 size;
+
+	/** @pad: MBZ */
+	__u32 pad;
+
+	/** @reserved: MBZ */
+	__u64 reserved[2];
+
+	/** @data: Pointer storing return data */
+	__u64 data;
+};
+
 /**
  * struct drm_xe_exec_queue_create - Input of &DRM_IOCTL_XE_EXEC_QUEUE_CREATE
  *