diff mbox series

[v2,6/6] drm/xe/xe_query: Add support for per-drm-client reset stat querying

Message ID 20250218202426.103151-7-jonathan.cavitt@intel.com (mailing list archive)
State New
Headers show
Series drm/xe/xe_drm_client: Add per drm client reset stats | expand

Commit Message

Jonathan Cavitt Feb. 18, 2025, 8:24 p.m. UTC
Add support for userspace to query per drm client reset stats via the
query ioctl.  This includes the number of engine resets the drm client
has observed, as well as a list of up to the last 50 relevant exec
queue bans and their associated causal pagefaults (if they exist).

Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
---
 drivers/gpu/drm/xe/xe_query.c | 66 +++++++++++++++++++++++++++++++++++
 include/uapi/drm/xe_drm.h     | 50 ++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

Comments

kernel test robot Feb. 19, 2025, 4:52 a.m. UTC | #1
Hi Jonathan,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-xe/drm-xe-next]
[also build test ERROR on next-20250218]
[cannot apply to linus/master v6.14-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Cavitt/drm-xe-xe_exec_queue-Add-ID-param-to-exec-queue-struct/20250219-042528
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250218202426.103151-7-jonathan.cavitt%40intel.com
patch subject: [PATCH v2 6/6] drm/xe/xe_query: Add support for per-drm-client reset stat querying
config: xtensa-randconfig-001-20250219 (https://download.01.org/0day-ci/archive/20250219/202502191202.It7kBP8q-lkp@intel.com/config)
compiler: xtensa-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250219/202502191202.It7kBP8q-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502191202.It7kBP8q-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/gpu/drm/xe/xe_query.c: In function 'calc_reset_stats_size':
>> drivers/gpu/drm/xe/xe_query.c:749:26: error: 'struct xe_drm_client' has no member named 'blame_lock'
     749 |         spin_lock(&client->blame_lock);
         |                          ^~
>> drivers/gpu/drm/xe/xe_query.c:750:62: error: 'struct xe_drm_client' has no member named 'blame_len'
     750 |         size += sizeof(struct drm_xe_exec_queue_ban) * client->blame_len;
         |                                                              ^~
   drivers/gpu/drm/xe/xe_query.c:751:26: error: 'struct xe_drm_client' has no member named 'blame_lock'
     751 |         spin_lock(&client->blame_lock);
         |                          ^~
   drivers/gpu/drm/xe/xe_query.c: In function 'query_reset_stats':
>> drivers/gpu/drm/xe/xe_query.c:778:47: error: 'struct xe_drm_client' has no member named 'reset_count'
     778 |         resp.reset_count = atomic_read(&client->reset_count);
         |                                               ^~
   drivers/gpu/drm/xe/xe_query.c:780:26: error: 'struct xe_drm_client' has no member named 'blame_lock'
     780 |         spin_lock(&client->blame_lock);
         |                          ^~
   drivers/gpu/drm/xe/xe_query.c:781:32: error: 'struct xe_drm_client' has no member named 'blame_len'
     781 |         resp.ban_count = client->blame_len;
         |                                ^~
   In file included from include/linux/list.h:5,
                    from include/linux/smp.h:12,
                    from include/linux/sched/clock.h:5,
                    from drivers/gpu/drm/xe/xe_query.c:9:
>> drivers/gpu/drm/xe/xe_query.c:782:39: error: 'struct xe_drm_client' has no member named 'blame_list'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |                                       ^~
   include/linux/container_of.h:19:33: note: in definition of macro 'container_of'
      19 |         void *__mptr = (void *)(ptr);                                   \
         |                                 ^~~
   include/linux/list.h:612:9: note: in expansion of macro 'list_entry'
     612 |         list_entry((ptr)->next, type, member)
         |         ^~~~~~~~~~
   include/linux/list.h:770:20: note: in expansion of macro 'list_first_entry'
     770 |         for (pos = list_first_entry(head, typeof(*pos), member);        \
         |                    ^~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_query.c:782:9: note: in expansion of macro 'list_for_each_entry'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |         ^~~~~~~~~~~~~~~~~~~
   In file included from include/linux/container_of.h:5:
>> drivers/gpu/drm/xe/xe_query.c:782:39: error: 'struct xe_drm_client' has no member named 'blame_list'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |                                       ^~
   include/linux/build_bug.h:78:56: note: in definition of macro '__static_assert'
      78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
         |                                                        ^~~~
   include/linux/container_of.h:20:9: note: in expansion of macro 'static_assert'
      20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
         |         ^~~~~~~~~~~~~
   include/linux/container_of.h:20:23: note: in expansion of macro '__same_type'
      20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
         |                       ^~~~~~~~~~~
   include/linux/list.h:601:9: note: in expansion of macro 'container_of'
     601 |         container_of(ptr, type, member)
         |         ^~~~~~~~~~~~
   include/linux/list.h:612:9: note: in expansion of macro 'list_entry'
     612 |         list_entry((ptr)->next, type, member)
         |         ^~~~~~~~~~
   include/linux/list.h:770:20: note: in expansion of macro 'list_first_entry'
     770 |         for (pos = list_first_entry(head, typeof(*pos), member);        \
         |                    ^~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_query.c:782:9: note: in expansion of macro 'list_for_each_entry'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |         ^~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/xe/xe_query.c:782:39: error: 'struct xe_drm_client' has no member named 'blame_list'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |                                       ^~
   include/linux/build_bug.h:78:56: note: in definition of macro '__static_assert'
      78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
         |                                                        ^~~~
   include/linux/container_of.h:20:9: note: in expansion of macro 'static_assert'
      20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
         |         ^~~~~~~~~~~~~
   include/linux/container_of.h:21:23: note: in expansion of macro '__same_type'
      21 |                       __same_type(*(ptr), void),                        \
         |                       ^~~~~~~~~~~
   include/linux/list.h:601:9: note: in expansion of macro 'container_of'
     601 |         container_of(ptr, type, member)
         |         ^~~~~~~~~~~~
   include/linux/list.h:612:9: note: in expansion of macro 'list_entry'
     612 |         list_entry((ptr)->next, type, member)
         |         ^~~~~~~~~~
   include/linux/list.h:770:20: note: in expansion of macro 'list_first_entry'
     770 |         for (pos = list_first_entry(head, typeof(*pos), member);        \
         |                    ^~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_query.c:782:9: note: in expansion of macro 'list_for_each_entry'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |         ^~~~~~~~~~~~~~~~~~~
   include/linux/compiler_types.h:483:27: error: expression in static assertion is not an integer
     483 | #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
         |                           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/build_bug.h:78:56: note: in definition of macro '__static_assert'
      78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
         |                                                        ^~~~
   include/linux/container_of.h:20:9: note: in expansion of macro 'static_assert'
      20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
         |         ^~~~~~~~~~~~~
   include/linux/container_of.h:20:23: note: in expansion of macro '__same_type'
      20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
         |                       ^~~~~~~~~~~
   include/linux/list.h:601:9: note: in expansion of macro 'container_of'
     601 |         container_of(ptr, type, member)
         |         ^~~~~~~~~~~~
   include/linux/list.h:612:9: note: in expansion of macro 'list_entry'
     612 |         list_entry((ptr)->next, type, member)
         |         ^~~~~~~~~~
   include/linux/list.h:770:20: note: in expansion of macro 'list_first_entry'
     770 |         for (pos = list_first_entry(head, typeof(*pos), member);        \
         |                    ^~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_query.c:782:9: note: in expansion of macro 'list_for_each_entry'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |         ^~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/xe/xe_query.c:782:39: error: 'struct xe_drm_client' has no member named 'blame_list'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |                                       ^~
   include/linux/list.h:761:37: note: in definition of macro 'list_entry_is_head'
     761 |         list_is_head(&pos->member, (head))
         |                                     ^~~~
   drivers/gpu/drm/xe/xe_query.c:782:9: note: in expansion of macro 'list_for_each_entry'
     782 |         list_for_each_entry(b, &client->blame_list, list) {
         |         ^~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/xe/xe_query.c:800:28: error: 'struct xe_drm_client' has no member named 'blame_lock'
     800 |         spin_unlock(&client->blame_lock);
         |                            ^~


vim +749 drivers/gpu/drm/xe/xe_query.c

   744	
   745	static size_t calc_reset_stats_size(struct xe_drm_client *client)
   746	{
   747		size_t size = sizeof(struct drm_xe_query_reset_stats);
   748	
 > 749		spin_lock(&client->blame_lock);
 > 750		size += sizeof(struct drm_xe_exec_queue_ban) * client->blame_len;
   751		spin_lock(&client->blame_lock);
   752	
   753		return size;
   754	}
   755	
   756	static int query_reset_stats(struct xe_device *xe,
   757				     struct drm_xe_device_query *query,
   758				     struct drm_file *file)
   759	{
   760		void __user *query_ptr = u64_to_user_ptr(query->data);
   761		struct drm_xe_query_reset_stats resp;
   762		struct xe_file *xef = to_xe_file(file);
   763		struct xe_drm_client *client = xef->client;
   764		struct blame *b;
   765		size_t size = calc_reset_stats_size(client);
   766		int i = 0;
   767	
   768		if (query->size == 0) {
   769			query->size = size;
   770			return 0;
   771		} else if (XE_IOCTL_DBG(xe, query->size != size)) {
   772			return -EINVAL;
   773		}
   774	
   775		if (copy_from_user(&resp, query_ptr, size))
   776			return -EFAULT;
   777	
 > 778		resp.reset_count = atomic_read(&client->reset_count);
   779	
   780		spin_lock(&client->blame_lock);
 > 781		resp.ban_count = client->blame_len;
 > 782		list_for_each_entry(b, &client->blame_list, list) {
   783			struct drm_xe_exec_queue_ban *ban = &resp.ban_list[i++];
   784			struct pagefault *pf = b->pf;
   785	
   786			ban->exec_queue_id = b->exec_queue_id;
   787			ban->pf_found = pf ? 1 : 0;
   788			if (!pf)
   789				continue;
   790	
   791			ban->access_type = pf->access_type;
   792			ban->fault_type = pf->fault_type;
   793			ban->vfid = pf->vfid;
   794			ban->asid = pf->asid;
   795			ban->pdata = pf->pdata;
   796			ban->engine_class = xe_to_user_engine_class[pf->engine_class];
   797			ban->engine_instance = pf->engine_instance;
   798			ban->fault_addr = pf->page_addr;
   799		}
   800		spin_unlock(&client->blame_lock);
   801	
   802		if (copy_to_user(query_ptr, &resp, size))
   803			return -EFAULT;
   804	
   805		return 0;
   806	}
   807
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index 3aad4737bfec..934ff0f4f992 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -16,10 +16,12 @@ 
 #include "regs/xe_gt_regs.h"
 #include "xe_bo.h"
 #include "xe_device.h"
+#include "xe_drm_client.h"
 #include "xe_exec_queue.h"
 #include "xe_force_wake.h"
 #include "xe_ggtt.h"
 #include "xe_gt.h"
+#include "xe_gt_pagefault.h"
 #include "xe_guc_hwconfig.h"
 #include "xe_macros.h"
 #include "xe_mmio.h"
@@ -740,6 +742,69 @@  static int query_pxp_status(struct xe_device *xe,
 	return 0;
 }
 
+/* Bytes needed for the reset stats header plus blame_len ban entries. */
+static size_t calc_reset_stats_size(struct xe_drm_client *client)
+{
+	size_t size = sizeof(struct drm_xe_query_reset_stats);
+
+	spin_lock(&client->blame_lock);
+	size += sizeof(struct drm_xe_exec_queue_ban) * client->blame_len;
+	spin_unlock(&client->blame_lock);
+	return size;
+}
+
+/* Report the client's reset count and saved exec queue bans to userspace. */
+static int query_reset_stats(struct xe_device *xe,
+			     struct drm_xe_device_query *query,
+			     struct drm_file *file)
+{
+	void __user *query_ptr = u64_to_user_ptr(query->data);
+	struct xe_drm_client *client = to_xe_file(file)->client;
+	size_t size = calc_reset_stats_size(client);
+	struct drm_xe_query_reset_stats *resp;
+	struct blame *b;
+	size_t n = 0;
+
+	if (query->size == 0) {
+		query->size = size;
+		return 0;
+	} else if (XE_IOCTL_DBG(xe, query->size != size)) {
+		return -EINVAL;
+	}
+
+	/* ban_list[] is a flexible array, so resp cannot live on the stack */
+	resp = kzalloc(size, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+	resp->reset_count = atomic_read(&client->reset_count);
+	spin_lock(&client->blame_lock);
+	list_for_each_entry(b, &client->blame_list, list) {
+		struct drm_xe_exec_queue_ban *ban = &resp->ban_list[n];
+		struct pagefault *pf = b->pf;
+
+		/* the list may have grown since size was computed */
+		if (sizeof(*resp) + (n + 1) * sizeof(*ban) > size)
+			break;
+		resp->ban_count = ++n;
+		ban->exec_queue_id = b->exec_queue_id;
+		ban->pf_found = pf ? 1 : 0;
+		if (!pf)
+			continue;
+		ban->access_type = pf->access_type;
+		ban->fault_type = pf->fault_type;
+		ban->vfid = pf->vfid;
+		ban->asid = pf->asid;
+		ban->pdata = pf->pdata;
+		ban->engine_class = xe_to_user_engine_class[pf->engine_class];
+		ban->engine_instance = pf->engine_instance;
+		ban->fault_addr = pf->page_addr;
+	}
+	spin_unlock(&client->blame_lock);
+	n = copy_to_user(query_ptr, resp, size);
+	kfree(resp);
+	return n ? -EFAULT : 0;
+}
+
 static int (* const xe_query_funcs[])(struct xe_device *xe,
 				      struct drm_xe_device_query *query,
 				      struct drm_file *file) = {
@@ -753,6 +818,7 @@  static int (* const xe_query_funcs[])(struct xe_device *xe,
 	query_uc_fw_version,
 	query_oa_units,
 	query_pxp_status,
+	query_reset_stats,
 };
 
 int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 892f54d3aa09..ffeb2a79e084 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -682,6 +682,7 @@  struct drm_xe_query_pxp_status {
  *  - %DRM_XE_DEVICE_QUERY_GT_TOPOLOGY
  *  - %DRM_XE_DEVICE_QUERY_ENGINE_CYCLES
  *  - %DRM_XE_DEVICE_QUERY_PXP_STATUS
+ *  - %DRM_XE_DEVICE_QUERY_RESET_STATS
  *
  * If size is set to 0, the driver fills it with the required size for
  * the requested type of data to query. If size is equal to the required
@@ -735,6 +736,7 @@  struct drm_xe_device_query {
 #define DRM_XE_DEVICE_QUERY_UC_FW_VERSION	7
 #define DRM_XE_DEVICE_QUERY_OA_UNITS		8
 #define DRM_XE_DEVICE_QUERY_PXP_STATUS		9
+#define DRM_XE_DEVICE_QUERY_RESET_STATS		10
 	/** @query: The type of data to query */
 	__u32 query;
 
@@ -1845,6 +1847,54 @@  enum drm_xe_pxp_session_type {
 	DRM_XE_PXP_TYPE_HWDRM = 1,
 };
 
+/**
+ * struct drm_xe_exec_queue_ban - Per drm client exec queue ban info returned
+ * from the %DRM_XE_DEVICE_QUERY_RESET_STATS query.  Includes the exec queue ID
+ * and all associated pagefault information, if relevant.
+ */
+struct drm_xe_exec_queue_ban {
+	/** @exec_queue_id: ID of banned exec queue */
+	__u32 exec_queue_id;
+	/**
+	 * @pf_found: whether or not the ban is associated with a pagefault.
+	 * If not, all pagefault data will default to 0 and will not be relevant.
+	 */
+	__u8 pf_found;
+	/** @access_type: access type of associated pagefault */
+	__u8 access_type;
+	/** @fault_type: fault type of associated pagefault */
+	__u8 fault_type;
+	/** @vfid: VFID of associated pagefault */
+	__u8 vfid;
+	/** @asid: ASID of associated pagefault */
+	__u32 asid;
+	/** @pdata: PDATA of associated pagefault */
+	__u16 pdata;
+	/** @engine_class: engine class of associated pagefault */
+	__u8 engine_class;
+	/** @engine_instance: engine instance of associated pagefault */
+	__u8 engine_instance;
+	/** @fault_addr: faulted address of associated pagefault */
+	__u64 fault_addr;
+};
+
+/**
+ * struct drm_xe_query_reset_stats - Per drm client reset stats query.
+ */
+struct drm_xe_query_reset_stats {
+	/** @extensions: Pointer to the first extension struct, if any */
+	__u64 extensions;
+	/** @reset_count: Number of times the drm client has observed an engine reset */
+	__u64 reset_count;
+	/** @ban_count: Number of exec queue bans reported in @ban_list */
+	__u64 ban_count;
+	/**
+	 * @ban_list: flexible array of struct drm_xe_exec_queue_ban, reporting all
+	 * observed exec queue bans on the drm client.
+	 */
+	struct drm_xe_exec_queue_ban ban_list[];
+};
+
 /* ID of the protected content session managed by Xe when PXP is active */
 #define DRM_XE_PXP_HWDRM_DEFAULT_SESSION 0xf