@@ -700,6 +700,7 @@ struct drm_xe_device_query {
#define DRM_XE_DEVICE_QUERY_ENGINE_CYCLES 6
#define DRM_XE_DEVICE_QUERY_UC_FW_VERSION 7
#define DRM_XE_DEVICE_QUERY_OA_UNITS 8
+#define DRM_XE_DEVICE_QUERY_RESET_STATS 10
/** @query: The type of data to query */
__u32 query;
@@ -1729,6 +1730,55 @@ struct drm_xe_oa_stream_info {
__u64 reserved[3];
};
+#define MAX_BAN_COUNT 50
+/**
+ * struct drm_xe_exec_queue_ban - Per drm client exec queue ban info returned
+ * from @DRM_XE_DEVICE_QUERY_RESET_STATS query. Includes the exec queue ID and
+ * all associated pagefault information, if relevant.
+ */
+struct drm_xe_exec_queue_ban {
+ /** @exec_queue_id: ID of banned exec queue */
+ __u32 exec_queue_id;
+ /**
+ * @pf_found: whether or not the ban is associated with a pagefault.
+ * If not, all pagefault data will default to 0 and will not be relevant.
+ */
+ __u8 pf_found;
+ /** @access_type: access type of associated pagefault */
+ __u8 access_type;
+ /** @fault_type: fault type of associated pagefault */
+ __u8 fault_type;
+ /** @vfid: VFID of associated pagefault */
+ __u8 vfid;
+ /** @asid: ASID of associated pagefault */
+ __u32 asid;
+ /** @pdata: PDATA of associated pagefault */
+ __u16 pdata;
+ /** @engine_class: engine class of associated pagefault */
+ __u8 engine_class;
+ /** @engine_instance: engine instance of associated pagefault */
+ __u8 engine_instance;
+ /** @fault_addr: faulted address of associated pagefault */
+ __u64 fault_addr;
+};
+
+/**
+ * struct drm_xe_query_reset_stats - Per drm client reset stats query.
+ */
+struct drm_xe_query_reset_stats {
+ /** @extensions: Pointer to the first extension struct, if any */
+ __u64 extensions;
+ /** @reset_count: Number of times the drm client has observed an engine reset */
+ __u64 reset_count;
+ /** @ban_count: number of exec queue bans saved by the drm client */
+ __u64 ban_count;
+ /**
+ * @ban_list: flexible array of struct drm_xe_exec_queue_ban, reporting all
+ * observed exec queue bans on the drm client.
+ */
+ struct drm_xe_exec_queue_ban ban_list[];
+};
+
#if defined(__cplusplus)
}
#endif
@@ -1077,6 +1077,342 @@ static void test_query_oa_units(int fd)
}
}
+/**
+ * The reset stats query will report -EOPNOTSUPP if the kernel is
+ * configured without CONFIG_PROC_FS. Check this before running
+ * any tests on this query.
+ */
+static bool
+query_reset_stats_supported(int fd)
+{
+ struct drm_xe_device_query query = {
+ .extensions = 0,
+ .query = DRM_XE_DEVICE_QUERY_RESET_STATS,
+ .size = 0,
+ .data = 0,
+ };
+ int ret = igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query);
+
+ if (ret)
+ igt_assert(ret == -EOPNOTSUPP);
+ return !ret;
+}
+
+/**
+ * SUBTEST: query-reset-stats
+ * Description: Display fields for reset stats query
+ *
+ * SUBTEST: multigpu-query-reset-stats
+ * Description: Display fields for reset stats query for all GPU devices
+ * Sub-category: MultiGPU
+ */
+static void test_query_reset_stats(int fd)
+{
+ struct drm_xe_query_reset_stats *qrs;
+ struct drm_xe_device_query query = {
+ .extensions = 0,
+ .query = DRM_XE_DEVICE_QUERY_RESET_STATS,
+ .size = 0,
+ .data = 0,
+ };
+ struct drm_xe_exec_queue_ban *ban;
+
+ igt_skip_on(!query_reset_stats_supported(fd));
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ qrs = malloc(query.size);
+ igt_assert(qrs);
+
+ query.data = to_user_pointer(qrs);
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ igt_info("reset count: %lld\n", qrs->reset_count);
+ igt_info("ban count: %lld\n", qrs->ban_count);
+
+ for (int i = 0; i < qrs->ban_count; i++) {
+ ban = &qrs->ban_list[i];
+
+ igt_info("-------------------------------\n");
+ igt_info("exec queue ban %d\n", i);
+ igt_info("-------------------------------\n");
+ igt_info("exec_queue_id: %d\n", ban->exec_queue_id);
+ if (!ban->pf_found) {
+ igt_info("no associated pagefault\n");
+ continue;
+ }
+ igt_info("pagefault associated:\n");
+ igt_info("\tASID: %d\n"
+ "\tVFID: %d\n"
+ "\tPDATA: 0x%04x\n"
+ "\tFaulted Address: 0x%08x%08x\n"
+ "\tFaultType: %d\n"
+ "\tAccessType: %d\n"
+ "\tEngineClass: %d %s\n"
+ "\tEngineInstance: %d\n",
+ ban->asid, ban->vfid, ban->pdata,
+ upper_32_bits(ban->fault_addr),
+ lower_32_bits(ban->fault_addr),
+ ban->fault_type, ban->access_type,
+ ban->engine_class,
+ xe_engine_class_string(ban->engine_class),
+ ban->engine_instance);
+ }
+
+ free(qrs);
+}
+
+/**
+ * SUBTEST: query-reset-stats-reset
+ * Description: Assert reset stats query tracks reset count
+ *
+ * SUBTEST: multigpu-query-reset-stats-reset
+ * Description: Assert reset stats query tracks reset count for all GPU devices
+ * Sub-category: MultiGPU
+ */
+static void
+test_query_reset_stats_reset(int fd)
+{
+ struct drm_xe_engine_class_instance *hwe;
+ struct drm_xe_query_reset_stats *qrs;
+ struct drm_xe_device_query query = {
+ .extensions = 0,
+ .query = DRM_XE_DEVICE_QUERY_RESET_STATS,
+ .size = 0,
+ .data = 0,
+ };
+ u64 resets1, resets2;
+
+ igt_skip_on(!query_reset_stats_supported(fd));
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ qrs = malloc(query.size);
+ igt_assert(qrs);
+
+ query.data = to_user_pointer(qrs);
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+ resets1 = qrs->reset_count;
+ free(qrs);
+
+ query.size = 0;
+ query.data = 0;
+
+ xe_for_each_engine(fd, hwe)
+ xe_force_gt_reset_sync(fd, hwe->gt_id);
+
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ qrs = malloc(query.size);
+ igt_assert(qrs);
+
+ query.data = to_user_pointer(qrs);
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+ resets2 = qrs->reset_count;
+ free(qrs);
+
+ igt_assert_lt(resets1, resets2);
+}
+
+static void gen_pf(int fd, struct drm_xe_engine_class_instance *eci)
+{
+ uint32_t vm;
+ uint64_t addr = 0x1a0000;
+ uint64_t sync_addr = 0x101a0000;
+#define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
+ struct drm_xe_sync sync[1] = {
+ { .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+ .timeline_value = USER_FENCE_VALUE },
+ };
+ struct drm_xe_exec exec = {
+ .num_batch_buffer = 1,
+ .num_syncs = 1,
+ .syncs = to_user_pointer(sync),
+ };
+ uint32_t exec_queues[1];
+ uint32_t bind_exec_queues[1];
+ size_t bo_size, sync_size;
+ struct {
+ uint32_t batch[16];
+ uint64_t pad;
+ uint64_t vm_sync;
+ uint32_t data;
+ } *data;
+ uint64_t *exec_sync;
+ int i, b;
+ int map_fd = -1;
+ int n_exec_queues = 1;
+ int n_execs = 64;
+
+ vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+ DRM_XE_VM_CREATE_FLAG_FAULT_MODE, 0);
+ bo_size = sizeof(*data) * n_execs;
+ bo_size = xe_bb_size(fd, bo_size);
+ sync_size = sizeof(*exec_sync) * n_execs;
+ sync_size = xe_bb_size(fd, sync_size);
+
+#define MAP_ADDRESS 0x00007fadeadbe000
+ data = mmap((void *)MAP_ADDRESS, bo_size, PROT_READ |
+ PROT_WRITE, MAP_SHARED | MAP_FIXED |
+ MAP_ANONYMOUS, -1, 0);
+ igt_assert(data != MAP_FAILED);
+ memset(data, 0, bo_size);
+
+#define EXEC_SYNC_ADDRESS 0x00007fbdeadbe000
+ exec_sync = mmap((void *)EXEC_SYNC_ADDRESS, sync_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
+ igt_assert(exec_sync != MAP_FAILED);
+ memset(exec_sync, 0, sync_size);
+
+ for (i = 0; i < n_exec_queues; i++) {
+ exec_queues[i] = xe_exec_queue_create(fd, vm, eci, 0);
+ bind_exec_queues[i] = 0;
+ }
+
+ sync[0].addr = to_user_pointer(&data[0].vm_sync);
+ xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
+ to_user_pointer(data), addr,
+ bo_size, sync, 1);
+
+ xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+ bind_exec_queues[0], NSEC_PER_SEC);
+ data[0].vm_sync = 0;
+
+ xe_vm_bind_userptr_async(fd, vm, bind_exec_queues[0],
+ to_user_pointer(exec_sync), sync_addr,
+ sync_size, sync, 1);
+ xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+ bind_exec_queues[0], NSEC_PER_SEC);
+ data[0].vm_sync = 0;
+
+ for (i = 0; i < n_execs; i++) {
+ uint64_t batch_offset = (char *)&data[i].batch - (char *)data;
+ uint64_t batch_addr = addr + batch_offset;
+ uint64_t sdi_offset = (char *)&data[i].data - (char *)data;
+ uint64_t sdi_addr = addr + sdi_offset;
+ int e = i % n_exec_queues;
+
+ b = 0;
+
+ data[i].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
+ data[i].batch[b++] = sdi_addr;
+ data[i].batch[b++] = sdi_addr >> 32;
+ data[i].batch[b++] = 0xc0ffee;
+ data[i].batch[b++] = MI_BATCH_BUFFER_END;
+ igt_assert(b <= ARRAY_SIZE(data[i].batch));
+
+ sync[0].addr = sync_addr + (char *)&exec_sync[i] - (char *)exec_sync;
+
+ exec.exec_queue_id = exec_queues[e];
+ exec.address = batch_addr;
+ xe_exec(fd, &exec);
+
+ if (i + 1 != n_execs) {
+ /*
+ * Wait for exec completion and check data as
+ * userptr will likely change to different
+ * physical memory on next mmap call triggering
+ * an invalidate.
+ */
+ xe_wait_ufence(fd, &exec_sync[i],
+ USER_FENCE_VALUE, exec_queues[e],
+ NSEC_PER_SEC);
+ igt_assert_eq(data[i].data, 0xc0ffee);
+ data = mmap((void *)MAP_ADDRESS, bo_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED |
+ MAP_FIXED | MAP_ANONYMOUS, -1, 0);
+ igt_assert(data != MAP_FAILED);
+ }
+ }
+
+ for (i = n_execs - 1; i < n_execs; i++) {
+ int64_t timeout = NSEC_PER_SEC;
+
+ igt_assert_eq(__xe_wait_ufence(fd, &exec_sync[i], USER_FENCE_VALUE,
+ exec_queues[i % n_exec_queues], &timeout), 0);
+ }
+
+ sync[0].addr = to_user_pointer(&data[0].vm_sync);
+ data[0].vm_sync = 0;
+ xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, sync_addr, sync_size,
+ sync, 1);
+ xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+ bind_exec_queues[0], NSEC_PER_SEC);
+ data[0].vm_sync = 0;
+ xe_vm_unbind_async(fd, vm, bind_exec_queues[0], 0, addr, bo_size,
+ sync, 1);
+ xe_wait_ufence(fd, &data[0].vm_sync, USER_FENCE_VALUE,
+ bind_exec_queues[0], NSEC_PER_SEC);
+
+ for (i = 0; i < n_exec_queues; i++) {
+ xe_exec_queue_destroy(fd, exec_queues[i]);
+ if (bind_exec_queues[i])
+ xe_exec_queue_destroy(fd, bind_exec_queues[i]);
+ }
+
+ munmap(exec_sync, sync_size);
+ xe_vm_destroy(fd, vm);
+ if (map_fd != -1)
+ close(map_fd);
+}
+
+/**
+ * SUBTEST: query-reset-stats-bans
+ * Description: Assert reset stats query tracks exec queue bans
+ *
+ * SUBTEST: multigpu-query-reset-stats-bans
+ * Description: Assert reset stats query tracks exec queue bans for all GPU devices
+ * Sub-category: MultiGPU
+ */
+static void
+test_query_reset_stats_bans(int fd)
+{
+ struct drm_xe_engine_class_instance *hwe;
+ struct drm_xe_query_reset_stats *qrs;
+ struct drm_xe_device_query query = {
+ .extensions = 0,
+ .query = DRM_XE_DEVICE_QUERY_RESET_STATS,
+ .size = 0,
+ .data = 0,
+ };
+ u64 bans1, bans2;
+
+ igt_skip_on(!query_reset_stats_supported(fd));
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ qrs = malloc(query.size);
+ igt_assert(qrs);
+
+ query.data = to_user_pointer(qrs);
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+ bans1 = qrs->ban_count;
+ free(qrs);
+
+ query.size = 0;
+ query.data = 0;
+
+ xe_for_each_engine(fd, hwe)
+ gen_pf(fd, hwe);
+
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+
+ qrs = malloc(query.size);
+ igt_assert(qrs);
+
+ query.data = to_user_pointer(qrs);
+ igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0);
+ bans2 = qrs->ban_count;
+ free(qrs);
+
+ /**
+ * There is a limit to the number of bans that can be saved to the
+ * ban list, so if that limit was already reached before now, assert
+ * the list did not get any smaller.
+ */
+ if (bans1 == MAX_BAN_COUNT)
+ igt_assert_eq(bans1, bans2);
+ else
+ igt_assert_lt(bans1, bans2);
+}
+
igt_main
{
const struct {
@@ -1094,6 +1430,9 @@ igt_main
{ "query-uc-fw-version-guc", test_query_uc_fw_version_guc },
{ "query-uc-fw-version-huc", test_query_uc_fw_version_huc },
{ "query-oa-units", test_query_oa_units },
+ { "query-reset-stats", test_query_reset_stats },
+ { "query-reset-stats-reset", test_query_reset_stats_reset },
+ { "query-reset-stats-bans", test_query_reset_stats_bans },
{ "query-invalid-cs-cycles", test_engine_cycles_invalid },
{ "query-invalid-query", test_query_invalid_query },
{ "query-invalid-size", test_query_invalid_size },
Add tests that exercise the xe reset stats query. The current tests simply output the result of the tests and assert that the reset and ban counters properly increment. Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com> --- include/drm-uapi/xe_drm.h | 50 ++++++ tests/intel/xe_query.c | 339 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 389 insertions(+)