diff mbox series

[v3,1/2] drm: Add GPU reset sysfs event

Message ID 20221125175203.52481-2-andrealmeid@igalia.com (mailing list archive)
State New, archived
Headers show
Series drm: Add GPU reset sysfs | expand

Commit Message

André Almeida Nov. 25, 2022, 5:52 p.m. UTC
From: Shashank Sharma <shashank.sharma@amd.com>

Add a sysfs event to notify userspace about GPU resets providing:
- PID that triggered the GPU reset, if any. Resets can happen from
  kernel threads as well, in that case no PID is provided
- Information about the reset (e.g. was VRAM lost?)

Co-developed-by: André Almeida <andrealmeid@igalia.com>
Signed-off-by: André Almeida <andrealmeid@igalia.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---

V3:
   - Reduce information to just PID and flags
   - Use pid pointer instead of just pid number
   - BUG() if no reset info is provided

V2:
   - Addressed review comments from Christian and Amar
   - move the reset information structure to DRM layer
   - drop _ctx from struct name
   - make pid 32 bit(than 64)
   - set flag when VRAM invalid (than valid)
   - add process name as well (Amar)
---
 drivers/gpu/drm/drm_sysfs.c | 26 ++++++++++++++++++++++++++
 include/drm/drm_sysfs.h     | 13 +++++++++++++
 2 files changed, 39 insertions(+)

Comments

Pekka Paalanen Nov. 28, 2022, 9:27 a.m. UTC | #1
On Fri, 25 Nov 2022 14:52:02 -0300
André Almeida <andrealmeid@igalia.com> wrote:

> From: Shashank Sharma <shashank.sharma@amd.com>
> 
> Add a sysfs event to notify userspace about GPU resets providing:
> - PID that triggered the GPU reset, if any. Resets can happen from
>   kernel threads as well, in that case no PID is provided
> - Information about the reset (e.g. was VRAM lost?)
> 
> Co-developed-by: André Almeida <andrealmeid@igalia.com>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
> 
> V3:
>    - Reduce information to just PID and flags
>    - Use pid pointer instead of just pid number
>    - BUG() if no reset info is provided
> 
> V2:
>    - Addressed review comments from Christian and Amar
>    - move the reset information structure to DRM layer
>    - drop _ctx from struct name
>    - make pid 32 bit(than 64)
>    - set flag when VRAM invalid (than valid)
>    - add process name as well (Amar)
> ---
>  drivers/gpu/drm/drm_sysfs.c | 26 ++++++++++++++++++++++++++
>  include/drm/drm_sysfs.h     | 13 +++++++++++++
>  2 files changed, 39 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..85777abf4194 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,32 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>  }
>  EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>  
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @reset_info: The contextual information about the reset (like PID, flags)
> + *
> + * Send a uevent for the DRM device specified by @dev. This informs
> + * user that a GPU reset has occurred, so that an interested client
> + * can take any recovery or profiling measure.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info)
> +{
> +	unsigned char pid_str[13];

Hi,

"PID=4111222333\0"

I count 15 bytes instead of 13?

> +	unsigned char flags_str[18];
> +	unsigned char reset_str[] = "RESET=1";
> +	char *envp[] = { reset_str, pid_str, flags_str, NULL };

It seems you always send PID attribute, even if it's nonsense (I guess
zero). Should sending nonsense be avoided?

> +
> +	DRM_DEBUG("generating reset event\n");
> +
> +	BUG_ON(!reset_info);
> +
> +	snprintf(pid_str, sizeof(pid_str), "PID=%u", pid_vnr(reset_info->pid));

Passing PID by number is racy, but I suppose it has two rationales:
- there is no way to pass a pidfd?
- it's safe enough because the kernel avoids aggressive PID re-use?

Maybe this would be good to note in the commit message to justify the
design.

What about pid namespaces, are they handled by pid_vnr() auto-magically
somehow? Does it mean that the daemon handling these events *must not*
be running under a (non-root?) pid namespace to work at all?

E.g. if you have a container that is given a dedicated GPU device, I
guess it might be reasonable to want to run the daemon inside that
container as well. I have no idea how pid namespaces work, but I recall
hearing they are a thing.

> +	snprintf(flags_str, sizeof(flags_str), "FLAGS=0x%llx", reset_info->flags);
> +	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>  /**
>   * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>   * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..dbb0ac6230b8 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,15 +2,28 @@
>  #ifndef _DRM_SYSFS_H_
>  #define _DRM_SYSFS_H_
>  
> +#define DRM_RESET_EVENT_VRAM_LOST (1 << 0)

Since flags are UAPI, they should be documented somewhere in UAPI docs.

Shouldn't this whole event be documented somewhere in UAPI docs to say
what it means and what attributes it may have and what they mean?


Thanks,
pq

> +
>  struct drm_device;
>  struct device;
>  struct drm_connector;
>  struct drm_property;
>  
> +/**
> + * struct drm_reset_event_info - Information about a GPU reset event
> + * @pid: Process that triggered the reset, if any
> + * @flags: Extra information around the reset event (e.g. is VRAM lost?)
> + */
> +struct drm_reset_event_info {
> +	struct pid *pid;
> +	uint64_t flags;
> +};
> +
>  int drm_class_device_register(struct device *dev);
>  void drm_class_device_unregister(struct device *dev);
>  
>  void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info);
>  void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>  void drm_sysfs_connector_status_event(struct drm_connector *connector,
>  				      struct drm_property *property);
Alex Deucher Nov. 29, 2022, 2:07 p.m. UTC | #2
On Fri, Nov 25, 2022 at 12:52 PM André Almeida <andrealmeid@igalia.com> wrote:
>
> From: Shashank Sharma <shashank.sharma@amd.com>
>
> Add a sysfs event to notify userspace about GPU resets providing:
> - PID that triggered the GPU reset, if any. Resets can happen from
>   kernel threads as well, in that case no PID is provided
> - Information about the reset (e.g. was VRAM lost?)
>
> Co-developed-by: André Almeida <andrealmeid@igalia.com>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
> ---
>
> V3:
>    - Reduce information to just PID and flags
>    - Use pid pointer instead of just pid number
>    - BUG() if no reset info is provided
>
> V2:
>    - Addressed review comments from Christian and Amar
>    - move the reset information structure to DRM layer
>    - drop _ctx from struct name
>    - make pid 32 bit(than 64)
>    - set flag when VRAM invalid (than valid)
>    - add process name as well (Amar)
> ---
>  drivers/gpu/drm/drm_sysfs.c | 26 ++++++++++++++++++++++++++
>  include/drm/drm_sysfs.h     | 13 +++++++++++++
>  2 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
> index 430e00b16eec..85777abf4194 100644
> --- a/drivers/gpu/drm/drm_sysfs.c
> +++ b/drivers/gpu/drm/drm_sysfs.c
> @@ -409,6 +409,32 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
>  }
>  EXPORT_SYMBOL(drm_sysfs_hotplug_event);
>
> +/**
> + * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
> + * @dev: DRM device
> + * @reset_info: The contextual information about the reset (like PID, flags)
> + *
> + * Send a uevent for the DRM device specified by @dev. This informs
> + * user that a GPU reset has occurred, so that an interested client
> + * can take any recovery or profiling measure.
> + */
> +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info)
> +{
> +       unsigned char pid_str[13];
> +       unsigned char flags_str[18];
> +       unsigned char reset_str[] = "RESET=1";
> +       char *envp[] = { reset_str, pid_str, flags_str, NULL };
> +
> +       DRM_DEBUG("generating reset event\n");
> +
> +       BUG_ON(!reset_info);
> +
> +       snprintf(pid_str, sizeof(pid_str), "PID=%u", pid_vnr(reset_info->pid));
> +       snprintf(flags_str, sizeof(flags_str), "FLAGS=0x%llx", reset_info->flags);
> +       kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
> +}
> +EXPORT_SYMBOL(drm_sysfs_reset_event);
> +
>  /**
>   * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
>   * change
> diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
> index 6273cac44e47..dbb0ac6230b8 100644
> --- a/include/drm/drm_sysfs.h
> +++ b/include/drm/drm_sysfs.h
> @@ -2,15 +2,28 @@
>  #ifndef _DRM_SYSFS_H_
>  #define _DRM_SYSFS_H_
>
> +#define DRM_RESET_EVENT_VRAM_LOST (1 << 0)

I was thinking about this a bit more last night, and I think we should add:
DRM_RESET_EVENT_APP_ROBUSTNESS
When an application that supports robustness extensions starts, the
UMD can set a flag when it creates the context with the KMD.  That way
if the app causes a GPU hang, the reset daemon would see this flag if
the guilty app supports robustness and adjust it's behavior as
appropriate.  E.g., rather than killing the app, it might let it run
or set some grace period, etc.

Alex


> +
>  struct drm_device;
>  struct device;
>  struct drm_connector;
>  struct drm_property;
>
> +/**
> + * struct drm_reset_event_info - Information about a GPU reset event
> + * @pid: Process that triggered the reset, if any
> + * @flags: Extra information around the reset event (e.g. is VRAM lost?)
> + */
> +struct drm_reset_event_info {
> +       struct pid *pid;
> +       uint64_t flags;
> +};
> +
>  int drm_class_device_register(struct device *dev);
>  void drm_class_device_unregister(struct device *dev);
>
>  void drm_sysfs_hotplug_event(struct drm_device *dev);
> +void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info);
>  void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
>  void drm_sysfs_connector_status_event(struct drm_connector *connector,
>                                       struct drm_property *property);
> --
> 2.38.1
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 430e00b16eec..85777abf4194 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -409,6 +409,32 @@  void drm_sysfs_hotplug_event(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_sysfs_hotplug_event);
 
+/**
+ * drm_sysfs_reset_event - generate a DRM uevent to indicate GPU reset
+ * @dev: DRM device
+ * @reset_info: The contextual information about the reset (like PID, flags)
+ *
+ * Send a uevent for the DRM device specified by @dev. This informs
+ * user that a GPU reset has occurred, so that an interested client
+ * can take any recovery or profiling measure.
+ */
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info)
+{
+	unsigned char pid_str[13];
+	unsigned char flags_str[18];
+	unsigned char reset_str[] = "RESET=1";
+	char *envp[] = { reset_str, pid_str, flags_str, NULL };
+
+	DRM_DEBUG("generating reset event\n");
+
+	BUG_ON(!reset_info);
+
+	snprintf(pid_str, sizeof(pid_str), "PID=%u", pid_vnr(reset_info->pid));
+	snprintf(flags_str, sizeof(flags_str), "FLAGS=0x%llx", reset_info->flags);
+	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(drm_sysfs_reset_event);
+
 /**
  * drm_sysfs_connector_hotplug_event - generate a DRM uevent for any connector
  * change
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
index 6273cac44e47..dbb0ac6230b8 100644
--- a/include/drm/drm_sysfs.h
+++ b/include/drm/drm_sysfs.h
@@ -2,15 +2,28 @@ 
 #ifndef _DRM_SYSFS_H_
 #define _DRM_SYSFS_H_
 
+#define DRM_RESET_EVENT_VRAM_LOST (1 << 0)
+
 struct drm_device;
 struct device;
 struct drm_connector;
 struct drm_property;
 
+/**
+ * struct drm_reset_event_info - Information about a GPU reset event
+ * @pid: Process that triggered the reset, if any
+ * @flags: Extra information around the reset event (e.g. is VRAM lost?)
+ */
+struct drm_reset_event_info {
+	struct pid *pid;
+	uint64_t flags;
+};
+
 int drm_class_device_register(struct device *dev);
 void drm_class_device_unregister(struct device *dev);
 
 void drm_sysfs_hotplug_event(struct drm_device *dev);
+void drm_sysfs_reset_event(struct drm_device *dev, struct drm_reset_event_info *reset_info);
 void drm_sysfs_connector_hotplug_event(struct drm_connector *connector);
 void drm_sysfs_connector_status_event(struct drm_connector *connector,
 				      struct drm_property *property);