diff mbox series

[v3,4/6] drm/i915/ttm: Break refcounting loops at device region unref time

Message ID 20211114111218.623138-5-thomas.hellstrom@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/ttm: Async migration | expand

Commit Message

Thomas Hellstrom Nov. 14, 2021, 11:12 a.m. UTC
There is an interesting refcounting loop:
struct intel_memory_region has a struct ttm_resource_manager,
ttm_resource_manager->move may hold a reference to i915_request,
i915_request may hold a reference to intel_context,
intel_context may hold a reference to drm_i915_gem_object,
drm_i915_gem_object may hold a reference to intel_memory_region.

Break this loop when we drop the device reference count on the
region by putting the region move fence.

Also hold dropping the device reference count until all objects of
the region has been deleted, to avoid issues if proceeding with the
device takedown while the region is still present.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c     |  1 +
 drivers/gpu/drm/i915/gt/intel_region_lmem.c |  1 +
 drivers/gpu/drm/i915/intel_memory_region.c  |  5 +++-
 drivers/gpu/drm/i915/intel_memory_region.h  |  1 +
 drivers/gpu/drm/i915/intel_region_ttm.c     | 28 +++++++++++++++++++++
 drivers/gpu/drm/i915/intel_region_ttm.h     |  2 ++
 6 files changed, 37 insertions(+), 1 deletion(-)

Comments

Matthew Auld Nov. 15, 2021, 10:49 a.m. UTC | #1
On 14/11/2021 11:12, Thomas Hellström wrote:
> There is an interesting refcounting loop:
> struct intel_memory_region has a struct ttm_resource_manager,
> ttm_resource_manager->move may hold a reference to i915_request,
> i915_request may hold a reference to intel_context,
> intel_context may hold a reference to drm_i915_gem_object,
> drm_i915_gem_object may hold a reference to intel_memory_region.

Would it help if we drop the per object region refcoutning? IIRC that 
was originally added to more cleanly appease some selftest teardown or 
something.

> 
> Break this loop when we drop the device reference count on the
> region by putting the region move fence.
> 
> Also hold dropping the device reference count until all objects of
> the region has been deleted, to avoid issues if proceeding with the
> device takedown while the region is still present.
> 
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c     |  1 +
>   drivers/gpu/drm/i915/gt/intel_region_lmem.c |  1 +
>   drivers/gpu/drm/i915/intel_memory_region.c  |  5 +++-
>   drivers/gpu/drm/i915/intel_memory_region.h  |  1 +
>   drivers/gpu/drm/i915/intel_region_ttm.c     | 28 +++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_region_ttm.h     |  2 ++
>   6 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 537a81445b90..a1df49378a0f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1044,6 +1044,7 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>   
>   static const struct intel_memory_region_ops ttm_system_region_ops = {
>   	.init_object = __i915_gem_ttm_object_init,
> +	.disable = intel_region_ttm_disable,
>   };
>   
>   struct intel_memory_region *
> diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> index aec838ecb2ef..956916fd21f8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
> @@ -108,6 +108,7 @@ region_lmem_init(struct intel_memory_region *mem)
>   static const struct intel_memory_region_ops intel_region_lmem_ops = {
>   	.init = region_lmem_init,
>   	.release = region_lmem_release,
> +	.disable = intel_region_ttm_disable,
>   	.init_object = __i915_gem_ttm_object_init,
>   };
>   
> diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c
> index e7f7e6627750..1f67d2b68c24 100644
> --- a/drivers/gpu/drm/i915/intel_memory_region.c
> +++ b/drivers/gpu/drm/i915/intel_memory_region.c
> @@ -233,8 +233,11 @@ void intel_memory_regions_driver_release(struct drm_i915_private *i915)
>   		struct intel_memory_region *region =
>   			fetch_and_zero(&i915->mm.regions[i]);
>   
> -		if (region)
> +		if (region) {
> +			if (region->ops->disable)
> +				region->ops->disable(region);
>   			intel_memory_region_put(region);
> +		}
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h
> index 3feae3353d33..9bb77eacd206 100644
> --- a/drivers/gpu/drm/i915/intel_memory_region.h
> +++ b/drivers/gpu/drm/i915/intel_memory_region.h
> @@ -52,6 +52,7 @@ struct intel_memory_region_ops {
>   
>   	int (*init)(struct intel_memory_region *mem);
>   	void (*release)(struct intel_memory_region *mem);
> +	void (*disable)(struct intel_memory_region *mem);
>   
>   	int (*init_object)(struct intel_memory_region *mem,
>   			   struct drm_i915_gem_object *obj,
> diff --git a/drivers/gpu/drm/i915/intel_region_ttm.c b/drivers/gpu/drm/i915/intel_region_ttm.c
> index 2e901a27e259..4219d83a2b19 100644
> --- a/drivers/gpu/drm/i915/intel_region_ttm.c
> +++ b/drivers/gpu/drm/i915/intel_region_ttm.c
> @@ -114,6 +114,34 @@ void intel_region_ttm_fini(struct intel_memory_region *mem)
>   	mem->region_private = NULL;
>   }
>   
> +/**
> + * intel_region_ttm_disable - A TTM region disable callback helper
> + * @mem: The memory region.
> + *
> + * A helper that ensures that nothing any longer references a region at
> + * device takedown. Breaks refcounting loops and waits for objects in the
> + * region to be deleted.
> + */
> +void intel_region_ttm_disable(struct intel_memory_region *mem)
> +{
> +	struct ttm_resource_manager *man = mem->region_private;
> +
> +	/*
> +	 * Put the region's move fences. This releases requests that
> +	 * may hold on to contexts and vms that may hold on to buffer
> +	 * objects that may have a refcount on the region. :/
> +	 */
> +	if (man)
> +		ttm_resource_manager_cleanup(man);
> +
> +	/* Flush objects that may just have been freed */
> +	i915_gem_flush_free_objects(mem->i915);
> +
> +	/* Wait until the only region reference left is our own. */
> +	while (kref_read(&mem->kref) > 1)
> +		msleep(20);

If we leak an object, I guess we get an infinite loop here at driver 
release?

> +}
> +
>   /**
>    * intel_region_ttm_resource_to_rsgt -
>    * Convert an opaque TTM resource manager resource to a refcounted sg_table.
> diff --git a/drivers/gpu/drm/i915/intel_region_ttm.h b/drivers/gpu/drm/i915/intel_region_ttm.h
> index 7bbe2b46b504..197a8c179370 100644
> --- a/drivers/gpu/drm/i915/intel_region_ttm.h
> +++ b/drivers/gpu/drm/i915/intel_region_ttm.h
> @@ -22,6 +22,8 @@ int intel_region_ttm_init(struct intel_memory_region *mem);
>   
>   void intel_region_ttm_fini(struct intel_memory_region *mem);
>   
> +void intel_region_ttm_disable(struct intel_memory_region *mem);
> +
>   struct i915_refct_sgt *
>   intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem,
>   				  struct ttm_resource *res);
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 537a81445b90..a1df49378a0f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -1044,6 +1044,7 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 
 static const struct intel_memory_region_ops ttm_system_region_ops = {
 	.init_object = __i915_gem_ttm_object_init,
+	.disable = intel_region_ttm_disable,
 };
 
 struct intel_memory_region *
diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
index aec838ecb2ef..956916fd21f8 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -108,6 +108,7 @@  region_lmem_init(struct intel_memory_region *mem)
 static const struct intel_memory_region_ops intel_region_lmem_ops = {
 	.init = region_lmem_init,
 	.release = region_lmem_release,
+	.disable = intel_region_ttm_disable,
 	.init_object = __i915_gem_ttm_object_init,
 };
 
diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c
index e7f7e6627750..1f67d2b68c24 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -233,8 +233,11 @@  void intel_memory_regions_driver_release(struct drm_i915_private *i915)
 		struct intel_memory_region *region =
 			fetch_and_zero(&i915->mm.regions[i]);
 
-		if (region)
+		if (region) {
+			if (region->ops->disable)
+				region->ops->disable(region);
 			intel_memory_region_put(region);
+		}
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h
index 3feae3353d33..9bb77eacd206 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.h
+++ b/drivers/gpu/drm/i915/intel_memory_region.h
@@ -52,6 +52,7 @@  struct intel_memory_region_ops {
 
 	int (*init)(struct intel_memory_region *mem);
 	void (*release)(struct intel_memory_region *mem);
+	void (*disable)(struct intel_memory_region *mem);
 
 	int (*init_object)(struct intel_memory_region *mem,
 			   struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/intel_region_ttm.c b/drivers/gpu/drm/i915/intel_region_ttm.c
index 2e901a27e259..4219d83a2b19 100644
--- a/drivers/gpu/drm/i915/intel_region_ttm.c
+++ b/drivers/gpu/drm/i915/intel_region_ttm.c
@@ -114,6 +114,34 @@  void intel_region_ttm_fini(struct intel_memory_region *mem)
 	mem->region_private = NULL;
 }
 
+/**
+ * intel_region_ttm_disable - A TTM region disable callback helper
+ * @mem: The memory region.
+ *
+ * A helper that ensures that nothing any longer references a region at
+ * device takedown. Breaks refcounting loops and waits for objects in the
+ * region to be deleted.
+ */
+void intel_region_ttm_disable(struct intel_memory_region *mem)
+{
+	struct ttm_resource_manager *man = mem->region_private;
+
+	/*
+	 * Put the region's move fences. This releases requests that
+	 * may hold on to contexts and vms that may hold on to buffer
+	 * objects that may have a refcount on the region. :/
+	 */
+	if (man)
+		ttm_resource_manager_cleanup(man);
+
+	/* Flush objects that may just have been freed */
+	i915_gem_flush_free_objects(mem->i915);
+
+	/* Wait until the only region reference left is our own. */
+	while (kref_read(&mem->kref) > 1)
+		msleep(20);
+}
+
 /**
  * intel_region_ttm_resource_to_rsgt -
  * Convert an opaque TTM resource manager resource to a refcounted sg_table.
diff --git a/drivers/gpu/drm/i915/intel_region_ttm.h b/drivers/gpu/drm/i915/intel_region_ttm.h
index 7bbe2b46b504..197a8c179370 100644
--- a/drivers/gpu/drm/i915/intel_region_ttm.h
+++ b/drivers/gpu/drm/i915/intel_region_ttm.h
@@ -22,6 +22,8 @@  int intel_region_ttm_init(struct intel_memory_region *mem);
 
 void intel_region_ttm_fini(struct intel_memory_region *mem);
 
+void intel_region_ttm_disable(struct intel_memory_region *mem);
+
 struct i915_refct_sgt *
 intel_region_ttm_resource_to_rsgt(struct intel_memory_region *mem,
 				  struct ttm_resource *res);