diff mbox series

[v3,2/4] drm/i915/ttm: Adjust gem flags and caching settings after a move

Message ID 20210614095230.126284-3-thomas.hellstrom@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Move system memory to TTM for discrete | expand

Commit Message

Thomas Hellstrom June 14, 2021, 9:52 a.m. UTC
After a TTM move or object init we need to update the i915 gem flags and
caching settings to reflect the new placement. Currently caching settings
are not changed during the lifetime of an object, although that might
change moving forward if we run into performance issues or issues with
WC system page allocations.
Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
various ways we previously used to detect this.
Finally, initialize the TTM object reserved to be able to update
flags and caching before anyone else gets hold of the object.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
v2:
- Style fixes (Reported by Matthew Auld)
v3:
- More style fixes. Clarify why we're updating caching settings after move.
  (Suggested by Matthew Auld)
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
 1 file changed, 89 insertions(+), 22 deletions(-)

Comments

Matthew Auld June 14, 2021, 10:20 a.m. UTC | #1
On Mon, 14 Jun 2021 at 10:53, Thomas Hellström
<thomas.hellstrom@linux.intel.com> wrote:
>
> After a TTM move or object init we need to update the i915 gem flags and
> caching settings to reflect the new placement. Currently caching settings
> are not changed during the lifetime of an object, although that might
> change moving forward if we run into performance issues or issues with
> WC system page allocations.
> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
> various ways we previously used to detect this.
> Finally, initialize the TTM object reserved to be able to update
> flags and caching before anyone else gets hold of the object.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
> v2:
> - Style fixes (Reported by Matthew Auld)
> v3:
> - More style fixes. Clarify why we're updating caching settings after move.
>   (Suggested by Matthew Auld)
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
>  1 file changed, 89 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 33ab47f1e05b..5176682a7d19 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -70,6 +70,17 @@ static struct ttm_placement i915_sys_placement = {
>         .busy_placement = &lmem0_sys_placement_flags[1],
>  };
>
> +static bool gpu_binds_iomem(struct ttm_resource *mem)
> +{
> +       return mem->mem_type != TTM_PL_SYSTEM;
> +}
> +
> +static bool cpu_maps_iomem(struct ttm_resource *mem)
> +{
> +       /* Once / if we support GGTT, this is also false for cached ttm_tts */
> +       return mem->mem_type != TTM_PL_SYSTEM;
> +}
> +
>  static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
>
>  static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
> @@ -175,6 +186,40 @@ static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
>         obj->ttm.cached_io_st = NULL;
>  }
>
> +static void
> +i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
> +{
> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> +
> +       if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
> +               obj->write_domain = I915_GEM_DOMAIN_WC;
> +               obj->read_domains = I915_GEM_DOMAIN_WC;
> +       } else {
> +               obj->write_domain = I915_GEM_DOMAIN_CPU;
> +               obj->read_domains = I915_GEM_DOMAIN_CPU;
> +       }
> +}
> +
> +static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
> +{
> +       struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> +       unsigned int cache_level;
> +
> +       obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
> +
> +       obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
> +               I915_BO_FLAG_STRUCT_PAGE;
> +
> +       if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
> +           bo->ttm->caching == ttm_cached)
> +               cache_level = I915_CACHE_LLC;
> +       else
> +               cache_level = I915_CACHE_NONE;
> +
> +       i915_gem_object_set_cache_coherency(obj, cache_level);
> +}
> +
>  static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>  {
>         struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> @@ -190,8 +235,10 @@ static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>
>         /* TTM's purge interface. Note that we might be reentering. */
>         ret = ttm_bo_validate(bo, &place, &ctx);
> -
>         if (!ret) {
> +               obj->write_domain = 0;
> +               obj->read_domains = 0;
> +               i915_ttm_adjust_gem_after_move(obj);
>                 i915_ttm_free_cached_io_st(obj);
>                 obj->mm.madv = __I915_MADV_PURGED;
>         }
> @@ -273,12 +320,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
>                          struct ttm_resource *res)
>  {
>         struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> -       struct ttm_resource_manager *man =
> -               ttm_manager_type(bo->bdev, res->mem_type);
>
> -       if (man->use_tt)
> +       if (!gpu_binds_iomem(res))
>                 return i915_ttm_tt_get_st(bo->ttm);
>
> +       /*
> +        * If CPU mapping differs, we need to add the ttm_tt pages to
> +        * the resulting st. Might make sense for GGTT.
> +        */
> +       GEM_WARN_ON(!cpu_maps_iomem(res));
>         return intel_region_ttm_node_to_st(obj->mm.region, res);
>  }
>
> @@ -290,8 +340,6 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>         struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
>         struct ttm_resource_manager *dst_man =
>                 ttm_manager_type(bo->bdev, dst_mem->mem_type);
> -       struct ttm_resource_manager *src_man =
> -               ttm_manager_type(bo->bdev, bo->resource->mem_type);
>         struct intel_memory_region *dst_reg, *src_reg;
>         union {
>                 struct ttm_kmap_iter_tt tt;
> @@ -332,34 +380,36 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>         if (IS_ERR(dst_st))
>                 return PTR_ERR(dst_st);
>
> -       /* If we start mapping GGTT, we can no longer use man::use_tt here. */
> -       dst_iter = dst_man->use_tt ?
> +       dst_iter = !cpu_maps_iomem(dst_mem) ?
>                 ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
>                 ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
>                                          dst_st, dst_reg->region.start);
>
> -       src_iter = src_man->use_tt ?
> +       src_iter = !cpu_maps_iomem(bo->resource) ?
>                 ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
>                 ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
>                                          obj->ttm.cached_io_st,
>                                          src_reg->region.start);
>
>         ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
> +       /* Below dst_mem becomes bo->resource. */
>         ttm_bo_move_sync_cleanup(bo, dst_mem);
> +       i915_ttm_adjust_domains_after_cpu_move(obj);
>         i915_ttm_free_cached_io_st(obj);
>
> -       if (!dst_man->use_tt) {
> +       if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
>                 obj->ttm.cached_io_st = dst_st;
>                 obj->ttm.get_io_page.sg_pos = dst_st->sgl;
>                 obj->ttm.get_io_page.sg_idx = 0;
>         }
>
> +       i915_ttm_adjust_gem_after_move(obj);
>         return 0;
>  }
>
>  static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
>  {
> -       if (mem->mem_type < I915_PL_LMEM0)
> +       if (!cpu_maps_iomem(mem))
>                 return 0;
>
>         mem->bus.caching = ttm_write_combined;
> @@ -421,6 +471,16 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>         if (ret)
>                 return ret == -ENOSPC ? -ENXIO : ret;
>
> +       i915_ttm_adjust_lru(obj);
> +       if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
> +               ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
> +               if (ret)
> +                       return ret;
> +
> +               i915_ttm_adjust_domains_after_cpu_move(obj);
> +               i915_ttm_adjust_gem_after_move(obj);
> +       }
> +
>         /* Object either has a page vector or is an iomem object */
>         st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
>         if (IS_ERR(st))
> @@ -428,8 +488,6 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>
>         __i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
>
> -       i915_ttm_adjust_lru(obj);
> -
>         return ret;
>  }
>
> @@ -563,6 +621,7 @@ static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
>
>  const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
>         .name = "i915_gem_object_ttm",
> +       .flags = I915_GEM_OBJECT_IS_SHRINKABLE,

We set is_shrinkable for both lmem and smem? Does the current shrinker
work with !shmem? I assume the put_pages() will just discard the page
contents, which is not what we want? Maybe keep this disabled for now?
Or am I missing something?

>
>         .get_pages = i915_ttm_get_pages,
>         .put_pages = i915_ttm_put_pages,
> @@ -599,6 +658,10 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>  {
>         static struct lock_class_key lock_class;
>         struct drm_i915_private *i915 = mem->i915;
> +       struct ttm_operation_ctx ctx = {
> +               .interruptible = true,
> +               .no_wait_gpu = false,
> +       };
>         enum ttm_bo_type bo_type;
>         size_t alignment = 0;
>         int ret;
> @@ -618,15 +681,14 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>         i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
>         i915_gem_object_init_memory_region(obj, mem);
>         i915_gem_object_make_unshrinkable(obj);
> -       obj->read_domains = I915_GEM_DOMAIN_WC | I915_GEM_DOMAIN_GTT;
> -       obj->mem_flags |= I915_BO_FLAG_IOMEM;
> -       i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
>         INIT_RADIX_TREE(&obj->ttm.get_io_page.radix, GFP_KERNEL | __GFP_NOWARN);
>         mutex_init(&obj->ttm.get_io_page.lock);
>
>         bo_type = (obj->flags & I915_BO_ALLOC_USER) ? ttm_bo_type_device :
>                 ttm_bo_type_kernel;
>
> +       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
> +
>         /*
>          * If this function fails, it will call the destructor, but
>          * our caller still owns the object. So no freeing in the
> @@ -634,14 +696,19 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>          * Similarly, in delayed_destroy, we can't call ttm_bo_put()
>          * until successful initialization.
>          */
> -       obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
> -       ret = ttm_bo_init(&i915->bdev, i915_gem_to_ttm(obj), size,
> -                         bo_type, &i915_sys_placement, alignment,
> -                         true, NULL, NULL, i915_ttm_bo_destroy);
> +       ret = ttm_bo_init_reserved(&i915->bdev, i915_gem_to_ttm(obj), size,
> +                                  bo_type, &i915_sys_placement, alignment,
> +                                  &ctx, NULL, NULL, i915_ttm_bo_destroy);
> +
> +       if (ret)
> +               goto out;
>
> -       if (!ret)
> -               obj->ttm.created = true;
> +       obj->ttm.created = true;
> +       i915_ttm_adjust_domains_after_cpu_move(obj);
> +       i915_ttm_adjust_gem_after_move(obj);
> +       i915_gem_object_unlock(obj);
>
> +out:
>         /* i915 wants -ENXIO when out of memory region space. */
>         return (ret == -ENOSPC) ? -ENXIO : ret;
>  }
> --
> 2.31.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Thomas Hellstrom June 14, 2021, 10:32 a.m. UTC | #2
On 6/14/21 12:20 PM, Matthew Auld wrote:
> On Mon, 14 Jun 2021 at 10:53, Thomas Hellström
> <thomas.hellstrom@linux.intel.com> wrote:
>> After a TTM move or object init we need to update the i915 gem flags and
>> caching settings to reflect the new placement. Currently caching settings
>> are not changed during the lifetime of an object, although that might
>> change moving forward if we run into performance issues or issues with
>> WC system page allocations.
>> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
>> various ways we previously used to detect this.
>> Finally, initialize the TTM object reserved to be able to update
>> flags and caching before anyone else gets hold of the object.
>>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> ---
>> v2:
>> - Style fixes (Reported by Matthew Auld)
>> v3:
>> - More style fixes. Clarify why we're updating caching settings after move.
>>    (Suggested by Matthew Auld)
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
>>   1 file changed, 89 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> index 33ab47f1e05b..5176682a7d19 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> @@ -70,6 +70,17 @@ static struct ttm_placement i915_sys_placement = {
>>          .busy_placement = &lmem0_sys_placement_flags[1],
>>   };
>>
>> +static bool gpu_binds_iomem(struct ttm_resource *mem)
>> +{
>> +       return mem->mem_type != TTM_PL_SYSTEM;
>> +}
>> +
>> +static bool cpu_maps_iomem(struct ttm_resource *mem)
>> +{
>> +       /* Once / if we support GGTT, this is also false for cached ttm_tts */
>> +       return mem->mem_type != TTM_PL_SYSTEM;
>> +}
>> +
>>   static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
>>
>>   static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
>> @@ -175,6 +186,40 @@ static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
>>          obj->ttm.cached_io_st = NULL;
>>   }
>>
>> +static void
>> +i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
>> +{
>> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>> +
>> +       if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
>> +               obj->write_domain = I915_GEM_DOMAIN_WC;
>> +               obj->read_domains = I915_GEM_DOMAIN_WC;
>> +       } else {
>> +               obj->write_domain = I915_GEM_DOMAIN_CPU;
>> +               obj->read_domains = I915_GEM_DOMAIN_CPU;
>> +       }
>> +}
>> +
>> +static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
>> +{
>> +       struct drm_i915_private *i915 = to_i915(obj->base.dev);
>> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>> +       unsigned int cache_level;
>> +
>> +       obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
>> +
>> +       obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
>> +               I915_BO_FLAG_STRUCT_PAGE;
>> +
>> +       if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
>> +           bo->ttm->caching == ttm_cached)
>> +               cache_level = I915_CACHE_LLC;
>> +       else
>> +               cache_level = I915_CACHE_NONE;
>> +
>> +       i915_gem_object_set_cache_coherency(obj, cache_level);
>> +}
>> +
>>   static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>>   {
>>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>> @@ -190,8 +235,10 @@ static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>>
>>          /* TTM's purge interface. Note that we might be reentering. */
>>          ret = ttm_bo_validate(bo, &place, &ctx);
>> -
>>          if (!ret) {
>> +               obj->write_domain = 0;
>> +               obj->read_domains = 0;
>> +               i915_ttm_adjust_gem_after_move(obj);
>>                  i915_ttm_free_cached_io_st(obj);
>>                  obj->mm.madv = __I915_MADV_PURGED;
>>          }
>> @@ -273,12 +320,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
>>                           struct ttm_resource *res)
>>   {
>>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>> -       struct ttm_resource_manager *man =
>> -               ttm_manager_type(bo->bdev, res->mem_type);
>>
>> -       if (man->use_tt)
>> +       if (!gpu_binds_iomem(res))
>>                  return i915_ttm_tt_get_st(bo->ttm);
>>
>> +       /*
>> +        * If CPU mapping differs, we need to add the ttm_tt pages to
>> +        * the resulting st. Might make sense for GGTT.
>> +        */
>> +       GEM_WARN_ON(!cpu_maps_iomem(res));
>>          return intel_region_ttm_node_to_st(obj->mm.region, res);
>>   }
>>
>> @@ -290,8 +340,6 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>>          struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
>>          struct ttm_resource_manager *dst_man =
>>                  ttm_manager_type(bo->bdev, dst_mem->mem_type);
>> -       struct ttm_resource_manager *src_man =
>> -               ttm_manager_type(bo->bdev, bo->resource->mem_type);
>>          struct intel_memory_region *dst_reg, *src_reg;
>>          union {
>>                  struct ttm_kmap_iter_tt tt;
>> @@ -332,34 +380,36 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>>          if (IS_ERR(dst_st))
>>                  return PTR_ERR(dst_st);
>>
>> -       /* If we start mapping GGTT, we can no longer use man::use_tt here. */
>> -       dst_iter = dst_man->use_tt ?
>> +       dst_iter = !cpu_maps_iomem(dst_mem) ?
>>                  ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
>>                  ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
>>                                           dst_st, dst_reg->region.start);
>>
>> -       src_iter = src_man->use_tt ?
>> +       src_iter = !cpu_maps_iomem(bo->resource) ?
>>                  ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
>>                  ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
>>                                           obj->ttm.cached_io_st,
>>                                           src_reg->region.start);
>>
>>          ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
>> +       /* Below dst_mem becomes bo->resource. */
>>          ttm_bo_move_sync_cleanup(bo, dst_mem);
>> +       i915_ttm_adjust_domains_after_cpu_move(obj);
>>          i915_ttm_free_cached_io_st(obj);
>>
>> -       if (!dst_man->use_tt) {
>> +       if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
>>                  obj->ttm.cached_io_st = dst_st;
>>                  obj->ttm.get_io_page.sg_pos = dst_st->sgl;
>>                  obj->ttm.get_io_page.sg_idx = 0;
>>          }
>>
>> +       i915_ttm_adjust_gem_after_move(obj);
>>          return 0;
>>   }
>>
>>   static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
>>   {
>> -       if (mem->mem_type < I915_PL_LMEM0)
>> +       if (!cpu_maps_iomem(mem))
>>                  return 0;
>>
>>          mem->bus.caching = ttm_write_combined;
>> @@ -421,6 +471,16 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>>          if (ret)
>>                  return ret == -ENOSPC ? -ENXIO : ret;
>>
>> +       i915_ttm_adjust_lru(obj);
>> +       if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
>> +               ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               i915_ttm_adjust_domains_after_cpu_move(obj);
>> +               i915_ttm_adjust_gem_after_move(obj);
>> +       }
>> +
>>          /* Object either has a page vector or is an iomem object */
>>          st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
>>          if (IS_ERR(st))
>> @@ -428,8 +488,6 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>>
>>          __i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
>>
>> -       i915_ttm_adjust_lru(obj);
>> -
>>          return ret;
>>   }
>>
>> @@ -563,6 +621,7 @@ static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
>>
>>   const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
>>          .name = "i915_gem_object_ttm",
>> +       .flags = I915_GEM_OBJECT_IS_SHRINKABLE,
> We set is_shrinkable for both lmem and smem? Does the current shrinker
> work with !shmem? I assume the put_pages() will just discard the page
> contents, which is not what we want? Maybe keep this disabled for now?
> Or am I missing something?

This is to indicate to the gem shrinker that we want to be able to 
determine dynamically whether the object is shrinkable (by the gem 
shrinker) or not. (See below). It's intended for the situation where we 
might want to use shmem to back cached-only ttm objects, and use our own 
shrinker.

But if you think it's better, we could flip that ops flag once we 
actually have anything shrinkable in place. Let me know what you think. 
The TTM shmem pool idea becomes a bit doubtful by the fact that 
allocating some 8GB of system objects, populating and clearing them 
appears to be some 6X faster with a straightforward alloc approach (like 
TTM or the "internal" backend) than with shmem...

>>          .get_pages = i915_ttm_get_pages,
>>          .put_pages = i915_ttm_put_pages,
>> @@ -599,6 +658,10 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>   {
>>          static struct lock_class_key lock_class;
>>          struct drm_i915_private *i915 = mem->i915;
>> +       struct ttm_operation_ctx ctx = {
>> +               .interruptible = true,
>> +               .no_wait_gpu = false,
>> +       };
>>          enum ttm_bo_type bo_type;
>>          size_t alignment = 0;
>>          int ret;
>> @@ -618,15 +681,14 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>          i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
>>          i915_gem_object_init_memory_region(obj, mem);
>>          i915_gem_object_make_unshrinkable(obj);

Here we set the object to unshrinkable.

/Thomas
Matthew Auld June 14, 2021, 10:49 a.m. UTC | #3
On Mon, 14 Jun 2021 at 11:32, Thomas Hellström
<thomas.hellstrom@linux.intel.com> wrote:
>
>
> On 6/14/21 12:20 PM, Matthew Auld wrote:
> > On Mon, 14 Jun 2021 at 10:53, Thomas Hellström
> > <thomas.hellstrom@linux.intel.com> wrote:
> >> After a TTM move or object init we need to update the i915 gem flags and
> >> caching settings to reflect the new placement. Currently caching settings
> >> are not changed during the lifetime of an object, although that might
> >> change moving forward if we run into performance issues or issues with
> >> WC system page allocations.
> >> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
> >> various ways we previously used to detect this.
> >> Finally, initialize the TTM object reserved to be able to update
> >> flags and caching before anyone else gets hold of the object.
> >>
> >> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> >> ---
> >> v2:
> >> - Style fixes (Reported by Matthew Auld)
> >> v3:
> >> - More style fixes. Clarify why we're updating caching settings after move.
> >>    (Suggested by Matthew Auld)
> >> ---
> >>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
> >>   1 file changed, 89 insertions(+), 22 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> index 33ab47f1e05b..5176682a7d19 100644
> >> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >> @@ -70,6 +70,17 @@ static struct ttm_placement i915_sys_placement = {
> >>          .busy_placement = &lmem0_sys_placement_flags[1],
> >>   };
> >>
> >> +static bool gpu_binds_iomem(struct ttm_resource *mem)
> >> +{
> >> +       return mem->mem_type != TTM_PL_SYSTEM;
> >> +}
> >> +
> >> +static bool cpu_maps_iomem(struct ttm_resource *mem)
> >> +{
> >> +       /* Once / if we support GGTT, this is also false for cached ttm_tts */
> >> +       return mem->mem_type != TTM_PL_SYSTEM;
> >> +}
> >> +
> >>   static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
> >>
> >>   static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
> >> @@ -175,6 +186,40 @@ static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
> >>          obj->ttm.cached_io_st = NULL;
> >>   }
> >>
> >> +static void
> >> +i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
> >> +{
> >> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> +
> >> +       if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
> >> +               obj->write_domain = I915_GEM_DOMAIN_WC;
> >> +               obj->read_domains = I915_GEM_DOMAIN_WC;
> >> +       } else {
> >> +               obj->write_domain = I915_GEM_DOMAIN_CPU;
> >> +               obj->read_domains = I915_GEM_DOMAIN_CPU;
> >> +       }
> >> +}
> >> +
> >> +static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
> >> +{
> >> +       struct drm_i915_private *i915 = to_i915(obj->base.dev);
> >> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> +       unsigned int cache_level;
> >> +
> >> +       obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
> >> +
> >> +       obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
> >> +               I915_BO_FLAG_STRUCT_PAGE;
> >> +
> >> +       if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
> >> +           bo->ttm->caching == ttm_cached)
> >> +               cache_level = I915_CACHE_LLC;
> >> +       else
> >> +               cache_level = I915_CACHE_NONE;
> >> +
> >> +       i915_gem_object_set_cache_coherency(obj, cache_level);
> >> +}
> >> +
> >>   static void i915_ttm_purge(struct drm_i915_gem_object *obj)
> >>   {
> >>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> @@ -190,8 +235,10 @@ static void i915_ttm_purge(struct drm_i915_gem_object *obj)
> >>
> >>          /* TTM's purge interface. Note that we might be reentering. */
> >>          ret = ttm_bo_validate(bo, &place, &ctx);
> >> -
> >>          if (!ret) {
> >> +               obj->write_domain = 0;
> >> +               obj->read_domains = 0;
> >> +               i915_ttm_adjust_gem_after_move(obj);
> >>                  i915_ttm_free_cached_io_st(obj);
> >>                  obj->mm.madv = __I915_MADV_PURGED;
> >>          }
> >> @@ -273,12 +320,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
> >>                           struct ttm_resource *res)
> >>   {
> >>          struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
> >> -       struct ttm_resource_manager *man =
> >> -               ttm_manager_type(bo->bdev, res->mem_type);
> >>
> >> -       if (man->use_tt)
> >> +       if (!gpu_binds_iomem(res))
> >>                  return i915_ttm_tt_get_st(bo->ttm);
> >>
> >> +       /*
> >> +        * If CPU mapping differs, we need to add the ttm_tt pages to
> >> +        * the resulting st. Might make sense for GGTT.
> >> +        */
> >> +       GEM_WARN_ON(!cpu_maps_iomem(res));
> >>          return intel_region_ttm_node_to_st(obj->mm.region, res);
> >>   }
> >>
> >> @@ -290,8 +340,6 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
> >>          struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
> >>          struct ttm_resource_manager *dst_man =
> >>                  ttm_manager_type(bo->bdev, dst_mem->mem_type);
> >> -       struct ttm_resource_manager *src_man =
> >> -               ttm_manager_type(bo->bdev, bo->resource->mem_type);
> >>          struct intel_memory_region *dst_reg, *src_reg;
> >>          union {
> >>                  struct ttm_kmap_iter_tt tt;
> >> @@ -332,34 +380,36 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
> >>          if (IS_ERR(dst_st))
> >>                  return PTR_ERR(dst_st);
> >>
> >> -       /* If we start mapping GGTT, we can no longer use man::use_tt here. */
> >> -       dst_iter = dst_man->use_tt ?
> >> +       dst_iter = !cpu_maps_iomem(dst_mem) ?
> >>                  ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
> >>                  ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
> >>                                           dst_st, dst_reg->region.start);
> >>
> >> -       src_iter = src_man->use_tt ?
> >> +       src_iter = !cpu_maps_iomem(bo->resource) ?
> >>                  ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
> >>                  ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
> >>                                           obj->ttm.cached_io_st,
> >>                                           src_reg->region.start);
> >>
> >>          ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
> >> +       /* Below dst_mem becomes bo->resource. */
> >>          ttm_bo_move_sync_cleanup(bo, dst_mem);
> >> +       i915_ttm_adjust_domains_after_cpu_move(obj);
> >>          i915_ttm_free_cached_io_st(obj);
> >>
> >> -       if (!dst_man->use_tt) {
> >> +       if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
> >>                  obj->ttm.cached_io_st = dst_st;
> >>                  obj->ttm.get_io_page.sg_pos = dst_st->sgl;
> >>                  obj->ttm.get_io_page.sg_idx = 0;
> >>          }
> >>
> >> +       i915_ttm_adjust_gem_after_move(obj);
> >>          return 0;
> >>   }
> >>
> >>   static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
> >>   {
> >> -       if (mem->mem_type < I915_PL_LMEM0)
> >> +       if (!cpu_maps_iomem(mem))
> >>                  return 0;
> >>
> >>          mem->bus.caching = ttm_write_combined;
> >> @@ -421,6 +471,16 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
> >>          if (ret)
> >>                  return ret == -ENOSPC ? -ENXIO : ret;
> >>
> >> +       i915_ttm_adjust_lru(obj);
> >> +       if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
> >> +               ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
> >> +               if (ret)
> >> +                       return ret;
> >> +
> >> +               i915_ttm_adjust_domains_after_cpu_move(obj);
> >> +               i915_ttm_adjust_gem_after_move(obj);
> >> +       }
> >> +
> >>          /* Object either has a page vector or is an iomem object */
> >>          st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
> >>          if (IS_ERR(st))
> >> @@ -428,8 +488,6 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
> >>
> >>          __i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
> >>
> >> -       i915_ttm_adjust_lru(obj);
> >> -
> >>          return ret;
> >>   }
> >>
> >> @@ -563,6 +621,7 @@ static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
> >>
> >>   const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
> >>          .name = "i915_gem_object_ttm",
> >> +       .flags = I915_GEM_OBJECT_IS_SHRINKABLE,
> > We set is_shrinkable for both lmem and smem? Does the current shrinker
> > work with !shmem? I assume the put_pages() will just discard the page
> > contents, which is not what we want? Maybe keep this disabled for now?
> > Or am I missing something?
>
> This is to indicate to the gem shrinker that we want to be able to
> determine dynamically whether the object is shrinkable (by the gem
> shrinker) or not. (See below). It's intended for the situation where we
> might want to use shmem to back cached-only ttm objects, and use our own
> shrinker.
>
> But if you think it's better, we could flip that ops flag once we
> actually have anything shrinkable in place. Let me know what you think.
> The TTM shmem pool idea becomes a bit doubtful by the fact that
> allocating some 8GB of system objects, populating and clearing them
> appears to be some 6X faster with a straightforward alloc approach (like
> TTM or the "internal" backend) than with shmem...

I would vote for keeping it turned off for now.

>
> >>          .get_pages = i915_ttm_get_pages,
> >>          .put_pages = i915_ttm_put_pages,
> >> @@ -599,6 +658,10 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
> >>   {
> >>          static struct lock_class_key lock_class;
> >>          struct drm_i915_private *i915 = mem->i915;
> >> +       struct ttm_operation_ctx ctx = {
> >> +               .interruptible = true,
> >> +               .no_wait_gpu = false,
> >> +       };
> >>          enum ttm_bo_type bo_type;
> >>          size_t alignment = 0;
> >>          int ret;
> >> @@ -618,15 +681,14 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
> >>          i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
> >>          i915_gem_object_init_memory_region(obj, mem);
> >>          i915_gem_object_make_unshrinkable(obj);
>
> Here we set the object to unshrinkable.

Oh, I totally missed that. But I don't think we are meant to call that
here, especially with IS_SHRINKABLE, since object_set_pages will later
make it shrinkable anyway, which looks like fun and games.

>
> /Thomas
>
>
Thomas Hellstrom June 14, 2021, 11:14 a.m. UTC | #4
On 6/14/21 12:49 PM, Matthew Auld wrote:
> On Mon, 14 Jun 2021 at 11:32, Thomas Hellström
> <thomas.hellstrom@linux.intel.com> wrote:
>>
>> On 6/14/21 12:20 PM, Matthew Auld wrote:
>>> On Mon, 14 Jun 2021 at 10:53, Thomas Hellström
>>> <thomas.hellstrom@linux.intel.com> wrote:
>>>> After a TTM move or object init we need to update the i915 gem flags and
>>>> caching settings to reflect the new placement. Currently caching settings
>>>> are not changed during the lifetime of an object, although that might
>>>> change moving forward if we run into performance issues or issues with
>>>> WC system page allocations.
>>>> Also introduce gpu_binds_iomem() and cpu_maps_iomem() to clean up the
>>>> various ways we previously used to detect this.
>>>> Finally, initialize the TTM object reserved to be able to update
>>>> flags and caching before anyone else gets hold of the object.
>>>>
>>>> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>>> ---
>>>> v2:
>>>> - Style fixes (Reported by Matthew Auld)
>>>> v3:
>>>> - More style fixes. Clarify why we're updating caching settings after move.
>>>>     (Suggested by Matthew Auld)
>>>> ---
>>>>    drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 111 +++++++++++++++++++-----
>>>>    1 file changed, 89 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>>>> index 33ab47f1e05b..5176682a7d19 100644
>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>>>> @@ -70,6 +70,17 @@ static struct ttm_placement i915_sys_placement = {
>>>>           .busy_placement = &lmem0_sys_placement_flags[1],
>>>>    };
>>>>
>>>> +static bool gpu_binds_iomem(struct ttm_resource *mem)
>>>> +{
>>>> +       return mem->mem_type != TTM_PL_SYSTEM;
>>>> +}
>>>> +
>>>> +static bool cpu_maps_iomem(struct ttm_resource *mem)
>>>> +{
>>>> +       /* Once / if we support GGTT, this is also false for cached ttm_tts */
>>>> +       return mem->mem_type != TTM_PL_SYSTEM;
>>>> +}
>>>> +
>>>>    static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
>>>>
>>>>    static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
>>>> @@ -175,6 +186,40 @@ static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
>>>>           obj->ttm.cached_io_st = NULL;
>>>>    }
>>>>
>>>> +static void
>>>> +i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
>>>> +{
>>>> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>>>> +
>>>> +       if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
>>>> +               obj->write_domain = I915_GEM_DOMAIN_WC;
>>>> +               obj->read_domains = I915_GEM_DOMAIN_WC;
>>>> +       } else {
>>>> +               obj->write_domain = I915_GEM_DOMAIN_CPU;
>>>> +               obj->read_domains = I915_GEM_DOMAIN_CPU;
>>>> +       }
>>>> +}
>>>> +
>>>> +static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
>>>> +{
>>>> +       struct drm_i915_private *i915 = to_i915(obj->base.dev);
>>>> +       struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>>>> +       unsigned int cache_level;
>>>> +
>>>> +       obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
>>>> +
>>>> +       obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
>>>> +               I915_BO_FLAG_STRUCT_PAGE;
>>>> +
>>>> +       if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
>>>> +           bo->ttm->caching == ttm_cached)
>>>> +               cache_level = I915_CACHE_LLC;
>>>> +       else
>>>> +               cache_level = I915_CACHE_NONE;
>>>> +
>>>> +       i915_gem_object_set_cache_coherency(obj, cache_level);
>>>> +}
>>>> +
>>>>    static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>>>>    {
>>>>           struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>>>> @@ -190,8 +235,10 @@ static void i915_ttm_purge(struct drm_i915_gem_object *obj)
>>>>
>>>>           /* TTM's purge interface. Note that we might be reentering. */
>>>>           ret = ttm_bo_validate(bo, &place, &ctx);
>>>> -
>>>>           if (!ret) {
>>>> +               obj->write_domain = 0;
>>>> +               obj->read_domains = 0;
>>>> +               i915_ttm_adjust_gem_after_move(obj);
>>>>                   i915_ttm_free_cached_io_st(obj);
>>>>                   obj->mm.madv = __I915_MADV_PURGED;
>>>>           }
>>>> @@ -273,12 +320,15 @@ i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
>>>>                            struct ttm_resource *res)
>>>>    {
>>>>           struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
>>>> -       struct ttm_resource_manager *man =
>>>> -               ttm_manager_type(bo->bdev, res->mem_type);
>>>>
>>>> -       if (man->use_tt)
>>>> +       if (!gpu_binds_iomem(res))
>>>>                   return i915_ttm_tt_get_st(bo->ttm);
>>>>
>>>> +       /*
>>>> +        * If CPU mapping differs, we need to add the ttm_tt pages to
>>>> +        * the resulting st. Might make sense for GGTT.
>>>> +        */
>>>> +       GEM_WARN_ON(!cpu_maps_iomem(res));
>>>>           return intel_region_ttm_node_to_st(obj->mm.region, res);
>>>>    }
>>>>
>>>> @@ -290,8 +340,6 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>>>>           struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
>>>>           struct ttm_resource_manager *dst_man =
>>>>                   ttm_manager_type(bo->bdev, dst_mem->mem_type);
>>>> -       struct ttm_resource_manager *src_man =
>>>> -               ttm_manager_type(bo->bdev, bo->resource->mem_type);
>>>>           struct intel_memory_region *dst_reg, *src_reg;
>>>>           union {
>>>>                   struct ttm_kmap_iter_tt tt;
>>>> @@ -332,34 +380,36 @@ static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
>>>>           if (IS_ERR(dst_st))
>>>>                   return PTR_ERR(dst_st);
>>>>
>>>> -       /* If we start mapping GGTT, we can no longer use man::use_tt here. */
>>>> -       dst_iter = dst_man->use_tt ?
>>>> +       dst_iter = !cpu_maps_iomem(dst_mem) ?
>>>>                   ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
>>>>                   ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
>>>>                                            dst_st, dst_reg->region.start);
>>>>
>>>> -       src_iter = src_man->use_tt ?
>>>> +       src_iter = !cpu_maps_iomem(bo->resource) ?
>>>>                   ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
>>>>                   ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
>>>>                                            obj->ttm.cached_io_st,
>>>>                                            src_reg->region.start);
>>>>
>>>>           ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
>>>> +       /* Below dst_mem becomes bo->resource. */
>>>>           ttm_bo_move_sync_cleanup(bo, dst_mem);
>>>> +       i915_ttm_adjust_domains_after_cpu_move(obj);
>>>>           i915_ttm_free_cached_io_st(obj);
>>>>
>>>> -       if (!dst_man->use_tt) {
>>>> +       if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
>>>>                   obj->ttm.cached_io_st = dst_st;
>>>>                   obj->ttm.get_io_page.sg_pos = dst_st->sgl;
>>>>                   obj->ttm.get_io_page.sg_idx = 0;
>>>>           }
>>>>
>>>> +       i915_ttm_adjust_gem_after_move(obj);
>>>>           return 0;
>>>>    }
>>>>
>>>>    static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
>>>>    {
>>>> -       if (mem->mem_type < I915_PL_LMEM0)
>>>> +       if (!cpu_maps_iomem(mem))
>>>>                   return 0;
>>>>
>>>>           mem->bus.caching = ttm_write_combined;
>>>> @@ -421,6 +471,16 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>>>>           if (ret)
>>>>                   return ret == -ENOSPC ? -ENXIO : ret;
>>>>
>>>> +       i915_ttm_adjust_lru(obj);
>>>> +       if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
>>>> +               ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +
>>>> +               i915_ttm_adjust_domains_after_cpu_move(obj);
>>>> +               i915_ttm_adjust_gem_after_move(obj);
>>>> +       }
>>>> +
>>>>           /* Object either has a page vector or is an iomem object */
>>>>           st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
>>>>           if (IS_ERR(st))
>>>> @@ -428,8 +488,6 @@ static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
>>>>
>>>>           __i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
>>>>
>>>> -       i915_ttm_adjust_lru(obj);
>>>> -
>>>>           return ret;
>>>>    }
>>>>
>>>> @@ -563,6 +621,7 @@ static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
>>>>
>>>>    const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
>>>>           .name = "i915_gem_object_ttm",
>>>> +       .flags = I915_GEM_OBJECT_IS_SHRINKABLE,
>>> We set is_shrinkable for both lmem and smem? Does the current shrinker
>>> work with !shmem? I assume the put_pages() will just discard the page
>>> contents, which is not what we want? Maybe keep this disabled for now?
>>> Or am I missing something?
>> This is to indicate to the gem shrinker that we want to be able to
>> determine dynamically whether the object is shrinkable (by the gem
>> shrinker) or not. (See below). It's intended for the situation where we
>> might want to use shmem to back cached-only ttm objects, and use our own
>> shrinker.
>>
>> But if you think it's better, we could flip that ops flag once we
>> actually have anything shrinkable in place. Let me know what you think.
>> The TTM shmem pool idea becomes a bit doubtful by the fact that
>> allocating some 8GB of system objects, populating and clearing them
>> appears to be some 6X faster with a straightforward alloc approach (like
>> TTM or the "internal" backend) than with shmem...
> I would vote for keeping it turned off for now.
>
>>>>           .get_pages = i915_ttm_get_pages,
>>>>           .put_pages = i915_ttm_put_pages,
>>>> @@ -599,6 +658,10 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>>>    {
>>>>           static struct lock_class_key lock_class;
>>>>           struct drm_i915_private *i915 = mem->i915;
>>>> +       struct ttm_operation_ctx ctx = {
>>>> +               .interruptible = true,
>>>> +               .no_wait_gpu = false,
>>>> +       };
>>>>           enum ttm_bo_type bo_type;
>>>>           size_t alignment = 0;
>>>>           int ret;
>>>> @@ -618,15 +681,14 @@ int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
>>>>           i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
>>>>           i915_gem_object_init_memory_region(obj, mem);
>>>>           i915_gem_object_make_unshrinkable(obj);
>> Here we set the object to unshrinkable.
> Oh, I totally missed that. But I don't think we are meant to call that
> here, especially with IS_SHRINKABLE, since object_set_pages will later
> make it shrinkable anyway, which looks like fun and games.

Indeed. OK, I'll respin to keep these objects !IS_SHRINKABLE for now.

Need to take a deeper look into that interface...

/Thomas



>> /Thomas
>>
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 33ab47f1e05b..5176682a7d19 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -70,6 +70,17 @@  static struct ttm_placement i915_sys_placement = {
 	.busy_placement = &lmem0_sys_placement_flags[1],
 };
 
+static bool gpu_binds_iomem(struct ttm_resource *mem)
+{
+	return mem->mem_type != TTM_PL_SYSTEM;
+}
+
+static bool cpu_maps_iomem(struct ttm_resource *mem)
+{
+	/* Once / if we support GGTT, this is also false for cached ttm_tts */
+	return mem->mem_type != TTM_PL_SYSTEM;
+}
+
 static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj);
 
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
@@ -175,6 +186,40 @@  static void i915_ttm_free_cached_io_st(struct drm_i915_gem_object *obj)
 	obj->ttm.cached_io_st = NULL;
 }
 
+static void
+i915_ttm_adjust_domains_after_cpu_move(struct drm_i915_gem_object *obj)
+{
+	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
+
+	if (cpu_maps_iomem(bo->resource) || bo->ttm->caching != ttm_cached) {
+		obj->write_domain = I915_GEM_DOMAIN_WC;
+		obj->read_domains = I915_GEM_DOMAIN_WC;
+	} else {
+		obj->write_domain = I915_GEM_DOMAIN_CPU;
+		obj->read_domains = I915_GEM_DOMAIN_CPU;
+	}
+}
+
+static void i915_ttm_adjust_gem_after_move(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
+	unsigned int cache_level;
+
+	obj->mem_flags &= ~(I915_BO_FLAG_STRUCT_PAGE | I915_BO_FLAG_IOMEM);
+
+	obj->mem_flags |= cpu_maps_iomem(bo->resource) ? I915_BO_FLAG_IOMEM :
+		I915_BO_FLAG_STRUCT_PAGE;
+
+	if ((HAS_LLC(i915) || HAS_SNOOP(i915)) && !gpu_binds_iomem(bo->resource) &&
+	    bo->ttm->caching == ttm_cached)
+		cache_level = I915_CACHE_LLC;
+	else
+		cache_level = I915_CACHE_NONE;
+
+	i915_gem_object_set_cache_coherency(obj, cache_level);
+}
+
 static void i915_ttm_purge(struct drm_i915_gem_object *obj)
 {
 	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
@@ -190,8 +235,10 @@  static void i915_ttm_purge(struct drm_i915_gem_object *obj)
 
 	/* TTM's purge interface. Note that we might be reentering. */
 	ret = ttm_bo_validate(bo, &place, &ctx);
-
 	if (!ret) {
+		obj->write_domain = 0;
+		obj->read_domains = 0;
+		i915_ttm_adjust_gem_after_move(obj);
 		i915_ttm_free_cached_io_st(obj);
 		obj->mm.madv = __I915_MADV_PURGED;
 	}
@@ -273,12 +320,15 @@  i915_ttm_resource_get_st(struct drm_i915_gem_object *obj,
 			 struct ttm_resource *res)
 {
 	struct ttm_buffer_object *bo = i915_gem_to_ttm(obj);
-	struct ttm_resource_manager *man =
-		ttm_manager_type(bo->bdev, res->mem_type);
 
-	if (man->use_tt)
+	if (!gpu_binds_iomem(res))
 		return i915_ttm_tt_get_st(bo->ttm);
 
+	/*
+	 * If CPU mapping differs, we need to add the ttm_tt pages to
+	 * the resulting st. Might make sense for GGTT.
+	 */
+	GEM_WARN_ON(!cpu_maps_iomem(res));
 	return intel_region_ttm_node_to_st(obj->mm.region, res);
 }
 
@@ -290,8 +340,6 @@  static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 	struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
 	struct ttm_resource_manager *dst_man =
 		ttm_manager_type(bo->bdev, dst_mem->mem_type);
-	struct ttm_resource_manager *src_man =
-		ttm_manager_type(bo->bdev, bo->resource->mem_type);
 	struct intel_memory_region *dst_reg, *src_reg;
 	union {
 		struct ttm_kmap_iter_tt tt;
@@ -332,34 +380,36 @@  static int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 	if (IS_ERR(dst_st))
 		return PTR_ERR(dst_st);
 
-	/* If we start mapping GGTT, we can no longer use man::use_tt here. */
-	dst_iter = dst_man->use_tt ?
+	dst_iter = !cpu_maps_iomem(dst_mem) ?
 		ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm) :
 		ttm_kmap_iter_iomap_init(&_dst_iter.io, &dst_reg->iomap,
 					 dst_st, dst_reg->region.start);
 
-	src_iter = src_man->use_tt ?
+	src_iter = !cpu_maps_iomem(bo->resource) ?
 		ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm) :
 		ttm_kmap_iter_iomap_init(&_src_iter.io, &src_reg->iomap,
 					 obj->ttm.cached_io_st,
 					 src_reg->region.start);
 
 	ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
+	/* Below dst_mem becomes bo->resource. */
 	ttm_bo_move_sync_cleanup(bo, dst_mem);
+	i915_ttm_adjust_domains_after_cpu_move(obj);
 	i915_ttm_free_cached_io_st(obj);
 
-	if (!dst_man->use_tt) {
+	if (gpu_binds_iomem(dst_mem) || cpu_maps_iomem(dst_mem)) {
 		obj->ttm.cached_io_st = dst_st;
 		obj->ttm.get_io_page.sg_pos = dst_st->sgl;
 		obj->ttm.get_io_page.sg_idx = 0;
 	}
 
+	i915_ttm_adjust_gem_after_move(obj);
 	return 0;
 }
 
 static int i915_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem)
 {
-	if (mem->mem_type < I915_PL_LMEM0)
+	if (!cpu_maps_iomem(mem))
 		return 0;
 
 	mem->bus.caching = ttm_write_combined;
@@ -421,6 +471,16 @@  static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
 	if (ret)
 		return ret == -ENOSPC ? -ENXIO : ret;
 
+	i915_ttm_adjust_lru(obj);
+	if (bo->ttm && !ttm_tt_is_populated(bo->ttm)) {
+		ret = ttm_tt_populate(bo->bdev, bo->ttm, &ctx);
+		if (ret)
+			return ret;
+
+		i915_ttm_adjust_domains_after_cpu_move(obj);
+		i915_ttm_adjust_gem_after_move(obj);
+	}
+
 	/* Object either has a page vector or is an iomem object */
 	st = bo->ttm ? i915_ttm_tt_get_st(bo->ttm) : obj->ttm.cached_io_st;
 	if (IS_ERR(st))
@@ -428,8 +488,6 @@  static int i915_ttm_get_pages(struct drm_i915_gem_object *obj)
 
 	__i915_gem_object_set_pages(obj, st, i915_sg_dma_sizes(st->sgl));
 
-	i915_ttm_adjust_lru(obj);
-
 	return ret;
 }
 
@@ -563,6 +621,7 @@  static u64 i915_ttm_mmap_offset(struct drm_i915_gem_object *obj)
 
 const struct drm_i915_gem_object_ops i915_gem_ttm_obj_ops = {
 	.name = "i915_gem_object_ttm",
+	.flags = I915_GEM_OBJECT_IS_SHRINKABLE,
 
 	.get_pages = i915_ttm_get_pages,
 	.put_pages = i915_ttm_put_pages,
@@ -599,6 +658,10 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 {
 	static struct lock_class_key lock_class;
 	struct drm_i915_private *i915 = mem->i915;
+	struct ttm_operation_ctx ctx = {
+		.interruptible = true,
+		.no_wait_gpu = false,
+	};
 	enum ttm_bo_type bo_type;
 	size_t alignment = 0;
 	int ret;
@@ -618,15 +681,14 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 	i915_gem_object_init(obj, &i915_gem_ttm_obj_ops, &lock_class, flags);
 	i915_gem_object_init_memory_region(obj, mem);
 	i915_gem_object_make_unshrinkable(obj);
-	obj->read_domains = I915_GEM_DOMAIN_WC | I915_GEM_DOMAIN_GTT;
-	obj->mem_flags |= I915_BO_FLAG_IOMEM;
-	i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
 	INIT_RADIX_TREE(&obj->ttm.get_io_page.radix, GFP_KERNEL | __GFP_NOWARN);
 	mutex_init(&obj->ttm.get_io_page.lock);
 
 	bo_type = (obj->flags & I915_BO_ALLOC_USER) ? ttm_bo_type_device :
 		ttm_bo_type_kernel;
 
+	obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
+
 	/*
 	 * If this function fails, it will call the destructor, but
 	 * our caller still owns the object. So no freeing in the
@@ -634,14 +696,19 @@  int __i915_gem_ttm_object_init(struct intel_memory_region *mem,
 	 * Similarly, in delayed_destroy, we can't call ttm_bo_put()
 	 * until successful initialization.
 	 */
-	obj->base.vma_node.driver_private = i915_gem_to_ttm(obj);
-	ret = ttm_bo_init(&i915->bdev, i915_gem_to_ttm(obj), size,
-			  bo_type, &i915_sys_placement, alignment,
-			  true, NULL, NULL, i915_ttm_bo_destroy);
+	ret = ttm_bo_init_reserved(&i915->bdev, i915_gem_to_ttm(obj), size,
+				   bo_type, &i915_sys_placement, alignment,
+				   &ctx, NULL, NULL, i915_ttm_bo_destroy);
+
+	if (ret)
+		goto out;
 
-	if (!ret)
-		obj->ttm.created = true;
+	obj->ttm.created = true;
+	i915_ttm_adjust_domains_after_cpu_move(obj);
+	i915_ttm_adjust_gem_after_move(obj);
+	i915_gem_object_unlock(obj);
 
+out:
 	/* i915 wants -ENXIO when out of memory region space. */
 	return (ret == -ENOSPC) ? -ENXIO : ret;
 }