diff mbox series

drm/i915: Improve debug print in vm_fault_ttm

Message ID 20220922120908.10352-1-nirmoy.das@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Improve debug print in vm_fault_ttm | expand

Commit Message

Nirmoy Das Sept. 22, 2022, 12:09 p.m. UTC
Print the error code returned by __i915_ttm_migrate()
for better debuggability.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/6889
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Comments

kernel test robot Sept. 22, 2022, 2:24 p.m. UTC | #1
Hi Nirmoy,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]

url:    https://github.com/intel-lab-lkp/linux/commits/Nirmoy-Das/drm-i915-Improve-debug-print-in-vm_fault_ttm/20220922-201041
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: x86_64-randconfig-a011 (https://download.01.org/0day-ci/archive/20220922/202209222231.mIrsqiu7-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/e1a426a9e14837ada7e883d20af7c9abdf59823c
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nirmoy-Das/drm-i915-Improve-debug-print-in-vm_fault_ttm/20220922-201041
        git checkout e1a426a9e14837ada7e883d20af7c9abdf59823c
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/gpu/drm/i915/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/drm/drm_mm.h:51,
                    from include/drm/ttm/ttm_bo_driver.h:33,
                    from drivers/gpu/drm/i915/gem/i915_gem_ttm.c:8:
   drivers/gpu/drm/i915/gem/i915_gem_ttm.c: In function 'vm_fault_ttm':
>> drivers/gpu/drm/i915/gem/i915_gem_ttm.c:1037:38: warning: format '%p' expects argument of type 'void *', but argument 4 has type 'int' [-Wformat=]
    1037 |                         drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
         |                                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  ~~~
         |                                                                                             |
         |                                                                                             int
   include/drm/drm_print.h:461:63: note: in definition of macro 'drm_dbg'
     461 |         drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_DRIVER, fmt, ##__VA_ARGS__)
         |                                                               ^~~
   drivers/gpu/drm/i915/gem/i915_gem_ttm.c:1037:85: note: format string is defined here
    1037 |                         drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
         |                                                                                    ~^
         |                                                                                     |
         |                                                                                     void *
         |                                                                                    %d


vim +1037 drivers/gpu/drm/i915/gem/i915_gem_ttm.c

   986	
   987	static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
   988	{
   989		struct vm_area_struct *area = vmf->vma;
   990		struct ttm_buffer_object *bo = area->vm_private_data;
   991		struct drm_device *dev = bo->base.dev;
   992		struct drm_i915_gem_object *obj;
   993		intel_wakeref_t wakeref = 0;
   994		vm_fault_t ret;
   995		int idx;
   996	
   997		obj = i915_ttm_to_gem(bo);
   998		if (!obj)
   999			return VM_FAULT_SIGBUS;
  1000	
  1001		/* Sanity check that we allow writing into this object */
  1002		if (unlikely(i915_gem_object_is_readonly(obj) &&
  1003			     area->vm_flags & VM_WRITE))
  1004			return VM_FAULT_SIGBUS;
  1005	
  1006		ret = ttm_bo_vm_reserve(bo, vmf);
  1007		if (ret)
  1008			return ret;
  1009	
  1010		if (obj->mm.madv != I915_MADV_WILLNEED) {
  1011			dma_resv_unlock(bo->base.resv);
  1012			return VM_FAULT_SIGBUS;
  1013		}
  1014	
  1015		if (i915_ttm_cpu_maps_iomem(bo->resource))
  1016			wakeref = intel_runtime_pm_get(&to_i915(obj->base.dev)->runtime_pm);
  1017	
  1018		if (!i915_ttm_resource_mappable(bo->resource)) {
  1019			int err = -ENODEV;
  1020			int i;
  1021	
  1022			for (i = 0; i < obj->mm.n_placements; i++) {
  1023				struct intel_memory_region *mr = obj->mm.placements[i];
  1024				unsigned int flags;
  1025	
  1026				if (!mr->io_size && mr->type != INTEL_MEMORY_SYSTEM)
  1027					continue;
  1028	
  1029				flags = obj->flags;
  1030				flags &= ~I915_BO_ALLOC_GPU_ONLY;
  1031				err = __i915_ttm_migrate(obj, mr, flags);
  1032				if (!err)
  1033					break;
  1034			}
  1035	
  1036			if (err) {
> 1037				drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
  1038				dma_resv_unlock(bo->base.resv);
  1039				ret = VM_FAULT_SIGBUS;
  1040				goto out_rpm;
  1041			}
  1042		}
  1043	
  1044		if (drm_dev_enter(dev, &idx)) {
  1045			ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
  1046						       TTM_BO_VM_NUM_PREFAULT);
  1047			drm_dev_exit(idx);
  1048		} else {
  1049			ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
  1050		}
  1051	
  1052		if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
  1053			goto out_rpm;
  1054	
  1055		/* ttm_bo_vm_reserve() already has dma_resv_lock */
  1056		if (ret == VM_FAULT_NOPAGE && wakeref && !obj->userfault_count) {
  1057			obj->userfault_count = 1;
  1058			mutex_lock(&to_gt(to_i915(obj->base.dev))->lmem_userfault_lock);
  1059			list_add(&obj->userfault_link, &to_gt(to_i915(obj->base.dev))->lmem_userfault_list);
  1060			mutex_unlock(&to_gt(to_i915(obj->base.dev))->lmem_userfault_lock);
  1061		}
  1062	
  1063		if (wakeref & CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND)
  1064			intel_wakeref_auto(&to_gt(to_i915(obj->base.dev))->userfault_wakeref,
  1065					   msecs_to_jiffies_timeout(CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND));
  1066	
  1067		i915_ttm_adjust_lru(obj);
  1068	
  1069		dma_resv_unlock(bo->base.resv);
  1070	
  1071	out_rpm:
  1072		if (wakeref)
  1073			intel_runtime_pm_put(&to_i915(obj->base.dev)->runtime_pm, wakeref);
  1074	
  1075		return ret;
  1076	}
  1077
kernel test robot Sept. 22, 2022, 4:25 p.m. UTC | #2
Hi Nirmoy,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-tip/drm-tip]

url:    https://github.com/intel-lab-lkp/linux/commits/Nirmoy-Das/drm-i915-Improve-debug-print-in-vm_fault_ttm/20220922-201041
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20220923/202209230001.og3h9EMY-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/e1a426a9e14837ada7e883d20af7c9abdf59823c
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nirmoy-Das/drm-i915-Improve-debug-print-in-vm_fault_ttm/20220922-201041
        git checkout e1a426a9e14837ada7e883d20af7c9abdf59823c
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/drm/drm_mm.h:51,
                    from include/drm/ttm/ttm_bo_driver.h:33,
                    from drivers/gpu/drm/i915/gem/i915_gem_ttm.c:8:
   drivers/gpu/drm/i915/gem/i915_gem_ttm.c: In function 'vm_fault_ttm':
>> drivers/gpu/drm/i915/gem/i915_gem_ttm.c:1037:38: error: format '%p' expects argument of type 'void *', but argument 4 has type 'int' [-Werror=format=]
    1037 |                         drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
         |                                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  ~~~
         |                                                                                             |
         |                                                                                             int
   include/drm/drm_print.h:461:63: note: in definition of macro 'drm_dbg'
     461 |         drm_dev_dbg((drm) ? (drm)->dev : NULL, DRM_UT_DRIVER, fmt, ##__VA_ARGS__)
         |                                                               ^~~
   drivers/gpu/drm/i915/gem/i915_gem_ttm.c:1037:85: note: format string is defined here
    1037 |                         drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
         |                                                                                    ~^
         |                                                                                     |
         |                                                                                     void *
         |                                                                                    %d
   cc1: all warnings being treated as errors


vim +1037 drivers/gpu/drm/i915/gem/i915_gem_ttm.c

   986	
   987	static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
   988	{
   989		struct vm_area_struct *area = vmf->vma;
   990		struct ttm_buffer_object *bo = area->vm_private_data;
   991		struct drm_device *dev = bo->base.dev;
   992		struct drm_i915_gem_object *obj;
   993		intel_wakeref_t wakeref = 0;
   994		vm_fault_t ret;
   995		int idx;
   996	
   997		obj = i915_ttm_to_gem(bo);
   998		if (!obj)
   999			return VM_FAULT_SIGBUS;
  1000	
  1001		/* Sanity check that we allow writing into this object */
  1002		if (unlikely(i915_gem_object_is_readonly(obj) &&
  1003			     area->vm_flags & VM_WRITE))
  1004			return VM_FAULT_SIGBUS;
  1005	
  1006		ret = ttm_bo_vm_reserve(bo, vmf);
  1007		if (ret)
  1008			return ret;
  1009	
  1010		if (obj->mm.madv != I915_MADV_WILLNEED) {
  1011			dma_resv_unlock(bo->base.resv);
  1012			return VM_FAULT_SIGBUS;
  1013		}
  1014	
  1015		if (i915_ttm_cpu_maps_iomem(bo->resource))
  1016			wakeref = intel_runtime_pm_get(&to_i915(obj->base.dev)->runtime_pm);
  1017	
  1018		if (!i915_ttm_resource_mappable(bo->resource)) {
  1019			int err = -ENODEV;
  1020			int i;
  1021	
  1022			for (i = 0; i < obj->mm.n_placements; i++) {
  1023				struct intel_memory_region *mr = obj->mm.placements[i];
  1024				unsigned int flags;
  1025	
  1026				if (!mr->io_size && mr->type != INTEL_MEMORY_SYSTEM)
  1027					continue;
  1028	
  1029				flags = obj->flags;
  1030				flags &= ~I915_BO_ALLOC_GPU_ONLY;
  1031				err = __i915_ttm_migrate(obj, mr, flags);
  1032				if (!err)
  1033					break;
  1034			}
  1035	
  1036			if (err) {
> 1037				drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
  1038				dma_resv_unlock(bo->base.resv);
  1039				ret = VM_FAULT_SIGBUS;
  1040				goto out_rpm;
  1041			}
  1042		}
  1043	
  1044		if (drm_dev_enter(dev, &idx)) {
  1045			ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
  1046						       TTM_BO_VM_NUM_PREFAULT);
  1047			drm_dev_exit(idx);
  1048		} else {
  1049			ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
  1050		}
  1051	
  1052		if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
  1053			goto out_rpm;
  1054	
  1055		/* ttm_bo_vm_reserve() already has dma_resv_lock */
  1056		if (ret == VM_FAULT_NOPAGE && wakeref && !obj->userfault_count) {
  1057			obj->userfault_count = 1;
  1058			mutex_lock(&to_gt(to_i915(obj->base.dev))->lmem_userfault_lock);
  1059			list_add(&obj->userfault_link, &to_gt(to_i915(obj->base.dev))->lmem_userfault_list);
  1060			mutex_unlock(&to_gt(to_i915(obj->base.dev))->lmem_userfault_lock);
  1061		}
  1062	
  1063		if (wakeref & CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND)
  1064			intel_wakeref_auto(&to_gt(to_i915(obj->base.dev))->userfault_wakeref,
  1065					   msecs_to_jiffies_timeout(CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND));
  1066	
  1067		i915_ttm_adjust_lru(obj);
  1068	
  1069		dma_resv_unlock(bo->base.resv);
  1070	
  1071	out_rpm:
  1072		if (wakeref)
  1073			intel_runtime_pm_put(&to_i915(obj->base.dev)->runtime_pm, wakeref);
  1074	
  1075		return ret;
  1076	}
  1077
Matthew Auld Sept. 22, 2022, 4:38 p.m. UTC | #3
On 22/09/2022 13:09, Nirmoy Das wrote:
> Print the error code returned by __i915_ttm_migrate()
> for better debuggability.
> 
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/6889
> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index e3fc38dd5db0..9619c0fe1025 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1034,7 +1034,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   		}
>   
>   		if (err) {
> -			drm_dbg(dev, "Unable to make resource CPU accessible\n");
> +			drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);

Yeah, looks useful. I think for that bug the object is just too large 
for the mappable part of lmem, so this just gives -2big or similar on 
small-bar systems. I presume that the test needs to be updated to 
account for the cpu_size or so.

With the kernel test robot warning fixed:
Acked-by: Matthew Auld <matthew.auld@intel.com>

I looked at the GEM_BUG_ON(rq->reserved_space > ring->space), and I 
think the issue is maybe with emit_pte() using the ring->space to 
manually figure out the number of dwords it can emit (instead of the 
usual ring_begin()), which I guess works, but if we are unlucky and get 
interrupted (like with a very well timed sigbus here), while waiting for 
more ring space and end up bailing early, we might have trampled over 
the reserved_space when submitting the request. I guess normally the 
next ring_begin() would take care of the reserved_space, like when 
constructing the actual copy packet.

>   			dma_resv_unlock(bo->base.resv);
>   			ret = VM_FAULT_SIGBUS;
>   			goto out_rpm;
Nirmoy Das Sept. 23, 2022, 7:27 a.m. UTC | #4
On 9/22/2022 6:38 PM, Matthew Auld wrote:
> On 22/09/2022 13:09, Nirmoy Das wrote:
>> Print the error code returned by __i915_ttm_migrate()
>> for better debuggability.
>>
>> References: https://gitlab.freedesktop.org/drm/intel/-/issues/6889
>> Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> index e3fc38dd5db0..9619c0fe1025 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
>> @@ -1034,7 +1034,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault 
>> *vmf)
>>           }
>>             if (err) {
>> -            drm_dbg(dev, "Unable to make resource CPU accessible\n");
>> +            drm_dbg(dev, "Unable to make resource CPU accessible(err 
>> = %pe)\n", err);
>
> Yeah, looks useful. I think for that bug the object is just too large 
> for the mappable part of lmem, so this just gives -2big or similar on 
> small-bar systems. I presume that the test needs to be updated to 
> account for the cpu_size or so.


Yeah, can't think of any other case. The test need to be updated, going 
to send out igt fixes for this.

>
> With the kernel test robot warning fixed:
> Acked-by: Matthew Auld <matthew.auld@intel.com>


Thanks, I will resend a updated one.

>
> I looked at the GEM_BUG_ON(rq->reserved_space > ring->space), and I 
> think the issue is maybe with emit_pte() using the ring->space to 
> manually figure out the number of dwords it can emit (instead of the 
> usual ring_begin()), which I guess works, but if we are unlucky and 
> get interrupted (like with a very well timed sigbus here), while 
> waiting for more ring space and end up bailing early, we might have 
> trampled over the reserved_space when submitting the request. I guess 
> normally the next ring_begin() would take care of the reserved_space, 
> like when constructing the actual copy packet.


I am not so familiar with the code but sounds logical.


Nirmoy

>
>> dma_resv_unlock(bo->base.resv);
>>               ret = VM_FAULT_SIGBUS;
>>               goto out_rpm;
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index e3fc38dd5db0..9619c0fe1025 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -1034,7 +1034,7 @@  static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
 		}
 
 		if (err) {
-			drm_dbg(dev, "Unable to make resource CPU accessible\n");
+			drm_dbg(dev, "Unable to make resource CPU accessible(err = %pe)\n", err);
 			dma_resv_unlock(bo->base.resv);
 			ret = VM_FAULT_SIGBUS;
 			goto out_rpm;