Message ID | 20230118071609.17572-24-niranjana.vishwanathapura@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915/vm_bind: Add VM_BIND functionality | expand |
On 18/01/2023 07:16, Niranjana Vishwanathapura wrote: > Support dump capture of persistent mappings upon user request. > > Capture of a mapping is requested with the VM_BIND ioctl and > processed during the GPU error handling. They are synchronously > unbound during eviction so that no additional vma resource > reference taking is required in the submission path. Thus, a > list of persistent vmas requiring capture is maintained instead > of a list of vma resources. > > v2: enable with CONFIG_DRM_I915_CAPTURE_ERROR, remove gfp > overwrite, add kernel-doc and expand commit message > v3: Ensure vma->resource is valid during capture > > Signed-off-by: Brian Welty <brian.welty@intel.com> > Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> > --- > .../drm/i915/gem/i915_gem_vm_bind_object.c | 13 +++++ > drivers/gpu/drm/i915/gt/intel_gtt.c | 5 ++ > drivers/gpu/drm/i915/gt/intel_gtt.h | 7 +++ > drivers/gpu/drm/i915/i915_gem.c | 14 ++++- > drivers/gpu/drm/i915/i915_gpu_error.c | 52 ++++++++++++++++++- > drivers/gpu/drm/i915/i915_vma.c | 4 ++ > drivers/gpu/drm/i915/i915_vma_types.h | 4 ++ > include/uapi/drm/i915_drm.h | 9 +++- > 8 files changed, 104 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > index 78e7c0642c5f..562a67a988f2 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > @@ -88,6 +88,12 @@ static void i915_gem_vm_bind_remove(struct i915_vma *vma, bool release_obj) > { > lockdep_assert_held(&vma->vm->vm_bind_lock); > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + mutex_lock(&vma->vm->vm_capture_lock); > + if (!list_empty(&vma->vm_capture_link)) > + list_del_init(&vma->vm_capture_link); > + mutex_unlock(&vma->vm->vm_capture_lock); > +#endif > spin_lock(&vma->vm->vm_rebind_lock); > if (!list_empty(&vma->vm_rebind_link)) > list_del_init(&vma->vm_rebind_link); 
> @@ -357,6 +363,13 @@ static int i915_gem_vm_bind_obj(struct i915_address_space *vm, > continue; > } > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + if (va->flags & I915_GEM_VM_BIND_CAPTURE) { > + mutex_lock(&vm->vm_capture_lock); > + list_add_tail(&vma->vm_capture_link, &vm->vm_capture_list); > + mutex_unlock(&vm->vm_capture_lock); > + } > +#endif > list_add_tail(&vma->vm_bind_link, &vm->vm_bound_list); > i915_vm_bind_it_insert(vma, &vm->va); > if (!obj->priv_root) > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c > index 2e4c9fabf3b8..103ca55222be 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gtt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c > @@ -297,6 +297,11 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) > spin_lock_init(&vm->vm_rebind_lock); > spin_lock_init(&vm->userptr_invalidated_lock); > INIT_LIST_HEAD(&vm->userptr_invalidated_list); > + > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + INIT_LIST_HEAD(&vm->vm_capture_list); > + mutex_init(&vm->vm_capture_lock); > +#endif > } > > void *__px_vaddr(struct drm_i915_gem_object *p) > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h > index 620b4e020a9f..7f69e1d4fb5e 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gtt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h > @@ -281,6 +281,13 @@ struct i915_address_space { > /** @root_obj: root object for dma-resv sharing by private objects */ > struct drm_i915_gem_object *root_obj; > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + /* @vm_capture_list: list of vm captures */ > + struct list_head vm_capture_list; > + /* @vm_capture_lock: protects vm_capture_list */ > + struct mutex vm_capture_lock; > +#endif > + > /* Global GTT */ > bool is_ggtt:1; > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 969581e7106f..d97822f203fc 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ 
-143,6 +143,8 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, > while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, > struct i915_vma, > obj_link))) { > + bool sync_unbind = true; > + > list_move_tail(&vma->obj_link, &still_in_list); > if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) > continue; > @@ -171,8 +173,18 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, > * and destroy the vma from under us. > */ > > + /* > + * Synchronously unbind persistent mappings with capture > + * request so that vma->resource is valid in the error capture > + * path without needing extra reference taking in execbuf path. > + */ > + if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { > + sync_unbind = !list_empty(&vma->vm_capture_link); > + mutex_unlock(&vma->vm->vm_capture_lock); > + } This stuff only exists on CONFIG_DRM_I915_CAPTURE_ERROR it seems? > + > ret = -EBUSY; > - if (flags & I915_GEM_OBJECT_UNBIND_ASYNC) { > + if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { > assert_object_held(vma->obj); > ret = i915_vma_unbind_async(vma, vm_trylock); > } > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 9d5d5a397b64..5ccd1eaea2a5 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -1460,6 +1460,49 @@ capture_vma(struct intel_engine_capture_vma *next, > return next; > } > > +static struct intel_engine_capture_vma * > +capture_user_vm(struct intel_engine_capture_vma *capture, > + struct i915_address_space *vm, gfp_t gfp) > +{ > + struct list_head vm_capture_list; > + struct i915_vma *vma, *vn; > + int err; > + > + INIT_LIST_HEAD(&vm_capture_list); > + > + err = mutex_lock_interruptible(&vm->vm_capture_lock); > + if (err) > + return capture; Same here, and a few other places. 
> + > + /* vma->resource should be checked with vm->mutex held */ > + err = mutex_lock_interruptible(&vm->mutex); > + if (err) > + goto skip_user_vm_capture; > + > + list_for_each_entry_safe(vma, vn, &vm->vm_capture_list, > + vm_capture_link) { > + if (drm_WARN_ONCE(&vm->i915->drm, !vma->resource, > + "vma->resource expected!\n")) > + continue; > + > + i915_vma_resource_get(vma->resource); > + list_move_tail(&vma->vm_capture_link, &vm_capture_list); Now that stuff can be added to the capture_list outside of the exec, can't someone do an exec, followed by a bunch of vm_binds requesting capture for each one? With the idea of tricking the capture code into dumping the pages of non-cleared memory? (The GPU clear job has been created, but not actually completed yet). Say we have an IGT which creates a spinner or something on the vm, then creates a bunch of vm_binds, each asking for capture. What ensures that all the binds we are capturing here are valid when the spinner or whatever triggers a GPU hang i.e everything in capture_list has at least been cleared? With eb2 everything was tied to the rq, and if the rq has been submitted then all required async clears/moves must have already completed. 
> + } > + mutex_unlock(&vm->mutex); > + > + list_for_each_entry(vma, &vm_capture_list, vm_capture_link) { > + capture = capture_vma_snapshot(capture, vma->resource, > + gfp, "user"); > + i915_vma_resource_put(vma->resource); > + } > + list_splice_tail(&vm_capture_list, &vm->vm_capture_list); > + > +skip_user_vm_capture: > + mutex_unlock(&vm->vm_capture_lock); > + > + return capture; > +} > + > static struct intel_engine_capture_vma * > capture_user(struct intel_engine_capture_vma *capture, > const struct i915_request *rq, > @@ -1467,6 +1510,8 @@ capture_user(struct intel_engine_capture_vma *capture, > { > struct i915_capture_list *c; > > + capture = capture_user_vm(capture, rq->context->vm, gfp); > + > for (c = rq->capture_list; c; c = c->next) > capture = capture_vma_snapshot(capture, c->vma_res, gfp, > "user"); > @@ -1548,8 +1593,13 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, > * as the simplest method to avoid being overwritten > * by userspace. > */ > - vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); > + > + /* > + * Ensure capture_user_vm which takes vm->mutex gets called first > + * as snapshoting the first vma starts dma fence critical section. 
> + */ > vma = capture_user(vma, rq, gfp); > + vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); > vma = capture_vma(vma, rq->ring->vma, "ring", gfp); > vma = capture_vma(vma, rq->context->state, "HW context", gfp); > > diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c > index 2f0994f0ed42..b47715fa773f 100644 > --- a/drivers/gpu/drm/i915/i915_vma.c > +++ b/drivers/gpu/drm/i915/i915_vma.c > @@ -248,6 +248,10 @@ vma_create(struct drm_i915_gem_object *obj, > INIT_LIST_HEAD(&vma->non_priv_vm_bind_link); > INIT_LIST_HEAD(&vma->vm_rebind_link); > INIT_LIST_HEAD(&vma->userptr_invalidated_link); > + > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + INIT_LIST_HEAD(&vma->vm_capture_link); > +#endif > return vma; > > err_unlock: > diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h > index 89f9854a6f69..c4fd61d51ce6 100644 > --- a/drivers/gpu/drm/i915/i915_vma_types.h > +++ b/drivers/gpu/drm/i915/i915_vma_types.h > @@ -310,6 +310,10 @@ struct i915_vma { > struct list_head vm_rebind_link; /* Link in vm_rebind_list */ > /** @userptr_invalidated_link: link to the vm->userptr_invalidated_list */ > struct list_head userptr_invalidated_link; > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + /* @vm_capture_link: link to the captureable VMA list */ > + struct list_head vm_capture_link; > +#endif > > /** Timeline fence for vm_bind completion notification */ > struct { > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index b9167f950327..5fde6020e339 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -3925,12 +3925,17 @@ struct drm_i915_gem_vm_bind { > __u64 length; > > /** > - * @flags: Currently reserved, MBZ. > + * @flags: Supported flags are: > + * > + * I915_GEM_VM_BIND_CAPTURE: > + * Capture this mapping in the dump upon GPU error. > + * CONFIG_DRM_I915_CAPTURE_ERROR should be enabled for valid capture. 
> * > * Note that @fence carries its own flags. > */ > __u64 flags; > -#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (~0ull) > +#define I915_GEM_VM_BIND_CAPTURE (1ull << 0) > +#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (-(I915_GEM_VM_BIND_CAPTURE << 1)) > > /** @rsvd: Reserved, MBZ */ > __u64 rsvd[2];
Hi Niranjana, Thank you for the patch! Yet something to improve: [auto build test ERROR on drm-intel/for-linux-next] [also build test ERROR on drm-intel/for-linux-next-fixes drm-tip/drm-tip drm/drm-next drm-exynos/exynos-drm-next drm-misc/drm-misc-next linus/master v6.2-rc4 next-20230118] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Niranjana-Vishwanathapura/drm-i915-vm_bind-Expose-vm-lookup-function/20230118-151845 base: git://anongit.freedesktop.org/drm-intel for-linux-next patch link: https://lore.kernel.org/r/20230118071609.17572-24-niranjana.vishwanathapura%40intel.com patch subject: [Intel-gfx] [PATCH v10 23/23] drm/i915/vm_bind: Support capture of persistent mappings config: i386-randconfig-a016-20230116 compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/251fbfd52586e3ff4677b44a136d08f9580d79e2 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Niranjana-Vishwanathapura/drm-i915-vm_bind-Expose-vm-lookup-function/20230118-151845 git checkout 251fbfd52586e3ff4677b44a136d08f9580d79e2 # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=i386 olddefconfig make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> All errors (new ones prefixed by >>): In file included from include/linux/rhashtable-types.h:14, from include/linux/ipc.h:7, from include/uapi/linux/sem.h:5, from include/linux/sem.h:5, from include/linux/sched.h:15, from include/linux/dma-fence.h:21, from include/linux/dma-fence-array.h:15, from drivers/gpu/drm/i915/i915_gem.c:28: drivers/gpu/drm/i915/i915_gem.c: In function 
'i915_gem_object_unbind': >> drivers/gpu/drm/i915/i915_gem.c:181:55: error: 'struct i915_address_space' has no member named 'vm_capture_lock' 181 | if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { | ^~ include/linux/mutex.h:188:72: note: in definition of macro 'mutex_lock_interruptible' 188 | #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) | ^~~~ >> drivers/gpu/drm/i915/i915_gem.c:182:55: error: 'struct i915_vma' has no member named 'vm_capture_link' 182 | sync_unbind = !list_empty(&vma->vm_capture_link); | ^~ drivers/gpu/drm/i915/i915_gem.c:183:46: error: 'struct i915_address_space' has no member named 'vm_capture_lock' 183 | mutex_unlock(&vma->vm->vm_capture_lock); | ^~ vim +181 drivers/gpu/drm/i915/i915_gem.c 116 117 int i915_gem_object_unbind(struct drm_i915_gem_object *obj, 118 unsigned long flags) 119 { 120 struct intel_runtime_pm *rpm = &to_i915(obj->base.dev)->runtime_pm; 121 bool vm_trylock = !!(flags & I915_GEM_OBJECT_UNBIND_VM_TRYLOCK); 122 LIST_HEAD(still_in_list); 123 intel_wakeref_t wakeref; 124 struct i915_vma *vma; 125 int ret; 126 127 assert_object_held(obj); 128 129 if (list_empty(&obj->vma.list)) 130 return 0; 131 132 /* 133 * As some machines use ACPI to handle runtime-resume callbacks, and 134 * ACPI is quite kmalloc happy, we cannot resume beneath the vm->mutex 135 * as they are required by the shrinker. Ergo, we wake the device up 136 * first just in case. 
137 */ 138 wakeref = intel_runtime_pm_get(rpm); 139 140 try_again: 141 ret = 0; 142 spin_lock(&obj->vma.lock); 143 while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, 144 struct i915_vma, 145 obj_link))) { 146 bool sync_unbind = true; 147 148 list_move_tail(&vma->obj_link, &still_in_list); 149 if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) 150 continue; 151 152 if (flags & I915_GEM_OBJECT_UNBIND_TEST) { 153 ret = -EBUSY; 154 break; 155 } 156 157 /* 158 * Requiring the vm destructor to take the object lock 159 * before destroying a vma would help us eliminate the 160 * i915_vm_tryget() here, AND thus also the barrier stuff 161 * at the end. That's an easy fix, but sleeping locks in 162 * a kthread should generally be avoided. 163 */ 164 ret = -EAGAIN; 165 if (!i915_vm_tryget(vma->vm)) 166 break; 167 168 spin_unlock(&obj->vma.lock); 169 170 /* 171 * Since i915_vma_parked() takes the object lock 172 * before vma destruction, it won't race us here, 173 * and destroy the vma from under us. 174 */ 175 176 /* 177 * Synchronously unbind persistent mappings with capture 178 * request so that vma->resource is valid in the error capture 179 * path without needing extra reference taking in execbuf path. 
180 */ > 181 if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { > 182 sync_unbind = !list_empty(&vma->vm_capture_link); 183 mutex_unlock(&vma->vm->vm_capture_lock); 184 } 185 186 ret = -EBUSY; 187 if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { 188 assert_object_held(vma->obj); 189 ret = i915_vma_unbind_async(vma, vm_trylock); 190 } 191 192 if (ret == -EBUSY && (flags & I915_GEM_OBJECT_UNBIND_ACTIVE || 193 !i915_vma_is_active(vma))) { 194 if (vm_trylock) { 195 if (mutex_trylock(&vma->vm->mutex)) { 196 ret = __i915_vma_unbind(vma); 197 mutex_unlock(&vma->vm->mutex); 198 } 199 } else { 200 ret = i915_vma_unbind(vma); 201 } 202 } 203 204 i915_vm_put(vma->vm); 205 spin_lock(&obj->vma.lock); 206 } 207 list_splice_init(&still_in_list, &obj->vma.list); 208 spin_unlock(&obj->vma.lock); 209 210 if (ret == -EAGAIN && flags & I915_GEM_OBJECT_UNBIND_BARRIER) { 211 rcu_barrier(); /* flush the i915_vm_release() */ 212 goto try_again; 213 } 214 215 intel_runtime_pm_put(rpm, wakeref); 216 217 return ret; 218 } 219
On Wed, Jan 18, 2023 at 12:45:08PM +0000, Matthew Auld wrote: >On 18/01/2023 07:16, Niranjana Vishwanathapura wrote: >>Support dump capture of persistent mappings upon user request. >> >>Capture of a mapping is requested with the VM_BIND ioctl and >>processed during the GPU error handling. They are synchronously >>unbound during eviction so that no additional vma resource >>reference taking is required in the submission path. Thus, a >>list of persistent vmas requiring capture is maintained instead >>of a list of vma resources. >> >>v2: enable with CONFIG_DRM_I915_CAPTURE_ERROR, remove gfp >> overwrite, add kernel-doc and expand commit message >>v3: Ensure vma->resource is valid during capture >> >>Signed-off-by: Brian Welty <brian.welty@intel.com> >>Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> >>--- >> .../drm/i915/gem/i915_gem_vm_bind_object.c | 13 +++++ >> drivers/gpu/drm/i915/gt/intel_gtt.c | 5 ++ >> drivers/gpu/drm/i915/gt/intel_gtt.h | 7 +++ >> drivers/gpu/drm/i915/i915_gem.c | 14 ++++- >> drivers/gpu/drm/i915/i915_gpu_error.c | 52 ++++++++++++++++++- >> drivers/gpu/drm/i915/i915_vma.c | 4 ++ >> drivers/gpu/drm/i915/i915_vma_types.h | 4 ++ >> include/uapi/drm/i915_drm.h | 9 +++- >> 8 files changed, 104 insertions(+), 4 deletions(-) >> >>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c >>index 78e7c0642c5f..562a67a988f2 100644 >>--- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c >>+++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c >>@@ -88,6 +88,12 @@ static void i915_gem_vm_bind_remove(struct i915_vma *vma, bool release_obj) >> { >> lockdep_assert_held(&vma->vm->vm_bind_lock); >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ mutex_lock(&vma->vm->vm_capture_lock); >>+ if (!list_empty(&vma->vm_capture_link)) >>+ list_del_init(&vma->vm_capture_link); >>+ mutex_unlock(&vma->vm->vm_capture_lock); >>+#endif >> spin_lock(&vma->vm->vm_rebind_lock); >> 
if (!list_empty(&vma->vm_rebind_link)) >> list_del_init(&vma->vm_rebind_link); >>@@ -357,6 +363,13 @@ static int i915_gem_vm_bind_obj(struct i915_address_space *vm, >> continue; >> } >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ if (va->flags & I915_GEM_VM_BIND_CAPTURE) { >>+ mutex_lock(&vm->vm_capture_lock); >>+ list_add_tail(&vma->vm_capture_link, &vm->vm_capture_list); >>+ mutex_unlock(&vm->vm_capture_lock); >>+ } >>+#endif >> list_add_tail(&vma->vm_bind_link, &vm->vm_bound_list); >> i915_vm_bind_it_insert(vma, &vm->va); >> if (!obj->priv_root) >>diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c >>index 2e4c9fabf3b8..103ca55222be 100644 >>--- a/drivers/gpu/drm/i915/gt/intel_gtt.c >>+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c >>@@ -297,6 +297,11 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) >> spin_lock_init(&vm->vm_rebind_lock); >> spin_lock_init(&vm->userptr_invalidated_lock); >> INIT_LIST_HEAD(&vm->userptr_invalidated_list); >>+ >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ INIT_LIST_HEAD(&vm->vm_capture_list); >>+ mutex_init(&vm->vm_capture_lock); >>+#endif >> } >> void *__px_vaddr(struct drm_i915_gem_object *p) >>diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h >>index 620b4e020a9f..7f69e1d4fb5e 100644 >>--- a/drivers/gpu/drm/i915/gt/intel_gtt.h >>+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h >>@@ -281,6 +281,13 @@ struct i915_address_space { >> /** @root_obj: root object for dma-resv sharing by private objects */ >> struct drm_i915_gem_object *root_obj; >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ /* @vm_capture_list: list of vm captures */ >>+ struct list_head vm_capture_list; >>+ /* @vm_capture_lock: protects vm_capture_list */ >>+ struct mutex vm_capture_lock; >>+#endif >>+ >> /* Global GTT */ >> bool is_ggtt:1; >>diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c >>index 969581e7106f..d97822f203fc 100644 
>>--- a/drivers/gpu/drm/i915/i915_gem.c >>+++ b/drivers/gpu/drm/i915/i915_gem.c >>@@ -143,6 +143,8 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, >> while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, >> struct i915_vma, >> obj_link))) { >>+ bool sync_unbind = true; >>+ >> list_move_tail(&vma->obj_link, &still_in_list); >> if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) >> continue; >>@@ -171,8 +173,18 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, >> * and destroy the vma from under us. >> */ >>+ /* >>+ * Synchronously unbind persistent mappings with capture >>+ * request so that vma->resource is valid in the error capture >>+ * path without needing extra reference taking in execbuf path. >>+ */ >>+ if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { >>+ sync_unbind = !list_empty(&vma->vm_capture_link); >>+ mutex_unlock(&vma->vm->vm_capture_lock); >>+ } > >This stuff only exists on CONFIG_DRM_I915_CAPTURE_ERROR it seems? > Yah, will move it under CONFIG_DRM_I915_CAPTURE_ERROR. 
>>+ >> ret = -EBUSY; >>- if (flags & I915_GEM_OBJECT_UNBIND_ASYNC) { >>+ if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { >> assert_object_held(vma->obj); >> ret = i915_vma_unbind_async(vma, vm_trylock); >> } >>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c >>index 9d5d5a397b64..5ccd1eaea2a5 100644 >>--- a/drivers/gpu/drm/i915/i915_gpu_error.c >>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c >>@@ -1460,6 +1460,49 @@ capture_vma(struct intel_engine_capture_vma *next, >> return next; >> } >>+static struct intel_engine_capture_vma * >>+capture_user_vm(struct intel_engine_capture_vma *capture, >>+ struct i915_address_space *vm, gfp_t gfp) >>+{ >>+ struct list_head vm_capture_list; >>+ struct i915_vma *vma, *vn; >>+ int err; >>+ >>+ INIT_LIST_HEAD(&vm_capture_list); >>+ >>+ err = mutex_lock_interruptible(&vm->vm_capture_lock); >>+ if (err) >>+ return capture; > >Same here, and a few other places. > We don't need CONFIG_DRM_I915_CAPTURE_ERROR check here as this whole file is only compiled if this config is set. >>+ >>+ /* vma->resource should be checked with vm->mutex held */ >>+ err = mutex_lock_interruptible(&vm->mutex); >>+ if (err) >>+ goto skip_user_vm_capture; >>+ >>+ list_for_each_entry_safe(vma, vn, &vm->vm_capture_list, >>+ vm_capture_link) { >>+ if (drm_WARN_ONCE(&vm->i915->drm, !vma->resource, >>+ "vma->resource expected!\n")) >>+ continue; >>+ >>+ i915_vma_resource_get(vma->resource); >>+ list_move_tail(&vma->vm_capture_link, &vm_capture_list); > >Now that stuff can be added to the capture_list outside of the exec, >can't someone do an exec, followed by a bunch of vm_binds requesting >capture for each one? With the idea of tricking the capture code into >dumping the pages of non-cleared memory? (The GPU clear job has been >created, but not actually completed yet). > >Say we have an IGT which creates a spinner or something on the vm, >then creates a bunch of vm_binds, each asking for capture. 
What >ensures that all the binds we are capturing here are valid when the >spinner or whatever triggers a GPU hang i.e everything in capture_list >has at least been cleared? With eb2 everything was tied to the rq, and >if the rq has been submitted then all required async clears/moves must >have already completed. Ok, I think we can skip the capture here if i915_vma_verify_bind_complete() returns false. Thanks, Niranjana > >>+ } >>+ mutex_unlock(&vm->mutex); >>+ >>+ list_for_each_entry(vma, &vm_capture_list, vm_capture_link) { >>+ capture = capture_vma_snapshot(capture, vma->resource, >>+ gfp, "user"); >>+ i915_vma_resource_put(vma->resource); >>+ } >>+ list_splice_tail(&vm_capture_list, &vm->vm_capture_list); >>+ >>+skip_user_vm_capture: >>+ mutex_unlock(&vm->vm_capture_lock); >>+ >>+ return capture; >>+} >>+ >> static struct intel_engine_capture_vma * >> capture_user(struct intel_engine_capture_vma *capture, >> const struct i915_request *rq, >>@@ -1467,6 +1510,8 @@ capture_user(struct intel_engine_capture_vma *capture, >> { >> struct i915_capture_list *c; >>+ capture = capture_user_vm(capture, rq->context->vm, gfp); >>+ >> for (c = rq->capture_list; c; c = c->next) >> capture = capture_vma_snapshot(capture, c->vma_res, gfp, >> "user"); >>@@ -1548,8 +1593,13 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, >> * as the simplest method to avoid being overwritten >> * by userspace. >> */ >>- vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); >>+ >>+ /* >>+ * Ensure capture_user_vm which takes vm->mutex gets called first >>+ * as snapshoting the first vma starts dma fence critical section. 
>>+ */ >> vma = capture_user(vma, rq, gfp); >>+ vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); >> vma = capture_vma(vma, rq->ring->vma, "ring", gfp); >> vma = capture_vma(vma, rq->context->state, "HW context", gfp); >>diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c >>index 2f0994f0ed42..b47715fa773f 100644 >>--- a/drivers/gpu/drm/i915/i915_vma.c >>+++ b/drivers/gpu/drm/i915/i915_vma.c >>@@ -248,6 +248,10 @@ vma_create(struct drm_i915_gem_object *obj, >> INIT_LIST_HEAD(&vma->non_priv_vm_bind_link); >> INIT_LIST_HEAD(&vma->vm_rebind_link); >> INIT_LIST_HEAD(&vma->userptr_invalidated_link); >>+ >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ INIT_LIST_HEAD(&vma->vm_capture_link); >>+#endif >> return vma; >> err_unlock: >>diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h >>index 89f9854a6f69..c4fd61d51ce6 100644 >>--- a/drivers/gpu/drm/i915/i915_vma_types.h >>+++ b/drivers/gpu/drm/i915/i915_vma_types.h >>@@ -310,6 +310,10 @@ struct i915_vma { >> struct list_head vm_rebind_link; /* Link in vm_rebind_list */ >> /** @userptr_invalidated_link: link to the vm->userptr_invalidated_list */ >> struct list_head userptr_invalidated_link; >>+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) >>+ /* @vm_capture_link: link to the captureable VMA list */ >>+ struct list_head vm_capture_link; >>+#endif >> /** Timeline fence for vm_bind completion notification */ >> struct { >>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h >>index b9167f950327..5fde6020e339 100644 >>--- a/include/uapi/drm/i915_drm.h >>+++ b/include/uapi/drm/i915_drm.h >>@@ -3925,12 +3925,17 @@ struct drm_i915_gem_vm_bind { >> __u64 length; >> /** >>- * @flags: Currently reserved, MBZ. >>+ * @flags: Supported flags are: >>+ * >>+ * I915_GEM_VM_BIND_CAPTURE: >>+ * Capture this mapping in the dump upon GPU error. >>+ * CONFIG_DRM_I915_CAPTURE_ERROR should be enabled for valid capture. 
>> * >> * Note that @fence carries its own flags. >> */ >> __u64 flags; >>-#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (~0ull) >>+#define I915_GEM_VM_BIND_CAPTURE (1ull << 0) >>+#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (-(I915_GEM_VM_BIND_CAPTURE << 1)) >> /** @rsvd: Reserved, MBZ */ >> __u64 rsvd[2];
Hi Niranjana, Thank you for the patch! Yet something to improve: [auto build test ERROR on drm-intel/for-linux-next] [also build test ERROR on drm-intel/for-linux-next-fixes drm-tip/drm-tip drm/drm-next drm-exynos/exynos-drm-next drm-misc/drm-misc-next linus/master v6.2-rc4 next-20230118] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Niranjana-Vishwanathapura/drm-i915-vm_bind-Expose-vm-lookup-function/20230118-151845 base: git://anongit.freedesktop.org/drm-intel for-linux-next patch link: https://lore.kernel.org/r/20230118071609.17572-24-niranjana.vishwanathapura%40intel.com patch subject: [Intel-gfx] [PATCH v10 23/23] drm/i915/vm_bind: Support capture of persistent mappings config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20230119/202301190440.EuujWDwh-lkp@intel.com/config) compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/251fbfd52586e3ff4677b44a136d08f9580d79e2 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Niranjana-Vishwanathapura/drm-i915-vm_bind-Expose-vm-lookup-function/20230118-151845 git checkout 251fbfd52586e3ff4677b44a136d08f9580d79e2 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> 
All errors (new ones prefixed by >>): >> drivers/gpu/drm/i915/i915_gem.c:181:43: error: no member named 'vm_capture_lock' in 'struct i915_address_space' if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { ~~~~~~~ ^ include/linux/mutex.h:188:72: note: expanded from macro 'mutex_lock_interruptible' #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) ^~~~ >> drivers/gpu/drm/i915/i915_gem.c:182:36: error: no member named 'vm_capture_link' in 'struct i915_vma' sync_unbind = !list_empty(&vma->vm_capture_link); ~~~ ^ drivers/gpu/drm/i915/i915_gem.c:183:27: error: no member named 'vm_capture_lock' in 'struct i915_address_space' mutex_unlock(&vma->vm->vm_capture_lock); ~~~~~~~ ^ 3 errors generated. vim +181 drivers/gpu/drm/i915/i915_gem.c 116 117 int i915_gem_object_unbind(struct drm_i915_gem_object *obj, 118 unsigned long flags) 119 { 120 struct intel_runtime_pm *rpm = &to_i915(obj->base.dev)->runtime_pm; 121 bool vm_trylock = !!(flags & I915_GEM_OBJECT_UNBIND_VM_TRYLOCK); 122 LIST_HEAD(still_in_list); 123 intel_wakeref_t wakeref; 124 struct i915_vma *vma; 125 int ret; 126 127 assert_object_held(obj); 128 129 if (list_empty(&obj->vma.list)) 130 return 0; 131 132 /* 133 * As some machines use ACPI to handle runtime-resume callbacks, and 134 * ACPI is quite kmalloc happy, we cannot resume beneath the vm->mutex 135 * as they are required by the shrinker. Ergo, we wake the device up 136 * first just in case. 
137 */ 138 wakeref = intel_runtime_pm_get(rpm); 139 140 try_again: 141 ret = 0; 142 spin_lock(&obj->vma.lock); 143 while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, 144 struct i915_vma, 145 obj_link))) { 146 bool sync_unbind = true; 147 148 list_move_tail(&vma->obj_link, &still_in_list); 149 if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) 150 continue; 151 152 if (flags & I915_GEM_OBJECT_UNBIND_TEST) { 153 ret = -EBUSY; 154 break; 155 } 156 157 /* 158 * Requiring the vm destructor to take the object lock 159 * before destroying a vma would help us eliminate the 160 * i915_vm_tryget() here, AND thus also the barrier stuff 161 * at the end. That's an easy fix, but sleeping locks in 162 * a kthread should generally be avoided. 163 */ 164 ret = -EAGAIN; 165 if (!i915_vm_tryget(vma->vm)) 166 break; 167 168 spin_unlock(&obj->vma.lock); 169 170 /* 171 * Since i915_vma_parked() takes the object lock 172 * before vma destruction, it won't race us here, 173 * and destroy the vma from under us. 174 */ 175 176 /* 177 * Synchronously unbind persistent mappings with capture 178 * request so that vma->resource is valid in the error capture 179 * path without needing extra reference taking in execbuf path. 
180 */ > 181 if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { > 182 sync_unbind = !list_empty(&vma->vm_capture_link); 183 mutex_unlock(&vma->vm->vm_capture_lock); 184 } 185 186 ret = -EBUSY; 187 if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { 188 assert_object_held(vma->obj); 189 ret = i915_vma_unbind_async(vma, vm_trylock); 190 } 191 192 if (ret == -EBUSY && (flags & I915_GEM_OBJECT_UNBIND_ACTIVE || 193 !i915_vma_is_active(vma))) { 194 if (vm_trylock) { 195 if (mutex_trylock(&vma->vm->mutex)) { 196 ret = __i915_vma_unbind(vma); 197 mutex_unlock(&vma->vm->mutex); 198 } 199 } else { 200 ret = i915_vma_unbind(vma); 201 } 202 } 203 204 i915_vm_put(vma->vm); 205 spin_lock(&obj->vma.lock); 206 } 207 list_splice_init(&still_in_list, &obj->vma.list); 208 spin_unlock(&obj->vma.lock); 209 210 if (ret == -EAGAIN && flags & I915_GEM_OBJECT_UNBIND_BARRIER) { 211 rcu_barrier(); /* flush the i915_vm_release() */ 212 goto try_again; 213 } 214 215 intel_runtime_pm_put(rpm, wakeref); 216 217 return ret; 218 } 219
Hi Niranjana, On Tue, Jan 17, 2023 at 11:16:09PM -0800, Niranjana Vishwanathapura wrote: > Support dump capture of persistent mappings upon user request. > > Capture of a mapping is requested with the VM_BIND ioctl and > processed during the GPU error handling. They are synchronously > unbound during eviction so that no additional vma resource > reference taking is required in the submission path. Thus, a > list of persistent vmas requiring capture is maintained instead > of a list of vma resources. > > v2: enable with CONFIG_DRM_I915_CAPTURE_ERROR, remove gfp > overwrite, add kernel-doc and expand commit message > v3: Ensure vma->resource is valid during capture > > Signed-off-by: Brian Welty <brian.welty@intel.com> > Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> > --- > .../drm/i915/gem/i915_gem_vm_bind_object.c | 13 +++++ > drivers/gpu/drm/i915/gt/intel_gtt.c | 5 ++ > drivers/gpu/drm/i915/gt/intel_gtt.h | 7 +++ > drivers/gpu/drm/i915/i915_gem.c | 14 ++++- > drivers/gpu/drm/i915/i915_gpu_error.c | 52 ++++++++++++++++++- > drivers/gpu/drm/i915/i915_vma.c | 4 ++ > drivers/gpu/drm/i915/i915_vma_types.h | 4 ++ > include/uapi/drm/i915_drm.h | 9 +++- > 8 files changed, 104 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > index 78e7c0642c5f..562a67a988f2 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c > @@ -88,6 +88,12 @@ static void i915_gem_vm_bind_remove(struct i915_vma *vma, bool release_obj) > { > lockdep_assert_held(&vma->vm->vm_bind_lock); > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + mutex_lock(&vma->vm->vm_capture_lock); > + if (!list_empty(&vma->vm_capture_link)) > + list_del_init(&vma->vm_capture_link); > + mutex_unlock(&vma->vm->vm_capture_lock); > +#endif > spin_lock(&vma->vm->vm_rebind_lock); > if (!list_empty(&vma->vm_rebind_link)) > 
list_del_init(&vma->vm_rebind_link); > @@ -357,6 +363,13 @@ static int i915_gem_vm_bind_obj(struct i915_address_space *vm, > continue; > } > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + if (va->flags & I915_GEM_VM_BIND_CAPTURE) { > + mutex_lock(&vm->vm_capture_lock); > + list_add_tail(&vma->vm_capture_link, &vm->vm_capture_list); > + mutex_unlock(&vm->vm_capture_lock); > + } > +#endif > list_add_tail(&vma->vm_bind_link, &vm->vm_bound_list); > i915_vm_bind_it_insert(vma, &vm->va); > if (!obj->priv_root) > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c > index 2e4c9fabf3b8..103ca55222be 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gtt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c > @@ -297,6 +297,11 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) > spin_lock_init(&vm->vm_rebind_lock); > spin_lock_init(&vm->userptr_invalidated_lock); > INIT_LIST_HEAD(&vm->userptr_invalidated_list); > + > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + INIT_LIST_HEAD(&vm->vm_capture_list); > + mutex_init(&vm->vm_capture_lock); > +#endif can we have all these, init/add/remove inside a single CONFIG_DRM_I915_CAPTURE_ERROR? 
> } > > void *__px_vaddr(struct drm_i915_gem_object *p) > diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h > index 620b4e020a9f..7f69e1d4fb5e 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gtt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h > @@ -281,6 +281,13 @@ struct i915_address_space { > /** @root_obj: root object for dma-resv sharing by private objects */ > struct drm_i915_gem_object *root_obj; > > +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) > + /* @vm_capture_list: list of vm captures */ > + struct list_head vm_capture_list; > + /* @vm_capture_lock: protects vm_capture_list */ > + struct mutex vm_capture_lock; > +#endif > + > /* Global GTT */ > bool is_ggtt:1; > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 969581e7106f..d97822f203fc 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -143,6 +143,8 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, > while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, > struct i915_vma, > obj_link))) { > + bool sync_unbind = true; > + > list_move_tail(&vma->obj_link, &still_in_list); > if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) > continue; > @@ -171,8 +173,18 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, > * and destroy the vma from under us. > */ > > + /* > + * Synchronously unbind persistent mappings with capture > + * request so that vma->resource is valid in the error capture > + * path without needing extra reference taking in execbuf path. > + */ > + if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { > + sync_unbind = !list_empty(&vma->vm_capture_link); > + mutex_unlock(&vma->vm->vm_capture_lock); > + } this, as well? 
> + > ret = -EBUSY; > - if (flags & I915_GEM_OBJECT_UNBIND_ASYNC) { > + if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { > assert_object_held(vma->obj); > ret = i915_vma_unbind_async(vma, vm_trylock); > } > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 9d5d5a397b64..5ccd1eaea2a5 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -1460,6 +1460,49 @@ capture_vma(struct intel_engine_capture_vma *next, > return next; > } > > +static struct intel_engine_capture_vma * > +capture_user_vm(struct intel_engine_capture_vma *capture, > + struct i915_address_space *vm, gfp_t gfp) > +{ > + struct list_head vm_capture_list; > + struct i915_vma *vma, *vn; > + int err; > + > + INIT_LIST_HEAD(&vm_capture_list); > + > + err = mutex_lock_interruptible(&vm->vm_capture_lock); > + if (err) > + return capture; > + > + /* vma->resource should be checked with vm->mutex held */ > + err = mutex_lock_interruptible(&vm->mutex); > + if (err) > + goto skip_user_vm_capture; > + > + list_for_each_entry_safe(vma, vn, &vm->vm_capture_list, > + vm_capture_link) { > + if (drm_WARN_ONCE(&vm->i915->drm, !vma->resource, > + "vma->resource expected!\n")) > + continue; > + > + i915_vma_resource_get(vma->resource); > + list_move_tail(&vma->vm_capture_link, &vm_capture_list); > + } > + mutex_unlock(&vm->mutex); > + > + list_for_each_entry(vma, &vm_capture_list, vm_capture_link) { > + capture = capture_vma_snapshot(capture, vma->resource, > + gfp, "user"); > + i915_vma_resource_put(vma->resource); > + } > + list_splice_tail(&vm_capture_list, &vm->vm_capture_list); > + > +skip_user_vm_capture: > + mutex_unlock(&vm->vm_capture_lock); > + > + return capture; > +} By any chance we want to have them all here in i915_gpu_error.c? Does it look ugly? Andi > + > static struct intel_engine_capture_vma * > capture_user(struct intel_engine_capture_vma *capture, > const struct i915_request *rq,
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c index 78e7c0642c5f..562a67a988f2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_vm_bind_object.c @@ -88,6 +88,12 @@ static void i915_gem_vm_bind_remove(struct i915_vma *vma, bool release_obj) { lockdep_assert_held(&vma->vm->vm_bind_lock); +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + mutex_lock(&vma->vm->vm_capture_lock); + if (!list_empty(&vma->vm_capture_link)) + list_del_init(&vma->vm_capture_link); + mutex_unlock(&vma->vm->vm_capture_lock); +#endif spin_lock(&vma->vm->vm_rebind_lock); if (!list_empty(&vma->vm_rebind_link)) list_del_init(&vma->vm_rebind_link); @@ -357,6 +363,13 @@ static int i915_gem_vm_bind_obj(struct i915_address_space *vm, continue; } +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + if (va->flags & I915_GEM_VM_BIND_CAPTURE) { + mutex_lock(&vm->vm_capture_lock); + list_add_tail(&vma->vm_capture_link, &vm->vm_capture_list); + mutex_unlock(&vm->vm_capture_lock); + } +#endif list_add_tail(&vma->vm_bind_link, &vm->vm_bound_list); i915_vm_bind_it_insert(vma, &vm->va); if (!obj->priv_root) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 2e4c9fabf3b8..103ca55222be 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -297,6 +297,11 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass) spin_lock_init(&vm->vm_rebind_lock); spin_lock_init(&vm->userptr_invalidated_lock); INIT_LIST_HEAD(&vm->userptr_invalidated_list); + +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + INIT_LIST_HEAD(&vm->vm_capture_list); + mutex_init(&vm->vm_capture_lock); +#endif } void *__px_vaddr(struct drm_i915_gem_object *p) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 620b4e020a9f..7f69e1d4fb5e 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ 
b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -281,6 +281,13 @@ struct i915_address_space { /** @root_obj: root object for dma-resv sharing by private objects */ struct drm_i915_gem_object *root_obj; +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + /* @vm_capture_list: list of vm captures */ + struct list_head vm_capture_list; + /* @vm_capture_lock: protects vm_capture_list */ + struct mutex vm_capture_lock; +#endif + /* Global GTT */ bool is_ggtt:1; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 969581e7106f..d97822f203fc 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -143,6 +143,8 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, while (!ret && (vma = list_first_entry_or_null(&obj->vma.list, struct i915_vma, obj_link))) { + bool sync_unbind = true; + list_move_tail(&vma->obj_link, &still_in_list); if (!i915_vma_is_bound(vma, I915_VMA_BIND_MASK)) continue; @@ -171,8 +173,18 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj, * and destroy the vma from under us. */ + /* + * Synchronously unbind persistent mappings with capture + * request so that vma->resource is valid in the error capture + * path without needing extra reference taking in execbuf path. 
+ */ + if (!mutex_lock_interruptible(&vma->vm->vm_capture_lock)) { + sync_unbind = !list_empty(&vma->vm_capture_link); + mutex_unlock(&vma->vm->vm_capture_lock); + } + ret = -EBUSY; - if (flags & I915_GEM_OBJECT_UNBIND_ASYNC) { + if (!sync_unbind && (flags & I915_GEM_OBJECT_UNBIND_ASYNC)) { assert_object_held(vma->obj); ret = i915_vma_unbind_async(vma, vm_trylock); } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9d5d5a397b64..5ccd1eaea2a5 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1460,6 +1460,49 @@ capture_vma(struct intel_engine_capture_vma *next, return next; } +static struct intel_engine_capture_vma * +capture_user_vm(struct intel_engine_capture_vma *capture, + struct i915_address_space *vm, gfp_t gfp) +{ + struct list_head vm_capture_list; + struct i915_vma *vma, *vn; + int err; + + INIT_LIST_HEAD(&vm_capture_list); + + err = mutex_lock_interruptible(&vm->vm_capture_lock); + if (err) + return capture; + + /* vma->resource should be checked with vm->mutex held */ + err = mutex_lock_interruptible(&vm->mutex); + if (err) + goto skip_user_vm_capture; + + list_for_each_entry_safe(vma, vn, &vm->vm_capture_list, + vm_capture_link) { + if (drm_WARN_ONCE(&vm->i915->drm, !vma->resource, + "vma->resource expected!\n")) + continue; + + i915_vma_resource_get(vma->resource); + list_move_tail(&vma->vm_capture_link, &vm_capture_list); + } + mutex_unlock(&vm->mutex); + + list_for_each_entry(vma, &vm_capture_list, vm_capture_link) { + capture = capture_vma_snapshot(capture, vma->resource, + gfp, "user"); + i915_vma_resource_put(vma->resource); + } + list_splice_tail(&vm_capture_list, &vm->vm_capture_list); + +skip_user_vm_capture: + mutex_unlock(&vm->vm_capture_lock); + + return capture; +} + static struct intel_engine_capture_vma * capture_user(struct intel_engine_capture_vma *capture, const struct i915_request *rq, @@ -1467,6 +1510,8 @@ capture_user(struct 
intel_engine_capture_vma *capture, { struct i915_capture_list *c; + capture = capture_user_vm(capture, rq->context->vm, gfp); + for (c = rq->capture_list; c; c = c->next) capture = capture_vma_snapshot(capture, c->vma_res, gfp, "user"); @@ -1548,8 +1593,13 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee, * as the simplest method to avoid being overwritten * by userspace. */ - vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); + + /* + * Ensure capture_user_vm which takes vm->mutex gets called first + * as snapshoting the first vma starts dma fence critical section. + */ vma = capture_user(vma, rq, gfp); + vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); vma = capture_vma(vma, rq->ring->vma, "ring", gfp); vma = capture_vma(vma, rq->context->state, "HW context", gfp); diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 2f0994f0ed42..b47715fa773f 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -248,6 +248,10 @@ vma_create(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(&vma->non_priv_vm_bind_link); INIT_LIST_HEAD(&vma->vm_rebind_link); INIT_LIST_HEAD(&vma->userptr_invalidated_link); + +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + INIT_LIST_HEAD(&vma->vm_capture_link); +#endif return vma; err_unlock: diff --git a/drivers/gpu/drm/i915/i915_vma_types.h b/drivers/gpu/drm/i915/i915_vma_types.h index 89f9854a6f69..c4fd61d51ce6 100644 --- a/drivers/gpu/drm/i915/i915_vma_types.h +++ b/drivers/gpu/drm/i915/i915_vma_types.h @@ -310,6 +310,10 @@ struct i915_vma { struct list_head vm_rebind_link; /* Link in vm_rebind_list */ /** @userptr_invalidated_link: link to the vm->userptr_invalidated_list */ struct list_head userptr_invalidated_link; +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + /* @vm_capture_link: link to the captureable VMA list */ + struct list_head vm_capture_link; +#endif /** Timeline fence for vm_bind completion notification */ struct { diff 
--git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index b9167f950327..5fde6020e339 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3925,12 +3925,17 @@ struct drm_i915_gem_vm_bind { __u64 length; /** - * @flags: Currently reserved, MBZ. + * @flags: Supported flags are: + * + * I915_GEM_VM_BIND_CAPTURE: + * Capture this mapping in the dump upon GPU error. + * CONFIG_DRM_I915_CAPTURE_ERROR should be enabled for valid capture. * * Note that @fence carries its own flags. */ __u64 flags; -#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (~0ull) +#define I915_GEM_VM_BIND_CAPTURE (1ull << 0) +#define __I915_GEM_VM_BIND_UNKNOWN_FLAGS (-(I915_GEM_VM_BIND_CAPTURE << 1)) /** @rsvd: Reserved, MBZ */ __u64 rsvd[2];