Message ID | 20221018071630.3831-16-niranjana.vishwanathapura@intel.com (mailing list archive)
---|---
State | New, archived
Series | drm/i915/vm_bind: Add VM_BIND functionality
On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: > Handle persistent (VM_BIND) mappings during the request submission > in the execbuf3 path. > > v2: Ensure requests wait for bindings to complete. > v3: Remove short term pinning with PIN_VALIDATE flag. > Individualize fences before adding to dma_resv obj. > > Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> > --- > .../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 208 +++++++++++++++++- > 1 file changed, 207 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > index a9b4cc44bf66..8120e4c6b7da 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > @@ -3,6 +3,7 @@ > * Copyright © 2022 Intel Corporation > */ > > +#include <linux/dma-fence-array.h> > #include <linux/dma-resv.h> > #include <linux/uaccess.h> > > @@ -19,6 +20,7 @@ > #include "i915_gem_vm_bind.h" > #include "i915_trace.h" > > +#define __EXEC3_HAS_PIN BIT_ULL(33) > #define __EXEC3_ENGINE_PINNED BIT_ULL(32) > #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) > > @@ -42,7 +44,9 @@ > * execlist. Hence, no support for implicit sync. > * > * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only > - * works with execbuf3 ioctl for submission. > + * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through > + * VM_BIND call) at the time of execbuf3 call are deemed required for that > + * submission. > * > * The execbuf3 ioctl directly specifies the batch addresses instead of as > * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not > @@ -58,6 +62,13 @@ > * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, > * vma lookup table, implicit sync, vma active reference tracking etc., are not > * applicable for execbuf3 ioctl. > + * > + * During each execbuf submission, request fence is added to all VM_BIND mapped > + * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will > + * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and > + * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and > + * hence should not be used for end of batch check. Instead, the execbuf3 > + * timeline out fence should be used for end of batch check. > */ > > /** > @@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) > return i915_gem_vm_bind_lookup_vma(vm, va); > } > > +static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) > +{ > + struct i915_vma *vma, *vn; > + > + /** > + * Move all unbound vmas back into vm_bind_list so that they are > + * revalidated. 
> + */ > + spin_lock(&vm->vm_rebind_lock); > + list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { > + list_del_init(&vma->vm_rebind_link); > + if (!list_empty(&vma->vm_bind_link)) > + list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); > + } > + spin_unlock(&vm->vm_rebind_lock); > +} > + > static int eb_lookup_vma_all(struct i915_execbuffer *eb) > { > unsigned int i, current_batch = 0; > @@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) > ++current_batch; > } > > + eb_scoop_unbound_vma_all(eb->context->vm); > + > + return 0; > +} > + > +static int eb_lock_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int err; > + > + err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); > + if (err) > + return err; > + > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) { > + err = i915_gem_object_lock(vma->obj, &eb->ww); > + if (err) > + return err; > + } > + > return 0; > } > > +static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma, *vn; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + > + if (!(eb->args->flags & __EXEC3_HAS_PIN)) > + return; > + > + assert_object_held(vm->root_obj); > + > + list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) > + if (i915_vma_verify_bind_complete(vma)) > + list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); > + > + eb->args->flags &= ~__EXEC3_HAS_PIN; > +} > + > static void eb_release_vma_all(struct i915_execbuffer *eb) > { > + eb_release_persistent_vma_all(eb); > eb_unpin_engine(eb); > } > > +static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + u64 num_fences = 1; > + struct i915_vma *vma; > + int ret; > + > + /* Reserve enough slots to accommodate composite fences */ > + if (intel_context_is_parallel(eb->context)) > + num_fences = eb->num_batches; > + > + ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); > + if (ret) > + return ret; > + > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) { > + ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int ret = 0; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + assert_object_held(vm->root_obj); > + > + ret = eb_reserve_fence_for_persistent_vma_all(eb); > + if (ret) > + return ret; > + > + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { > + u64 pin_flags = vma->start | PIN_OFFSET_FIXED | > + PIN_USER | PIN_VALIDATE; > + > + ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); > + if (ret) > + break; > + > + eb->args->flags |= __EXEC3_HAS_PIN; > + } > + > + return ret; > +} > + > /* > * Using two helper loops for the order of which requests / batches are created > * and added the to backend. 
Requests are created in order from the parent to > @@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) > */ > #define for_each_batch_create_order(_eb) \ > for (unsigned int i = 0; i < (_eb)->num_batches; ++i) > +#define for_each_batch_add_order(_eb) \ > + for (int i = (_eb)->num_batches - 1; i >= 0; --i) > + > +static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, > + struct dma_fence *fence) > +{ > + struct dma_fence *curr; > + int idx; > + > + dma_fence_array_for_each(curr, idx, fence) > + dma_resv_add_fence(obj->base.resv, curr, > + DMA_RESV_USAGE_BOOKKEEP); > + > + obj->write_domain = 0; > + obj->read_domains |= I915_GEM_GPU_DOMAINS; > + obj->mm.dirty = true; > +} > + > +static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct dma_fence *fence; > + struct i915_vma *vma; > + > + fence = eb->composite_fence ? eb->composite_fence : > + &eb->requests[0]->fence; > + > + __eb_persistent_add_shared_fence(vm->root_obj, fence); > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) > + __eb_persistent_add_shared_fence(vma->obj, fence); > +} > + > +static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) > +{ > + /* Add fence to BOs dma-resv fence list */ > + eb_persistent_add_shared_fence(eb); > +} > > static int eb_move_to_gpu(struct i915_execbuffer *eb) > { > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int err = 0; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + assert_object_held(vm->root_obj); > + > + eb_move_all_persistent_vma_to_active(eb); > + > + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { > + for_each_batch_add_order(eb) { > + if (!eb->requests[i]) > + continue; > + > + err = i915_request_await_bind(eb->requests[i], vma); > + if (err) > + goto err_skip; > + } > + } > + > /* Unconditionally flush any chipset caches (for streaming writes). */ > intel_gt_chipset_flush(eb->gt); > > return 0; > + > +err_skip: > + for_each_batch_create_order(eb) { > + if (!eb->requests[i]) > + break; > + > + i915_request_set_error_once(eb->requests[i], err); > + } > + return err; > } > > static int eb_request_submit(struct i915_execbuffer *eb, > @@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, > > mutex_lock(&eb.context->vm->vm_bind_lock); > > +lookup_vmas: > err = eb_lookup_vma_all(&eb); > if (err) { > eb_release_vma_all(&eb); > @@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, > /* only throttle once, even if we didn't need to throttle */ > throttle = false; > > + err = eb_lock_vma_all(&eb); > + if (err) > + goto err_validate; > + > + /** > + * No object unbinds possible once the objects are locked. So, > + * check for any unbinds here, which needs to be scooped up. > + */ > + if (!list_empty(&eb.context->vm->vm_rebind_list)) { > + eb_release_vma_all(&eb); > + i915_gem_ww_ctx_fini(&eb.ww); > + goto lookup_vmas; > + } Is it not possible to grab the object locks first, and then move stuff off the rebind_list to be re-validated? Or if not maybe a comment to explain? > + > + err = eb_validate_persistent_vma_all(&eb); > + > err_validate: > if (err == -EDEADLK) { > eb_release_vma_all(&eb);
On Tue, Oct 18, 2022 at 07:01:57PM +0100, Matthew Auld wrote: >On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: >>Handle persistent (VM_BIND) mappings during the request submission >>in the execbuf3 path. >> >>v2: Ensure requests wait for bindings to complete. >>v3: Remove short term pinning with PIN_VALIDATE flag. >> Individualize fences before adding to dma_resv obj. >> >>Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> >>Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> >>--- >> .../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 208 +++++++++++++++++- >> 1 file changed, 207 insertions(+), 1 deletion(-) >> >>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>index a9b4cc44bf66..8120e4c6b7da 100644 >>--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>@@ -3,6 +3,7 @@ >> * Copyright © 2022 Intel Corporation >> */ >>+#include <linux/dma-fence-array.h> >> #include <linux/dma-resv.h> >> #include <linux/uaccess.h> >>@@ -19,6 +20,7 @@ >> #include "i915_gem_vm_bind.h" >> #include "i915_trace.h" >>+#define __EXEC3_HAS_PIN BIT_ULL(33) >> #define __EXEC3_ENGINE_PINNED BIT_ULL(32) >> #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) >>@@ -42,7 +44,9 @@ >> * execlist. Hence, no support for implicit sync. >> * >> * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only >>- * works with execbuf3 ioctl for submission. >>+ * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through >>+ * VM_BIND call) at the time of execbuf3 call are deemed required for that >>+ * submission. >> * >> * The execbuf3 ioctl directly specifies the batch addresses instead of as >> * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not >>@@ -58,6 +62,13 @@ >> * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, >> * vma lookup table, implicit sync, vma active reference tracking etc., are not >> * applicable for execbuf3 ioctl. >>+ * >>+ * During each execbuf submission, request fence is added to all VM_BIND mapped >>+ * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will >>+ * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and >>+ * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and >>+ * hence should not be used for end of batch check. Instead, the execbuf3 >>+ * timeline out fence should be used for end of batch check. >> */ >> /** >>@@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) >> return i915_gem_vm_bind_lookup_vma(vm, va); >> } >>+static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) >>+{ >>+ struct i915_vma *vma, *vn; >>+ >>+ /** >>+ * Move all unbound vmas back into vm_bind_list so that they are >>+ * revalidated. 
>>+ */ >>+ spin_lock(&vm->vm_rebind_lock); >>+ list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { >>+ list_del_init(&vma->vm_rebind_link); >>+ if (!list_empty(&vma->vm_bind_link)) >>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); >>+ } >>+ spin_unlock(&vm->vm_rebind_lock); >>+} >>+ >> static int eb_lookup_vma_all(struct i915_execbuffer *eb) >> { >> unsigned int i, current_batch = 0; >>@@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) >> ++current_batch; >> } >>+ eb_scoop_unbound_vma_all(eb->context->vm); >>+ >>+ return 0; >>+} >>+ >>+static int eb_lock_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int err; >>+ >>+ err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); >>+ if (err) >>+ return err; >>+ >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) { >>+ err = i915_gem_object_lock(vma->obj, &eb->ww); >>+ if (err) >>+ return err; >>+ } >>+ >> return 0; >> } >>+static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma, *vn; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ >>+ if (!(eb->args->flags & __EXEC3_HAS_PIN)) >>+ return; >>+ >>+ assert_object_held(vm->root_obj); >>+ >>+ list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) >>+ if (i915_vma_verify_bind_complete(vma)) >>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); >>+ >>+ eb->args->flags &= ~__EXEC3_HAS_PIN; >>+} >>+ >> static void eb_release_vma_all(struct i915_execbuffer *eb) >> { >>+ eb_release_persistent_vma_all(eb); >> eb_unpin_engine(eb); >> } >>+static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ u64 num_fences = 1; >>+ struct i915_vma *vma; >>+ int ret; >>+ >>+ /* Reserve enough slots to accommodate composite fences */ >>+ if (intel_context_is_parallel(eb->context)) >>+ num_fences = eb->num_batches; >>+ >>+ ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); >>+ if (ret) >>+ return ret; >>+ >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) { >>+ ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); >>+ if (ret) >>+ return ret; >>+ } >>+ >>+ return 0; >>+} >>+ >>+static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int ret = 0; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ assert_object_held(vm->root_obj); >>+ >>+ ret = eb_reserve_fence_for_persistent_vma_all(eb); >>+ if (ret) >>+ return ret; >>+ >>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>+ u64 pin_flags = vma->start | PIN_OFFSET_FIXED | >>+ PIN_USER | PIN_VALIDATE; >>+ >>+ ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); >>+ if (ret) >>+ break; >>+ >>+ eb->args->flags |= __EXEC3_HAS_PIN; >>+ } >>+ >>+ return ret; >>+} >>+ >> /* >> * Using two helper loops for the order of which requests / batches are created >> * and added the to backend. 
Requests are created in order from the parent to >>@@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) >> */ >> #define for_each_batch_create_order(_eb) \ >> for (unsigned int i = 0; i < (_eb)->num_batches; ++i) >>+#define for_each_batch_add_order(_eb) \ >>+ for (int i = (_eb)->num_batches - 1; i >= 0; --i) >>+ >>+static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, >>+ struct dma_fence *fence) >>+{ >>+ struct dma_fence *curr; >>+ int idx; >>+ >>+ dma_fence_array_for_each(curr, idx, fence) >>+ dma_resv_add_fence(obj->base.resv, curr, >>+ DMA_RESV_USAGE_BOOKKEEP); >>+ >>+ obj->write_domain = 0; >>+ obj->read_domains |= I915_GEM_GPU_DOMAINS; >>+ obj->mm.dirty = true; >>+} >>+ >>+static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct dma_fence *fence; >>+ struct i915_vma *vma; >>+ >>+ fence = eb->composite_fence ? eb->composite_fence : >>+ &eb->requests[0]->fence; >>+ >>+ __eb_persistent_add_shared_fence(vm->root_obj, fence); >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) >>+ __eb_persistent_add_shared_fence(vma->obj, fence); >>+} >>+ >>+static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) >>+{ >>+ /* Add fence to BOs dma-resv fence list */ >>+ eb_persistent_add_shared_fence(eb); >>+} >> static int eb_move_to_gpu(struct i915_execbuffer *eb) >> { >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int err = 0; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ assert_object_held(vm->root_obj); >>+ >>+ eb_move_all_persistent_vma_to_active(eb); >>+ >>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>+ for_each_batch_add_order(eb) { >>+ if (!eb->requests[i]) >>+ continue; >>+ >>+ err = i915_request_await_bind(eb->requests[i], vma); >>+ if (err) >>+ goto err_skip; >>+ } >>+ } >>+ >> /* Unconditionally flush any chipset caches (for streaming writes). */ >> intel_gt_chipset_flush(eb->gt); >> return 0; >>+ >>+err_skip: >>+ for_each_batch_create_order(eb) { >>+ if (!eb->requests[i]) >>+ break; >>+ >>+ i915_request_set_error_once(eb->requests[i], err); >>+ } >>+ return err; >> } >> static int eb_request_submit(struct i915_execbuffer *eb, >>@@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> mutex_lock(&eb.context->vm->vm_bind_lock); >>+lookup_vmas: >> err = eb_lookup_vma_all(&eb); >> if (err) { >> eb_release_vma_all(&eb); >>@@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> /* only throttle once, even if we didn't need to throttle */ >> throttle = false; >>+ err = eb_lock_vma_all(&eb); >>+ if (err) >>+ goto err_validate; >>+ >>+ /** >>+ * No object unbinds possible once the objects are locked. So, >>+ * check for any unbinds here, which needs to be scooped up. >>+ */ >>+ if (!list_empty(&eb.context->vm->vm_rebind_list)) { >>+ eb_release_vma_all(&eb); >>+ i915_gem_ww_ctx_fini(&eb.ww); >>+ goto lookup_vmas; >>+ } > >Is it not possible to grab the object locks first, and then move stuff >off the rebind_list to be re-validated? Or if not maybe a comment to >explain? > We need to do the rebind_list scooping in lookup phase of the execbuf path so that userptr_submit_init() can be called on them without holding the object lock. Object locks are held during validation phase. Ok, will add some documention where we invoke eb_scoop_unbound_vma_all(). 
Niranjana >>+ >>+ err = eb_validate_persistent_vma_all(&eb); >>+ >> err_validate: >> if (err == -EDEADLK) { >> eb_release_vma_all(&eb);
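For orientation, the ordering Niranjana describes in this reply can be condensed into the sketch below, pieced together from the i915_gem_do_execbuffer() hunks in the patch. The wrapper name is invented here purely for illustration, and throttling, request creation and the ww backoff/i915_gem_ww_ctx_fini() handling on retry are omitted:

```c
/*
 * Condensed, illustrative flow only: pieced together from the
 * i915_gem_do_execbuffer() hunks in this patch. The real code also
 * handles throttling, -EDEADLK backoff and finishes the ww context
 * before retrying the lookup.
 */
static int execbuf3_prepare_vmas(struct i915_execbuffer *eb)
{
	struct i915_address_space *vm = eb->context->vm;
	int err;

lookup_vmas:
	/* Lookup phase: no object locks held, so userptr prep can run here. */
	err = eb_lookup_vma_all(eb);		/* also scoops vm_rebind_list */
	if (err)
		return err;

	/* Validation phase: ww-lock root_obj plus all non-private objects. */
	err = eb_lock_vma_all(eb);
	if (err)
		return err;

	/* A vma may have been unbound between lookup and locking; redo lookup. */
	if (!list_empty(&vm->vm_rebind_list)) {
		eb_release_vma_all(eb);
		goto lookup_vmas;
	}

	/* Re-validate everything on vm_bind_list with PIN_VALIDATE. */
	return eb_validate_persistent_vma_all(eb);
}
```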
On Tue, Oct 18, 2022 at 01:20:06PM -0700, Niranjana Vishwanathapura wrote: >On Tue, Oct 18, 2022 at 07:01:57PM +0100, Matthew Auld wrote: >>On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: >>>Handle persistent (VM_BIND) mappings during the request submission >>>in the execbuf3 path. >>> >>>v2: Ensure requests wait for bindings to complete. >>>v3: Remove short term pinning with PIN_VALIDATE flag. >>> Individualize fences before adding to dma_resv obj. >>> >>>Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> >>>Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> >>>--- >>> .../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 208 +++++++++++++++++- >>> 1 file changed, 207 insertions(+), 1 deletion(-) >>> >>>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>>index a9b4cc44bf66..8120e4c6b7da 100644 >>>--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>>+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>>@@ -3,6 +3,7 @@ >>> * Copyright © 2022 Intel Corporation >>> */ >>>+#include <linux/dma-fence-array.h> >>> #include <linux/dma-resv.h> >>> #include <linux/uaccess.h> >>>@@ -19,6 +20,7 @@ >>> #include "i915_gem_vm_bind.h" >>> #include "i915_trace.h" >>>+#define __EXEC3_HAS_PIN BIT_ULL(33) >>> #define __EXEC3_ENGINE_PINNED BIT_ULL(32) >>> #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) >>>@@ -42,7 +44,9 @@ >>> * execlist. Hence, no support for implicit sync. >>> * >>> * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only >>>- * works with execbuf3 ioctl for submission. >>>+ * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through >>>+ * VM_BIND call) at the time of execbuf3 call are deemed required for that >>>+ * submission. >>> * >>> * The execbuf3 ioctl directly specifies the batch addresses instead of as >>> * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not >>>@@ -58,6 +62,13 @@ >>> * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, >>> * vma lookup table, implicit sync, vma active reference tracking etc., are not >>> * applicable for execbuf3 ioctl. >>>+ * >>>+ * During each execbuf submission, request fence is added to all VM_BIND mapped >>>+ * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will >>>+ * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and >>>+ * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and >>>+ * hence should not be used for end of batch check. Instead, the execbuf3 >>>+ * timeline out fence should be used for end of batch check. >>> */ >>> /** >>>@@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) >>> return i915_gem_vm_bind_lookup_vma(vm, va); >>> } >>>+static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) >>>+{ >>>+ struct i915_vma *vma, *vn; >>>+ >>>+ /** >>>+ * Move all unbound vmas back into vm_bind_list so that they are >>>+ * revalidated. 
>>>+ */ >>>+ spin_lock(&vm->vm_rebind_lock); >>>+ list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { >>>+ list_del_init(&vma->vm_rebind_link); >>>+ if (!list_empty(&vma->vm_bind_link)) >>>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); >>>+ } >>>+ spin_unlock(&vm->vm_rebind_lock); >>>+} >>>+ >>> static int eb_lookup_vma_all(struct i915_execbuffer *eb) >>> { >>> unsigned int i, current_batch = 0; >>>@@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) >>> ++current_batch; >>> } >>>+ eb_scoop_unbound_vma_all(eb->context->vm); >>>+ >>>+ return 0; >>>+} >>>+ >>>+static int eb_lock_vma_all(struct i915_execbuffer *eb) >>>+{ >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ struct i915_vma *vma; >>>+ int err; >>>+ >>>+ err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); >>>+ if (err) >>>+ return err; >>>+ >>>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>>+ non_priv_vm_bind_link) { >>>+ err = i915_gem_object_lock(vma->obj, &eb->ww); >>>+ if (err) >>>+ return err; >>>+ } >>>+ >>> return 0; >>> } >>>+static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) >>>+{ >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ struct i915_vma *vma, *vn; >>>+ >>>+ lockdep_assert_held(&vm->vm_bind_lock); >>>+ >>>+ if (!(eb->args->flags & __EXEC3_HAS_PIN)) >>>+ return; >>>+ >>>+ assert_object_held(vm->root_obj); >>>+ >>>+ list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) >>>+ if (i915_vma_verify_bind_complete(vma)) >>>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); >>>+ >>>+ eb->args->flags &= ~__EXEC3_HAS_PIN; >>>+} >>>+ >>> static void eb_release_vma_all(struct i915_execbuffer *eb) >>> { >>>+ eb_release_persistent_vma_all(eb); >>> eb_unpin_engine(eb); >>> } >>>+static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) >>>+{ >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ u64 num_fences = 1; >>>+ struct i915_vma *vma; >>>+ int ret; >>>+ >>>+ /* Reserve enough slots to accommodate composite fences */ >>>+ if (intel_context_is_parallel(eb->context)) >>>+ num_fences = eb->num_batches; >>>+ >>>+ ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); >>>+ if (ret) >>>+ return ret; >>>+ >>>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>>+ non_priv_vm_bind_link) { >>>+ ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); >>>+ if (ret) >>>+ return ret; >>>+ } >>>+ >>>+ return 0; >>>+} >>>+ >>>+static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) >>>+{ >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ struct i915_vma *vma; >>>+ int ret = 0; >>>+ >>>+ lockdep_assert_held(&vm->vm_bind_lock); >>>+ assert_object_held(vm->root_obj); >>>+ >>>+ ret = eb_reserve_fence_for_persistent_vma_all(eb); >>>+ if (ret) >>>+ return ret; >>>+ >>>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>>+ u64 pin_flags = vma->start | PIN_OFFSET_FIXED | >>>+ PIN_USER | PIN_VALIDATE; >>>+ >>>+ ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); >>>+ if (ret) >>>+ break; >>>+ >>>+ eb->args->flags |= __EXEC3_HAS_PIN; >>>+ } >>>+ >>>+ return ret; >>>+} >>>+ >>> /* >>> * Using two helper loops for the order of which requests / batches are created >>> * and added the to backend. 
Requests are created in order from the parent to >>>@@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) >>> */ >>> #define for_each_batch_create_order(_eb) \ >>> for (unsigned int i = 0; i < (_eb)->num_batches; ++i) >>>+#define for_each_batch_add_order(_eb) \ >>>+ for (int i = (_eb)->num_batches - 1; i >= 0; --i) >>>+ >>>+static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, >>>+ struct dma_fence *fence) >>>+{ >>>+ struct dma_fence *curr; >>>+ int idx; >>>+ >>>+ dma_fence_array_for_each(curr, idx, fence) >>>+ dma_resv_add_fence(obj->base.resv, curr, >>>+ DMA_RESV_USAGE_BOOKKEEP); >>>+ >>>+ obj->write_domain = 0; >>>+ obj->read_domains |= I915_GEM_GPU_DOMAINS; >>>+ obj->mm.dirty = true; >>>+} >>>+ >>>+static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) >>>+{ >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ struct dma_fence *fence; >>>+ struct i915_vma *vma; >>>+ >>>+ fence = eb->composite_fence ? eb->composite_fence : >>>+ &eb->requests[0]->fence; >>>+ >>>+ __eb_persistent_add_shared_fence(vm->root_obj, fence); >>>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>>+ non_priv_vm_bind_link) >>>+ __eb_persistent_add_shared_fence(vma->obj, fence); >>>+} >>>+ >>>+static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) >>>+{ >>>+ /* Add fence to BOs dma-resv fence list */ >>>+ eb_persistent_add_shared_fence(eb); >>>+} >>> static int eb_move_to_gpu(struct i915_execbuffer *eb) >>> { >>>+ struct i915_address_space *vm = eb->context->vm; >>>+ struct i915_vma *vma; >>>+ int err = 0; >>>+ >>>+ lockdep_assert_held(&vm->vm_bind_lock); >>>+ assert_object_held(vm->root_obj); >>>+ >>>+ eb_move_all_persistent_vma_to_active(eb); >>>+ >>>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>>+ for_each_batch_add_order(eb) { >>>+ if (!eb->requests[i]) >>>+ continue; >>>+ >>>+ err = i915_request_await_bind(eb->requests[i], vma); >>>+ if (err) >>>+ goto err_skip; >>>+ } >>>+ } >>>+ >>> /* Unconditionally flush any chipset caches (for streaming writes). */ >>> intel_gt_chipset_flush(eb->gt); >>> return 0; >>>+ >>>+err_skip: >>>+ for_each_batch_create_order(eb) { >>>+ if (!eb->requests[i]) >>>+ break; >>>+ >>>+ i915_request_set_error_once(eb->requests[i], err); >>>+ } >>>+ return err; >>> } >>> static int eb_request_submit(struct i915_execbuffer *eb, >>>@@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, >>> mutex_lock(&eb.context->vm->vm_bind_lock); >>>+lookup_vmas: >>> err = eb_lookup_vma_all(&eb); >>> if (err) { >>> eb_release_vma_all(&eb); >>>@@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, >>> /* only throttle once, even if we didn't need to throttle */ >>> throttle = false; >>>+ err = eb_lock_vma_all(&eb); >>>+ if (err) >>>+ goto err_validate; >>>+ >>>+ /** >>>+ * No object unbinds possible once the objects are locked. So, >>>+ * check for any unbinds here, which needs to be scooped up. >>>+ */ >>>+ if (!list_empty(&eb.context->vm->vm_rebind_list)) { >>>+ eb_release_vma_all(&eb); >>>+ i915_gem_ww_ctx_fini(&eb.ww); >>>+ goto lookup_vmas; >>>+ } >> >>Is it not possible to grab the object locks first, and then move >>stuff off the rebind_list to be re-validated? Or if not maybe a >>comment to explain? >> > >We need to do the rebind_list scooping in lookup phase of the execbuf >path so that userptr_submit_init() can be called on them without >holding the object lock. Object locks are held during validation phase. 
>Ok, will add some documention where we invoke eb_scoop_unbound_vma_all(). > Hmm... seems a bit more complicated than that. For userptr, we have a separate userptr_invalidated_list. So, probably the rebind_list can be scooped in the validation phase under the object locks, unless there are other ways the userptr objects get unbound other than through mmu invalidation. Niranjana >Niranjana > >>>+ >>>+ err = eb_validate_persistent_vma_all(&eb); >>>+ >>> err_validate: >>> if (err == -EDEADLK) { >>> eb_release_vma_all(&eb);
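If the separate userptr_invalidated_list does make it safe to scoop under the object locks, the relock-and-retry loop in i915_gem_do_execbuffer() could collapse into something like the sketch below. This is only an illustration of the alternative being discussed, reusing the helpers from this patch; the function name is invented and nothing here is part of the posted series:

```c
/*
 * Sketch of the alternative ordering discussed above (not part of the
 * series): take the ww locks first, then drain vm_rebind_list while the
 * locks are held, so no unbind can race in and no relock/retry loop is
 * needed.
 */
static int eb_lock_and_scoop_vma_all(struct i915_execbuffer *eb)
{
	struct i915_address_space *vm = eb->context->vm;
	int err;

	err = eb_lock_vma_all(eb);	/* root_obj + non-private objects */
	if (err)
		return err;

	/*
	 * Objects are locked, so nothing new can land on vm_rebind_list
	 * behind our back; this assumes userptr invalidation is handled
	 * via the separate userptr_invalidated_list mentioned above.
	 */
	eb_scoop_unbound_vma_all(vm);

	return eb_validate_persistent_vma_all(eb);
}
```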
On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: > Handle persistent (VM_BIND) mappings during the request submission > in the execbuf3 path. > > v2: Ensure requests wait for bindings to complete. > v3: Remove short term pinning with PIN_VALIDATE flag. > Individualize fences before adding to dma_resv obj. > > Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: > Handle persistent (VM_BIND) mappings during the request submission > in the execbuf3 path. > > v2: Ensure requests wait for bindings to complete. > v3: Remove short term pinning with PIN_VALIDATE flag. > Individualize fences before adding to dma_resv obj. > > Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> > --- > .../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 208 +++++++++++++++++- > 1 file changed, 207 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > index a9b4cc44bf66..8120e4c6b7da 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c > @@ -3,6 +3,7 @@ > * Copyright © 2022 Intel Corporation > */ > > +#include <linux/dma-fence-array.h> > #include <linux/dma-resv.h> > #include <linux/uaccess.h> > > @@ -19,6 +20,7 @@ > #include "i915_gem_vm_bind.h" > #include "i915_trace.h" > > +#define __EXEC3_HAS_PIN BIT_ULL(33) > #define __EXEC3_ENGINE_PINNED BIT_ULL(32) > #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) > > @@ -42,7 +44,9 @@ > * execlist. Hence, no support for implicit sync. > * > * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only > - * works with execbuf3 ioctl for submission. > + * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through > + * VM_BIND call) at the time of execbuf3 call are deemed required for that > + * submission. > * > * The execbuf3 ioctl directly specifies the batch addresses instead of as > * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not > @@ -58,6 +62,13 @@ > * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, > * vma lookup table, implicit sync, vma active reference tracking etc., are not > * applicable for execbuf3 ioctl. > + * > + * During each execbuf submission, request fence is added to all VM_BIND mapped > + * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will > + * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and > + * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and > + * hence should not be used for end of batch check. Instead, the execbuf3 > + * timeline out fence should be used for end of batch check. > */ > > /** > @@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) > return i915_gem_vm_bind_lookup_vma(vm, va); > } > > +static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) > +{ > + struct i915_vma *vma, *vn; > + > + /** > + * Move all unbound vmas back into vm_bind_list so that they are > + * revalidated. 
> + */ > + spin_lock(&vm->vm_rebind_lock); > + list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { > + list_del_init(&vma->vm_rebind_link); > + if (!list_empty(&vma->vm_bind_link)) > + list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); > + } > + spin_unlock(&vm->vm_rebind_lock); > +} > + > static int eb_lookup_vma_all(struct i915_execbuffer *eb) > { > unsigned int i, current_batch = 0; > @@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) > ++current_batch; > } > > + eb_scoop_unbound_vma_all(eb->context->vm); > + > + return 0; > +} > + > +static int eb_lock_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int err; > + > + err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); > + if (err) > + return err; > + > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) { > + err = i915_gem_object_lock(vma->obj, &eb->ww); > + if (err) > + return err; > + } > + > return 0; > } > > +static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma, *vn; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + > + if (!(eb->args->flags & __EXEC3_HAS_PIN)) > + return; > + > + assert_object_held(vm->root_obj); > + > + list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) > + if (i915_vma_verify_bind_complete(vma)) > + list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); > + > + eb->args->flags &= ~__EXEC3_HAS_PIN; > +} > + > static void eb_release_vma_all(struct i915_execbuffer *eb) > { > + eb_release_persistent_vma_all(eb); > eb_unpin_engine(eb); > } > > +static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + u64 num_fences = 1; > + struct i915_vma *vma; > + int ret; > + > + /* Reserve enough slots to accommodate composite fences */ > + if (intel_context_is_parallel(eb->context)) > + num_fences = eb->num_batches; > + > + ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); > + if (ret) > + return ret; > + > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) { > + ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int ret = 0; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + assert_object_held(vm->root_obj); > + > + ret = eb_reserve_fence_for_persistent_vma_all(eb); > + if (ret) > + return ret; > + > + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { > + u64 pin_flags = vma->start | PIN_OFFSET_FIXED | > + PIN_USER | PIN_VALIDATE; Should we also add NOEVICT here and in vm_bind? Or is eviction somehow still desired in vm_bind world? > + > + ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); > + if (ret) > + break; > + > + eb->args->flags |= __EXEC3_HAS_PIN; > + } > + > + return ret; > +} > + > /* > * Using two helper loops for the order of which requests / batches are created > * and added the to backend. 
Requests are created in order from the parent to > @@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) > */ > #define for_each_batch_create_order(_eb) \ > for (unsigned int i = 0; i < (_eb)->num_batches; ++i) > +#define for_each_batch_add_order(_eb) \ > + for (int i = (_eb)->num_batches - 1; i >= 0; --i) > + > +static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, > + struct dma_fence *fence) > +{ > + struct dma_fence *curr; > + int idx; > + > + dma_fence_array_for_each(curr, idx, fence) > + dma_resv_add_fence(obj->base.resv, curr, > + DMA_RESV_USAGE_BOOKKEEP); > + > + obj->write_domain = 0; > + obj->read_domains |= I915_GEM_GPU_DOMAINS; > + obj->mm.dirty = true; > +} > + > +static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) > +{ > + struct i915_address_space *vm = eb->context->vm; > + struct dma_fence *fence; > + struct i915_vma *vma; > + > + fence = eb->composite_fence ? eb->composite_fence : > + &eb->requests[0]->fence; > + > + __eb_persistent_add_shared_fence(vm->root_obj, fence); > + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, > + non_priv_vm_bind_link) > + __eb_persistent_add_shared_fence(vma->obj, fence); > +} > + > +static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) > +{ > + /* Add fence to BOs dma-resv fence list */ > + eb_persistent_add_shared_fence(eb); > +} > > static int eb_move_to_gpu(struct i915_execbuffer *eb) > { > + struct i915_address_space *vm = eb->context->vm; > + struct i915_vma *vma; > + int err = 0; > + > + lockdep_assert_held(&vm->vm_bind_lock); > + assert_object_held(vm->root_obj); > + > + eb_move_all_persistent_vma_to_active(eb); > + > + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { > + for_each_batch_add_order(eb) { > + if (!eb->requests[i]) > + continue; > + > + err = i915_request_await_bind(eb->requests[i], vma); > + if (err) > + goto err_skip; > + } > + } > + > /* Unconditionally flush any chipset caches (for streaming writes). */ > intel_gt_chipset_flush(eb->gt); > > return 0; > + > +err_skip: > + for_each_batch_create_order(eb) { > + if (!eb->requests[i]) > + break; > + > + i915_request_set_error_once(eb->requests[i], err); > + } > + return err; > } > > static int eb_request_submit(struct i915_execbuffer *eb, > @@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, > > mutex_lock(&eb.context->vm->vm_bind_lock); > > +lookup_vmas: > err = eb_lookup_vma_all(&eb); > if (err) { > eb_release_vma_all(&eb); > @@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, > /* only throttle once, even if we didn't need to throttle */ > throttle = false; > > + err = eb_lock_vma_all(&eb); > + if (err) > + goto err_validate; > + > + /** > + * No object unbinds possible once the objects are locked. So, > + * check for any unbinds here, which needs to be scooped up. > + */ > + if (!list_empty(&eb.context->vm->vm_rebind_list)) { > + eb_release_vma_all(&eb); > + i915_gem_ww_ctx_fini(&eb.ww); > + goto lookup_vmas; > + } > + > + err = eb_validate_persistent_vma_all(&eb); > + > err_validate: > if (err == -EDEADLK) { > eb_release_vma_all(&eb);
On Thu, Oct 20, 2022 at 05:39:46PM +0100, Matthew Auld wrote: >On 18/10/2022 08:16, Niranjana Vishwanathapura wrote: >>Handle persistent (VM_BIND) mappings during the request submission >>in the execbuf3 path. >> >>v2: Ensure requests wait for bindings to complete. >>v3: Remove short term pinning with PIN_VALIDATE flag. >> Individualize fences before adding to dma_resv obj. >> >>Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> >>Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com> >>--- >> .../gpu/drm/i915/gem/i915_gem_execbuffer3.c | 208 +++++++++++++++++- >> 1 file changed, 207 insertions(+), 1 deletion(-) >> >>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>index a9b4cc44bf66..8120e4c6b7da 100644 >>--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c >>@@ -3,6 +3,7 @@ >> * Copyright © 2022 Intel Corporation >> */ >>+#include <linux/dma-fence-array.h> >> #include <linux/dma-resv.h> >> #include <linux/uaccess.h> >>@@ -19,6 +20,7 @@ >> #include "i915_gem_vm_bind.h" >> #include "i915_trace.h" >>+#define __EXEC3_HAS_PIN BIT_ULL(33) >> #define __EXEC3_ENGINE_PINNED BIT_ULL(32) >> #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) >>@@ -42,7 +44,9 @@ >> * execlist. Hence, no support for implicit sync. >> * >> * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only >>- * works with execbuf3 ioctl for submission. >>+ * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through >>+ * VM_BIND call) at the time of execbuf3 call are deemed required for that >>+ * submission. >> * >> * The execbuf3 ioctl directly specifies the batch addresses instead of as >> * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not >>@@ -58,6 +62,13 @@ >> * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, >> * vma lookup table, implicit sync, vma active reference tracking etc., are not >> * applicable for execbuf3 ioctl. >>+ * >>+ * During each execbuf submission, request fence is added to all VM_BIND mapped >>+ * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will >>+ * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and >>+ * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and >>+ * hence should not be used for end of batch check. Instead, the execbuf3 >>+ * timeline out fence should be used for end of batch check. >> */ >> /** >>@@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) >> return i915_gem_vm_bind_lookup_vma(vm, va); >> } >>+static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) >>+{ >>+ struct i915_vma *vma, *vn; >>+ >>+ /** >>+ * Move all unbound vmas back into vm_bind_list so that they are >>+ * revalidated. 
>>+ */ >>+ spin_lock(&vm->vm_rebind_lock); >>+ list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { >>+ list_del_init(&vma->vm_rebind_link); >>+ if (!list_empty(&vma->vm_bind_link)) >>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); >>+ } >>+ spin_unlock(&vm->vm_rebind_lock); >>+} >>+ >> static int eb_lookup_vma_all(struct i915_execbuffer *eb) >> { >> unsigned int i, current_batch = 0; >>@@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) >> ++current_batch; >> } >>+ eb_scoop_unbound_vma_all(eb->context->vm); >>+ >>+ return 0; >>+} >>+ >>+static int eb_lock_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int err; >>+ >>+ err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); >>+ if (err) >>+ return err; >>+ >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) { >>+ err = i915_gem_object_lock(vma->obj, &eb->ww); >>+ if (err) >>+ return err; >>+ } >>+ >> return 0; >> } >>+static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma, *vn; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ >>+ if (!(eb->args->flags & __EXEC3_HAS_PIN)) >>+ return; >>+ >>+ assert_object_held(vm->root_obj); >>+ >>+ list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) >>+ if (i915_vma_verify_bind_complete(vma)) >>+ list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); >>+ >>+ eb->args->flags &= ~__EXEC3_HAS_PIN; >>+} >>+ >> static void eb_release_vma_all(struct i915_execbuffer *eb) >> { >>+ eb_release_persistent_vma_all(eb); >> eb_unpin_engine(eb); >> } >>+static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ u64 num_fences = 1; >>+ struct i915_vma *vma; >>+ int ret; >>+ >>+ /* Reserve enough slots to accommodate composite fences */ >>+ if (intel_context_is_parallel(eb->context)) >>+ num_fences = eb->num_batches; >>+ >>+ ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); >>+ if (ret) >>+ return ret; >>+ >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) { >>+ ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); >>+ if (ret) >>+ return ret; >>+ } >>+ >>+ return 0; >>+} >>+ >>+static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int ret = 0; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ assert_object_held(vm->root_obj); >>+ >>+ ret = eb_reserve_fence_for_persistent_vma_all(eb); >>+ if (ret) >>+ return ret; >>+ >>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>+ u64 pin_flags = vma->start | PIN_OFFSET_FIXED | >>+ PIN_USER | PIN_VALIDATE; > >Should we also add NOEVICT here and in vm_bind? Or is eviction somehow >still desired in vm_bind world? Yah, we only check above if there is mapping at vma->start, but not the whole range. Will add NOEVICT here and in execbuffer3.c Niranjana > >>+ >>+ ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); >>+ if (ret) >>+ break; >>+ >>+ eb->args->flags |= __EXEC3_HAS_PIN; >>+ } >>+ >>+ return ret; >>+} >>+ >> /* >> * Using two helper loops for the order of which requests / batches are created >> * and added the to backend. 
Requests are created in order from the parent to >>@@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) >> */ >> #define for_each_batch_create_order(_eb) \ >> for (unsigned int i = 0; i < (_eb)->num_batches; ++i) >>+#define for_each_batch_add_order(_eb) \ >>+ for (int i = (_eb)->num_batches - 1; i >= 0; --i) >>+ >>+static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, >>+ struct dma_fence *fence) >>+{ >>+ struct dma_fence *curr; >>+ int idx; >>+ >>+ dma_fence_array_for_each(curr, idx, fence) >>+ dma_resv_add_fence(obj->base.resv, curr, >>+ DMA_RESV_USAGE_BOOKKEEP); >>+ >>+ obj->write_domain = 0; >>+ obj->read_domains |= I915_GEM_GPU_DOMAINS; >>+ obj->mm.dirty = true; >>+} >>+ >>+static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) >>+{ >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct dma_fence *fence; >>+ struct i915_vma *vma; >>+ >>+ fence = eb->composite_fence ? eb->composite_fence : >>+ &eb->requests[0]->fence; >>+ >>+ __eb_persistent_add_shared_fence(vm->root_obj, fence); >>+ list_for_each_entry(vma, &vm->non_priv_vm_bind_list, >>+ non_priv_vm_bind_link) >>+ __eb_persistent_add_shared_fence(vma->obj, fence); >>+} >>+ >>+static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) >>+{ >>+ /* Add fence to BOs dma-resv fence list */ >>+ eb_persistent_add_shared_fence(eb); >>+} >> static int eb_move_to_gpu(struct i915_execbuffer *eb) >> { >>+ struct i915_address_space *vm = eb->context->vm; >>+ struct i915_vma *vma; >>+ int err = 0; >>+ >>+ lockdep_assert_held(&vm->vm_bind_lock); >>+ assert_object_held(vm->root_obj); >>+ >>+ eb_move_all_persistent_vma_to_active(eb); >>+ >>+ list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { >>+ for_each_batch_add_order(eb) { >>+ if (!eb->requests[i]) >>+ continue; >>+ >>+ err = i915_request_await_bind(eb->requests[i], vma); >>+ if (err) >>+ goto err_skip; >>+ } >>+ } >>+ >> /* Unconditionally flush any chipset caches (for streaming writes). */ >> intel_gt_chipset_flush(eb->gt); >> return 0; >>+ >>+err_skip: >>+ for_each_batch_create_order(eb) { >>+ if (!eb->requests[i]) >>+ break; >>+ >>+ i915_request_set_error_once(eb->requests[i], err); >>+ } >>+ return err; >> } >> static int eb_request_submit(struct i915_execbuffer *eb, >>@@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> mutex_lock(&eb.context->vm->vm_bind_lock); >>+lookup_vmas: >> err = eb_lookup_vma_all(&eb); >> if (err) { >> eb_release_vma_all(&eb); >>@@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, >> /* only throttle once, even if we didn't need to throttle */ >> throttle = false; >>+ err = eb_lock_vma_all(&eb); >>+ if (err) >>+ goto err_validate; >>+ >>+ /** >>+ * No object unbinds possible once the objects are locked. So, >>+ * check for any unbinds here, which needs to be scooped up. >>+ */ >>+ if (!list_empty(&eb.context->vm->vm_rebind_list)) { >>+ eb_release_vma_all(&eb); >>+ i915_gem_ww_ctx_fini(&eb.ww); >>+ goto lookup_vmas; >>+ } >>+ >>+ err = eb_validate_persistent_vma_all(&eb); >>+ >> err_validate: >> if (err == -EDEADLK) { >> eb_release_vma_all(&eb);
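The agreed follow-up would presumably just extend the pin flags in eb_validate_persistent_vma_all() (and similarly in the VM_BIND path). PIN_NOEVICT is an existing i915 pin flag, but the exact hunk below is an assumption for illustration rather than part of this revision:

```c
static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb)
{
	struct i915_address_space *vm = eb->context->vm;
	struct i915_vma *vma;
	int ret = 0;

	lockdep_assert_held(&vm->vm_bind_lock);
	assert_object_held(vm->root_obj);

	ret = eb_reserve_fence_for_persistent_vma_all(eb);
	if (ret)
		return ret;

	list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) {
		/*
		 * PIN_NOEVICT added per the review above (assumed follow-up,
		 * not in this revision): fail validation if the whole range
		 * is not free, rather than evicting whatever overlaps it.
		 */
		u64 pin_flags = vma->start | PIN_OFFSET_FIXED |
				PIN_USER | PIN_VALIDATE | PIN_NOEVICT;

		ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags);
		if (ret)
			break;

		eb->args->flags |= __EXEC3_HAS_PIN;
	}

	return ret;
}
```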
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c index a9b4cc44bf66..8120e4c6b7da 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer3.c @@ -3,6 +3,7 @@ * Copyright © 2022 Intel Corporation */ +#include <linux/dma-fence-array.h> #include <linux/dma-resv.h> #include <linux/uaccess.h> @@ -19,6 +20,7 @@ #include "i915_gem_vm_bind.h" #include "i915_trace.h" +#define __EXEC3_HAS_PIN BIT_ULL(33) #define __EXEC3_ENGINE_PINNED BIT_ULL(32) #define __EXEC3_INTERNAL_FLAGS (~0ull << 32) @@ -42,7 +44,9 @@ * execlist. Hence, no support for implicit sync. * * The new execbuf3 ioctl only works in VM_BIND mode and the VM_BIND mode only - * works with execbuf3 ioctl for submission. + * works with execbuf3 ioctl for submission. All BOs mapped on that VM (through + * VM_BIND call) at the time of execbuf3 call are deemed required for that + * submission. * * The execbuf3 ioctl directly specifies the batch addresses instead of as * object handles as in execbuf2 ioctl. The execbuf3 ioctl will also not @@ -58,6 +62,13 @@ * So, a lot of code supporting execbuf2 ioctl, like relocations, VA evictions, * vma lookup table, implicit sync, vma active reference tracking etc., are not * applicable for execbuf3 ioctl. + * + * During each execbuf submission, request fence is added to all VM_BIND mapped + * objects with DMA_RESV_USAGE_BOOKKEEP. The DMA_RESV_USAGE_BOOKKEEP usage will + * prevent over sync (See enum dma_resv_usage). Note that DRM_I915_GEM_WAIT and + * DRM_I915_GEM_BUSY ioctls do not check for DMA_RESV_USAGE_BOOKKEEP usage and + * hence should not be used for end of batch check. Instead, the execbuf3 + * timeline out fence should be used for end of batch check. */ /** @@ -127,6 +138,23 @@ eb_find_vma(struct i915_address_space *vm, u64 addr) return i915_gem_vm_bind_lookup_vma(vm, va); } +static void eb_scoop_unbound_vma_all(struct i915_address_space *vm) +{ + struct i915_vma *vma, *vn; + + /** + * Move all unbound vmas back into vm_bind_list so that they are + * revalidated. 
+ */ + spin_lock(&vm->vm_rebind_lock); + list_for_each_entry_safe(vma, vn, &vm->vm_rebind_list, vm_rebind_link) { + list_del_init(&vma->vm_rebind_link); + if (!list_empty(&vma->vm_bind_link)) + list_move_tail(&vma->vm_bind_link, &vm->vm_bind_list); + } + spin_unlock(&vm->vm_rebind_lock); +} + static int eb_lookup_vma_all(struct i915_execbuffer *eb) { unsigned int i, current_batch = 0; @@ -141,14 +169,108 @@ static int eb_lookup_vma_all(struct i915_execbuffer *eb) ++current_batch; } + eb_scoop_unbound_vma_all(eb->context->vm); + + return 0; +} + +static int eb_lock_vma_all(struct i915_execbuffer *eb) +{ + struct i915_address_space *vm = eb->context->vm; + struct i915_vma *vma; + int err; + + err = i915_gem_object_lock(eb->context->vm->root_obj, &eb->ww); + if (err) + return err; + + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, + non_priv_vm_bind_link) { + err = i915_gem_object_lock(vma->obj, &eb->ww); + if (err) + return err; + } + return 0; } +static void eb_release_persistent_vma_all(struct i915_execbuffer *eb) +{ + struct i915_address_space *vm = eb->context->vm; + struct i915_vma *vma, *vn; + + lockdep_assert_held(&vm->vm_bind_lock); + + if (!(eb->args->flags & __EXEC3_HAS_PIN)) + return; + + assert_object_held(vm->root_obj); + + list_for_each_entry_safe(vma, vn, &vm->vm_bind_list, vm_bind_link) + if (i915_vma_verify_bind_complete(vma)) + list_move_tail(&vma->vm_bind_link, &vm->vm_bound_list); + + eb->args->flags &= ~__EXEC3_HAS_PIN; +} + static void eb_release_vma_all(struct i915_execbuffer *eb) { + eb_release_persistent_vma_all(eb); eb_unpin_engine(eb); } +static int eb_reserve_fence_for_persistent_vma_all(struct i915_execbuffer *eb) +{ + struct i915_address_space *vm = eb->context->vm; + u64 num_fences = 1; + struct i915_vma *vma; + int ret; + + /* Reserve enough slots to accommodate composite fences */ + if (intel_context_is_parallel(eb->context)) + num_fences = eb->num_batches; + + ret = dma_resv_reserve_fences(vm->root_obj->base.resv, num_fences); + if (ret) + return ret; + + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, + non_priv_vm_bind_link) { + ret = dma_resv_reserve_fences(vma->obj->base.resv, num_fences); + if (ret) + return ret; + } + + return 0; +} + +static int eb_validate_persistent_vma_all(struct i915_execbuffer *eb) +{ + struct i915_address_space *vm = eb->context->vm; + struct i915_vma *vma; + int ret = 0; + + lockdep_assert_held(&vm->vm_bind_lock); + assert_object_held(vm->root_obj); + + ret = eb_reserve_fence_for_persistent_vma_all(eb); + if (ret) + return ret; + + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { + u64 pin_flags = vma->start | PIN_OFFSET_FIXED | + PIN_USER | PIN_VALIDATE; + + ret = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); + if (ret) + break; + + eb->args->flags |= __EXEC3_HAS_PIN; + } + + return ret; +} + /* * Using two helper loops for the order of which requests / batches are created * and added the to backend. 
Requests are created in order from the parent to @@ -160,13 +282,80 @@ static void eb_release_vma_all(struct i915_execbuffer *eb) */ #define for_each_batch_create_order(_eb) \ for (unsigned int i = 0; i < (_eb)->num_batches; ++i) +#define for_each_batch_add_order(_eb) \ + for (int i = (_eb)->num_batches - 1; i >= 0; --i) + +static void __eb_persistent_add_shared_fence(struct drm_i915_gem_object *obj, + struct dma_fence *fence) +{ + struct dma_fence *curr; + int idx; + + dma_fence_array_for_each(curr, idx, fence) + dma_resv_add_fence(obj->base.resv, curr, + DMA_RESV_USAGE_BOOKKEEP); + + obj->write_domain = 0; + obj->read_domains |= I915_GEM_GPU_DOMAINS; + obj->mm.dirty = true; +} + +static void eb_persistent_add_shared_fence(struct i915_execbuffer *eb) +{ + struct i915_address_space *vm = eb->context->vm; + struct dma_fence *fence; + struct i915_vma *vma; + + fence = eb->composite_fence ? eb->composite_fence : + &eb->requests[0]->fence; + + __eb_persistent_add_shared_fence(vm->root_obj, fence); + list_for_each_entry(vma, &vm->non_priv_vm_bind_list, + non_priv_vm_bind_link) + __eb_persistent_add_shared_fence(vma->obj, fence); +} + +static void eb_move_all_persistent_vma_to_active(struct i915_execbuffer *eb) +{ + /* Add fence to BOs dma-resv fence list */ + eb_persistent_add_shared_fence(eb); +} static int eb_move_to_gpu(struct i915_execbuffer *eb) { + struct i915_address_space *vm = eb->context->vm; + struct i915_vma *vma; + int err = 0; + + lockdep_assert_held(&vm->vm_bind_lock); + assert_object_held(vm->root_obj); + + eb_move_all_persistent_vma_to_active(eb); + + list_for_each_entry(vma, &vm->vm_bind_list, vm_bind_link) { + for_each_batch_add_order(eb) { + if (!eb->requests[i]) + continue; + + err = i915_request_await_bind(eb->requests[i], vma); + if (err) + goto err_skip; + } + } + /* Unconditionally flush any chipset caches (for streaming writes). */ intel_gt_chipset_flush(eb->gt); return 0; + +err_skip: + for_each_batch_create_order(eb) { + if (!eb->requests[i]) + break; + + i915_request_set_error_once(eb->requests[i], err); + } + return err; } static int eb_request_submit(struct i915_execbuffer *eb, @@ -483,6 +672,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, mutex_lock(&eb.context->vm->vm_bind_lock); +lookup_vmas: err = eb_lookup_vma_all(&eb); if (err) { eb_release_vma_all(&eb); @@ -499,6 +689,22 @@ i915_gem_do_execbuffer(struct drm_device *dev, /* only throttle once, even if we didn't need to throttle */ throttle = false; + err = eb_lock_vma_all(&eb); + if (err) + goto err_validate; + + /** + * No object unbinds possible once the objects are locked. So, + * check for any unbinds here, which needs to be scooped up. + */ + if (!list_empty(&eb.context->vm->vm_rebind_list)) { + eb_release_vma_all(&eb); + i915_gem_ww_ctx_fini(&eb.ww); + goto lookup_vmas; + } + + err = eb_validate_persistent_vma_all(&eb); + err_validate: if (err == -EDEADLK) { eb_release_vma_all(&eb);