Message ID | 20170629125930.821-7-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Jun 29, 2017 at 01:59:30PM +0100, Chris Wilson wrote: > Reduce the list iteration when incrementing the timeline by storing the > fences in increasing order. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Sumit Semwal <sumit.semwal@linaro.org> > Cc: Sean Paul <seanpaul@chromium.org> > Cc: Gustavo Padovan <gustavo@padovan.org> > --- > drivers/dma-buf/sw_sync.c | 49 ++++++++++++++++++++++++++++++++++++++------ > drivers/dma-buf/sync_debug.h | 5 +++++ > 2 files changed, 48 insertions(+), 6 deletions(-) > > diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c > index e51fe11bbbea..8cef5d345316 100644 > --- a/drivers/dma-buf/sw_sync.c > +++ b/drivers/dma-buf/sw_sync.c > @@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name) > obj->context = dma_fence_context_alloc(1); > strlcpy(obj->name, name, sizeof(obj->name)); > > + obj->pt_tree = RB_ROOT; > INIT_LIST_HEAD(&obj->pt_list); > spin_lock_init(&obj->lock); > > @@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc) > > obj->value += inc; > > - list_for_each_entry_safe(pt, next, &obj->pt_list, link) > - if (dma_fence_is_signaled_locked(&pt->base)) > - list_del_init(&pt->link); > + list_for_each_entry_safe(pt, next, &obj->pt_list, link) { > + if (!dma_fence_is_signaled_locked(&pt->base)) > + break; > + > + list_del_init(&pt->link); > + rb_erase(&pt->node, &obj->pt_tree); > + } > > spin_unlock_irq(&obj->lock); > } > @@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj, > INIT_LIST_HEAD(&pt->link); > > spin_lock_irq(&obj->lock); > - if (!dma_fence_is_signaled_locked(&pt->base)) > - list_add_tail(&pt->link, &obj->pt_list); > + if (!dma_fence_is_signaled_locked(&pt->base)) { > + struct rb_node **p = &obj->pt_tree.rb_node; > + struct rb_node *parent = NULL; > + > + while (*p) { > + struct sync_pt *other; > + int cmp; > + > + parent = *p; > + other = rb_entry(parent, typeof(*pt), node); > + cmp = value - other->base.seqno; We're imposing an implicit limitation on userspace here that points cannot be > INT_MAX apart (since they'll be in the wrong order in the tree). I wonder how much this patch will actually save, given that the number of active points on a given timeline should be small. Do we have any evidence that this optimization is warranted? Sean > + if (cmp > 0) { > + p = &parent->rb_right; > + } else if (cmp < 0) { > + p = &parent->rb_left; > + } else { > + if (dma_fence_get_rcu(&other->base)) { > + dma_fence_put(&pt->base); > + pt = other; > + goto unlock; > + } > + p = &parent->rb_left; > + } > + } > + rb_link_node(&pt->node, parent, p); > + rb_insert_color(&pt->node, &obj->pt_tree); > + > + parent = rb_next(&pt->node); > + list_add_tail(&pt->link, > + parent ? &rb_entry(parent, typeof(*pt), node)->link : &obj->pt_list); > + } > +unlock: > spin_unlock_irq(&obj->lock); > > return pt; > @@ -201,8 +236,10 @@ static void timeline_fence_release(struct dma_fence *fence) > > spin_lock_irqsave(fence->lock, flags); > > - if (!list_empty(&pt->link)) > + if (!list_empty(&pt->link)) { > list_del(&pt->link); > + rb_erase(&pt->node, &parent->pt_tree); > + } > > spin_unlock_irqrestore(fence->lock, flags); > > diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h > index 899ba0e19fd3..b7a5fab12179 100644 > --- a/drivers/dma-buf/sync_debug.h > +++ b/drivers/dma-buf/sync_debug.h > @@ -14,6 +14,7 @@ > #define _LINUX_SYNC_H > > #include <linux/list.h> > +#include <linux/rbtree.h> > #include <linux/spinlock.h> > #include <linux/dma-fence.h> > > @@ -25,6 +26,7 @@ > * @kref: reference count on fence. > * @name: name of the sync_timeline. Useful for debugging > * @lock: lock protecting @child_list_head and fence.status > + * @pt_tree: rbtree of active (unsignaled/errored) sync_pts > * @pt_list: list of active (unsignaled/errored) sync_pts > * @sync_timeline_list: membership in global sync_timeline_list > */ > @@ -36,6 +38,7 @@ struct sync_timeline { > u64 context; > int value; > > + struct rb_root pt_tree; > struct list_head pt_list; > spinlock_t lock; > > @@ -51,10 +54,12 @@ static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence) > * struct sync_pt - sync_pt object > * @base: base fence object > * @link: link on the sync timeline's list > + * @node: node in the sync timeline's tree > */ > struct sync_pt { > struct dma_fence base; > struct list_head link; > + struct rb_node node; > }; > > #ifdef CONFIG_SW_SYNC > -- > 2.13.1
Quoting Sean Paul (2017-06-29 19:10:11) > On Thu, Jun 29, 2017 at 01:59:30PM +0100, Chris Wilson wrote: > > Reduce the list iteration when incrementing the timeline by storing the > > fences in increasing order. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Sumit Semwal <sumit.semwal@linaro.org> > > Cc: Sean Paul <seanpaul@chromium.org> > > Cc: Gustavo Padovan <gustavo@padovan.org> > > --- > > drivers/dma-buf/sw_sync.c | 49 ++++++++++++++++++++++++++++++++++++++------ > > drivers/dma-buf/sync_debug.h | 5 +++++ > > 2 files changed, 48 insertions(+), 6 deletions(-) > > > > diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c > > index e51fe11bbbea..8cef5d345316 100644 > > --- a/drivers/dma-buf/sw_sync.c > > +++ b/drivers/dma-buf/sw_sync.c > > @@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name) > > obj->context = dma_fence_context_alloc(1); > > strlcpy(obj->name, name, sizeof(obj->name)); > > > > + obj->pt_tree = RB_ROOT; > > INIT_LIST_HEAD(&obj->pt_list); > > spin_lock_init(&obj->lock); > > > > @@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc) > > > > obj->value += inc; > > > > - list_for_each_entry_safe(pt, next, &obj->pt_list, link) > > - if (dma_fence_is_signaled_locked(&pt->base)) > > - list_del_init(&pt->link); > > + list_for_each_entry_safe(pt, next, &obj->pt_list, link) { > > + if (!dma_fence_is_signaled_locked(&pt->base)) > > + break; > > + > > + list_del_init(&pt->link); > > + rb_erase(&pt->node, &obj->pt_tree); > > + } > > > > spin_unlock_irq(&obj->lock); > > } > > @@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj, > > INIT_LIST_HEAD(&pt->link); > > > > spin_lock_irq(&obj->lock); > > - if (!dma_fence_is_signaled_locked(&pt->base)) > > - list_add_tail(&pt->link, &obj->pt_list); > > + if (!dma_fence_is_signaled_locked(&pt->base)) { > > + struct rb_node **p = &obj->pt_tree.rb_node; > > + struct rb_node *parent = NULL; > > + > > + while (*p) { > > + struct sync_pt *other; > > + int cmp; > > + > > + parent = *p; > > + other = rb_entry(parent, typeof(*pt), node); > > + cmp = value - other->base.seqno; > > We're imposing an implicit limitation on userspace here that points cannot be > > INT_MAX apart (since they'll be in the wrong order in the tree). That's not a new limitation. It's an artifact of using the u32 timeline/seqno. > I wonder how much this patch will actually save, given that the number of active points > on a given timeline should be small. Do we have any evidence that this > optimization is warranted? The good news is that for empty/small trees, the overhead is tiny, less than the cost of the syscall. I honestly don't know who uses sw_sync and so would benefit. I figured I would throw it out here since it was trivial. -Chris
diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c index e51fe11bbbea..8cef5d345316 100644 --- a/drivers/dma-buf/sw_sync.c +++ b/drivers/dma-buf/sw_sync.c @@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name) obj->context = dma_fence_context_alloc(1); strlcpy(obj->name, name, sizeof(obj->name)); + obj->pt_tree = RB_ROOT; INIT_LIST_HEAD(&obj->pt_list); spin_lock_init(&obj->lock); @@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc) obj->value += inc; - list_for_each_entry_safe(pt, next, &obj->pt_list, link) - if (dma_fence_is_signaled_locked(&pt->base)) - list_del_init(&pt->link); + list_for_each_entry_safe(pt, next, &obj->pt_list, link) { + if (!dma_fence_is_signaled_locked(&pt->base)) + break; + + list_del_init(&pt->link); + rb_erase(&pt->node, &obj->pt_tree); + } spin_unlock_irq(&obj->lock); } @@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj, INIT_LIST_HEAD(&pt->link); spin_lock_irq(&obj->lock); - if (!dma_fence_is_signaled_locked(&pt->base)) - list_add_tail(&pt->link, &obj->pt_list); + if (!dma_fence_is_signaled_locked(&pt->base)) { + struct rb_node **p = &obj->pt_tree.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct sync_pt *other; + int cmp; + + parent = *p; + other = rb_entry(parent, typeof(*pt), node); + cmp = value - other->base.seqno; + if (cmp > 0) { + p = &parent->rb_right; + } else if (cmp < 0) { + p = &parent->rb_left; + } else { + if (dma_fence_get_rcu(&other->base)) { + dma_fence_put(&pt->base); + pt = other; + goto unlock; + } + p = &parent->rb_left; + } + } + rb_link_node(&pt->node, parent, p); + rb_insert_color(&pt->node, &obj->pt_tree); + + parent = rb_next(&pt->node); + list_add_tail(&pt->link, + parent ? &rb_entry(parent, typeof(*pt), node)->link : &obj->pt_list); + } +unlock: spin_unlock_irq(&obj->lock); return pt; @@ -201,8 +236,10 @@ static void timeline_fence_release(struct dma_fence *fence) spin_lock_irqsave(fence->lock, flags); - if (!list_empty(&pt->link)) + if (!list_empty(&pt->link)) { list_del(&pt->link); + rb_erase(&pt->node, &parent->pt_tree); + } spin_unlock_irqrestore(fence->lock, flags); diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h index 899ba0e19fd3..b7a5fab12179 100644 --- a/drivers/dma-buf/sync_debug.h +++ b/drivers/dma-buf/sync_debug.h @@ -14,6 +14,7 @@ #define _LINUX_SYNC_H #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/dma-fence.h> @@ -25,6 +26,7 @@ * @kref: reference count on fence. * @name: name of the sync_timeline. Useful for debugging * @lock: lock protecting @child_list_head and fence.status + * @pt_tree: rbtree of active (unsignaled/errored) sync_pts * @pt_list: list of active (unsignaled/errored) sync_pts * @sync_timeline_list: membership in global sync_timeline_list */ @@ -36,6 +38,7 @@ struct sync_timeline { u64 context; int value; + struct rb_root pt_tree; struct list_head pt_list; spinlock_t lock; @@ -51,10 +54,12 @@ static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence) * struct sync_pt - sync_pt object * @base: base fence object * @link: link on the sync timeline's list + * @node: node in the sync timeline's tree */ struct sync_pt { struct dma_fence base; struct list_head link; + struct rb_node node; }; #ifdef CONFIG_SW_SYNC
Reduce the list iteration when incrementing the timeline by storing the fences in increasing order. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Sean Paul <seanpaul@chromium.org> Cc: Gustavo Padovan <gustavo@padovan.org> --- drivers/dma-buf/sw_sync.c | 49 ++++++++++++++++++++++++++++++++++++++------ drivers/dma-buf/sync_debug.h | 5 +++++ 2 files changed, 48 insertions(+), 6 deletions(-)