diff mbox series

[5/4] dma-fence: Have dma_fence_signal call signal_locked

Message ID 20190811091523.9382-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Chris Wilson Aug. 11, 2019, 9:15 a.m. UTC
Now that dma_fence_signal always takes the spinlock to flush the
cb_list, simply take the spinlock and call dma_fence_signal_locked() to
avoid code repetition.

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-fence.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

Comments

Christian König Aug. 11, 2019, 4:09 p.m. UTC | #1
Am 11.08.19 um 11:15 schrieb Chris Wilson:
> Now that dma_fence_signal always takes the spinlock to flush the
> cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> avoid code repetition.
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Christian König <christian.koenig@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/dma-buf/dma-fence.c | 32 ++++++++++++--------------------
>   1 file changed, 12 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index ab4a456bba04..367b71084d34 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -122,26 +122,18 @@ EXPORT_SYMBOL(dma_fence_context_alloc);
>    */
>   int dma_fence_signal_locked(struct dma_fence *fence)
>   {
> -	int ret = 0;
> -
> -	lockdep_assert_held(fence->lock);
> -
>   	if (WARN_ON(!fence))
>   		return -EINVAL;
>   
> -	if (!__dma_fence_signal(fence)) {
> -		ret = -EINVAL;
> +	lockdep_assert_held(fence->lock);
>   
> -		/*
> -		 * we might have raced with the unlocked dma_fence_signal,
> -		 * still run through all callbacks
> -		 */
> -	} else {
> -		__dma_fence_signal__timestamp(fence, ktime_get());
> -	}
> +	if (!__dma_fence_signal(fence))
> +		return -EINVAL;
>   
> +	__dma_fence_signal__timestamp(fence, ktime_get());
>   	__dma_fence_signal__notify(fence);
> -	return ret;
> +
> +	return 0;
>   }
>   EXPORT_SYMBOL(dma_fence_signal_locked);
>   
> @@ -161,19 +153,19 @@ EXPORT_SYMBOL(dma_fence_signal_locked);
>   int dma_fence_signal(struct dma_fence *fence)
>   {
>   	unsigned long flags;
> +	int ret;
>   
>   	if (!fence)
>   		return -EINVAL;
>   
> -	if (!__dma_fence_signal(fence))
> -		return -EINVAL;
> -
> -	__dma_fence_signal__timestamp(fence, ktime_get());
> +	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
> +		return 0;
>   
>   	spin_lock_irqsave(fence->lock, flags);
> -	__dma_fence_signal__notify(fence);
> +	ret = dma_fence_signal_locked(fence);
>   	spin_unlock_irqrestore(fence->lock, flags);
> -	return 0;
> +
> +	return ret;
>   }
>   EXPORT_SYMBOL(dma_fence_signal);
>
Daniel Vetter Aug. 14, 2019, 5:20 p.m. UTC | #2
On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
> Now that dma_fence_signal always takes the spinlock to flush the
> cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> avoid code repetition.
> 
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Christian König <christian.koenig@amd.com>

Hm, I think this largely defeats the point of having the lockless signal
enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
but feels like a thing that needs a notch more thought. And if we need it,
maybe a bit more cleanup.

I guess with the addition of more fancy atomic BITs we could avoid this
too ... but no idea whether that's worth the effort.
-Daniel

> ---
>  drivers/dma-buf/dma-fence.c | 32 ++++++++++++--------------------
>  1 file changed, 12 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index ab4a456bba04..367b71084d34 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -122,26 +122,18 @@ EXPORT_SYMBOL(dma_fence_context_alloc);
>   */
>  int dma_fence_signal_locked(struct dma_fence *fence)
>  {
> -	int ret = 0;
> -
> -	lockdep_assert_held(fence->lock);
> -
>  	if (WARN_ON(!fence))
>  		return -EINVAL;
>  
> -	if (!__dma_fence_signal(fence)) {
> -		ret = -EINVAL;
> +	lockdep_assert_held(fence->lock);
>  
> -		/*
> -		 * we might have raced with the unlocked dma_fence_signal,
> -		 * still run through all callbacks
> -		 */
> -	} else {
> -		__dma_fence_signal__timestamp(fence, ktime_get());
> -	}
> +	if (!__dma_fence_signal(fence))
> +		return -EINVAL;
>  
> +	__dma_fence_signal__timestamp(fence, ktime_get());
>  	__dma_fence_signal__notify(fence);
> -	return ret;
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL(dma_fence_signal_locked);
>  
> @@ -161,19 +153,19 @@ EXPORT_SYMBOL(dma_fence_signal_locked);
>  int dma_fence_signal(struct dma_fence *fence)
>  {
>  	unsigned long flags;
> +	int ret;
>  
>  	if (!fence)
>  		return -EINVAL;
>  
> -	if (!__dma_fence_signal(fence))
> -		return -EINVAL;
> -
> -	__dma_fence_signal__timestamp(fence, ktime_get());
> +	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
> +		return 0;
>  
>  	spin_lock_irqsave(fence->lock, flags);
> -	__dma_fence_signal__notify(fence);
> +	ret = dma_fence_signal_locked(fence);
>  	spin_unlock_irqrestore(fence->lock, flags);
> -	return 0;
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL(dma_fence_signal);
>  
> -- 
> 2.23.0.rc1
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Chris Wilson Aug. 15, 2019, 6:45 p.m. UTC | #3
Quoting Daniel Vetter (2019-08-14 18:20:53)
> On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
> > Now that dma_fence_signal always takes the spinlock to flush the
> > cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> > avoid code repetition.
> > 
> > Suggested-by: Christian König <christian.koenig@amd.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Christian König <christian.koenig@amd.com>
> 
> Hm, I think this largely defeats the point of having the lockless signal
> enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
> but feels like a thing that needs a notch more thought. And if we need it,
> maybe a bit more cleanup.

You mean dma_fence_enable_sw_signaling(). The only user appears to be to
flush fences, which is actually the intent of always notifying the signal
cb. By always doing the callbacks, we can avoid installing the interrupt
and completely saturating CPUs with irqs, instead doing a batch in a
leisurely timer callback if not flushed naturally.
-Chris
Daniel Vetter Aug. 15, 2019, 6:48 p.m. UTC | #4
On Thu, Aug 15, 2019 at 8:46 PM Chris Wilson <chris@chris-wilson.co.uk> wrote:
> Quoting Daniel Vetter (2019-08-14 18:20:53)
> > On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
> > > Now that dma_fence_signal always takes the spinlock to flush the
> > > cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> > > avoid code repetition.
> > >
> > > Suggested-by: Christian König <christian.koenig@amd.com>
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Christian König <christian.koenig@amd.com>
> >
> > Hm, I think this largely defeats the point of having the lockless signal
> > enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
> > but feels like a thing that needs a notch more thought. And if we need it,
> > maybe a bit more cleanup.
>
> You mean dma_fence_enable_sw_signaling(). The only user appears to be to
> flush fences, which is actually the intent of always notifying the signal
> cb. By always doing the callbacks, we can avoid installing the interrupt
> and completely saturating CPUs with irqs, instead doing a batch in a
> leisurely timer callback if not flushed naturally.

Yeah I'm not against ditching this, but can't we ditch a lot more if
we just always take the spinlock in those paths now anyways? Kinda not
worth having the complexity anymore.
-Daniel
Chris Wilson Aug. 15, 2019, 7:03 p.m. UTC | #5
Quoting Daniel Vetter (2019-08-15 19:48:42)
> On Thu, Aug 15, 2019 at 8:46 PM Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > Quoting Daniel Vetter (2019-08-14 18:20:53)
> > > On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
> > > > Now that dma_fence_signal always takes the spinlock to flush the
> > > > cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> > > > avoid code repetition.
> > > >
> > > > Suggested-by: Christian König <christian.koenig@amd.com>
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Cc: Christian König <christian.koenig@amd.com>
> > >
> > > Hm, I think this largely defeats the point of having the lockless signal
> > > enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
> > > but feels like a thing that needs a notch more thought. And if we need it,
> > > maybe a bit more cleanup.
> >
> > You mean dma_fence_enable_sw_signaling(). The only user appears to be to
> > flush fences, which is actually the intent of always notifying the signal
> > cb. By always doing the callbacks, we can avoid installing the interrupt
> > and completely saturating CPUs with irqs, instead doing a batch in a
> > leisurely timer callback if not flushed naturally.
> 
> Yeah I'm not against ditching this,

I was just thinking aloud working out what the current use case in ttm
was for.

> but can't we ditch a lot more if
> we just always take the spinlock in those paths now anyways? Kinda not
> worth having the complexity anymore.

You would be able to drop the was_set from dma_fence_add_callback. Say

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 59ac96ec7ba8..e566445134a2 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -345,38 +345,31 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
                           dma_fence_func_t func)
 {
        unsigned long flags;
-       int ret = 0;
-       bool was_set;
+       int ret = -ENOENT;

        if (WARN_ON(!fence || !func))
                return -EINVAL;

-       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
-               INIT_LIST_HEAD(&cb->node);
+       INIT_LIST_HEAD(&cb->node);
+       cb->func = func;
+
+       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
                return -ENOENT;
-       }

        spin_lock_irqsave(fence->lock, flags);
-
-       was_set = test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
-                                  &fence->flags);
-
-       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
-               ret = -ENOENT;
-       else if (!was_set && fence->ops->enable_signaling) {
+       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
+           !test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+                             &fence->flags)) {
                trace_dma_fence_enable_signal(fence);

-               if (!fence->ops->enable_signaling(fence)) {
+               if (!fence->ops->enable_signaling ||
+                   fence->ops->enable_signaling(fence)) {
+                       list_add_tail(&cb->node, &fence->cb_list);
+                       ret = 0;
+               } else {
                        dma_fence_signal_locked(fence);
-                       ret = -ENOENT;
                }
        }
-
-       if (!ret) {
-               cb->func = func;
-               list_add_tail(&cb->node, &fence->cb_list);
-       } else
-               INIT_LIST_HEAD(&cb->node);
        spin_unlock_irqrestore(fence->lock, flags);

        return ret;

Not a whole lot changes in terms of branches and serialising
instructions. One less baffling sequence to worry about.
-Chris
Chris Wilson Aug. 15, 2019, 7:29 p.m. UTC | #6
Quoting Chris Wilson (2019-08-15 20:03:13)
> Quoting Daniel Vetter (2019-08-15 19:48:42)
> > On Thu, Aug 15, 2019 at 8:46 PM Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > Quoting Daniel Vetter (2019-08-14 18:20:53)
> > > > On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
> > > > > Now that dma_fence_signal always takes the spinlock to flush the
> > > > > cb_list, simply take the spinlock and call dma_fence_signal_locked() to
> > > > > avoid code repetition.
> > > > >
> > > > > Suggested-by: Christian König <christian.koenig@amd.com>
> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > Cc: Christian König <christian.koenig@amd.com>
> > > >
> > > > Hm, I think this largely defeats the point of having the lockless signal
> > > > enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
> > > > but feels like a thing that needs a notch more thought. And if we need it,
> > > > maybe a bit more cleanup.
> > >
> > > You mean dma_fence_enable_sw_signaling(). The only user appears to be to
> > > flush fences, which is actually the intent of always notifying the signal
> > > cb. By always doing the callbacks, we can avoid installing the interrupt
> > > and completely saturating CPUs with irqs, instead doing a batch in a
> > > leisurely timer callback if not flushed naturally.
> > 
> > Yeah I'm not against ditching this,
> 
> I was just thinking aloud working out what the current use case in ttm
> was for.
> 
> > but can't we ditch a lot more if
> > we just always take the spinlock in those paths now anyways? Kinda not
> > worth having the complexity anymore.
> 
> You would be able to drop the was_set from dma_fence_add_callback. Say
> 
> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
> index 59ac96ec7ba8..e566445134a2 100644
> --- a/drivers/dma-buf/dma-fence.c
> +++ b/drivers/dma-buf/dma-fence.c
> @@ -345,38 +345,31 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
>                            dma_fence_func_t func)
>  {
>         unsigned long flags;
> -       int ret = 0;
> -       bool was_set;
> +       int ret = -ENOENT;
> 
>         if (WARN_ON(!fence || !func))
>                 return -EINVAL;
> 
> -       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
> -               INIT_LIST_HEAD(&cb->node);
> +       INIT_LIST_HEAD(&cb->node);
> +       cb->func = func;
> +
> +       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
>                 return -ENOENT;
> -       }
> 
>         spin_lock_irqsave(fence->lock, flags);
> -
> -       was_set = test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> -                                  &fence->flags);
> -
> -       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
> -               ret = -ENOENT;
> -       else if (!was_set && fence->ops->enable_signaling) {
> +       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
> +           !test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> +                             &fence->flags)) {
>                 trace_dma_fence_enable_signal(fence);
> 
> -               if (!fence->ops->enable_signaling(fence)) {
> +               if (!fence->ops->enable_signaling ||
> +                   fence->ops->enable_signaling(fence)) {
> +                       list_add_tail(&cb->node, &fence->cb_list);
> +                       ret = 0;
> +               } else {
>                         dma_fence_signal_locked(fence);
> -                       ret = -ENOENT;
>                 }
>         }
> -
> -       if (!ret) {
> -               cb->func = func;
> -               list_add_tail(&cb->node, &fence->cb_list);
> -       } else
> -               INIT_LIST_HEAD(&cb->node);
>         spin_unlock_irqrestore(fence->lock, flags);
> 
>         return ret;
> 
> Not a whole lot changes in terms of branches and serialising
> instructions. One less baffling sequence to worry about.

Fwiw,
Function                                     old     new   delta
dma_fence_add_callback                       338     302     -36

Almost certainly more shaving if you stare.
-Chris
Christian König Aug. 16, 2019, 7:58 a.m. UTC | #7
Am 15.08.19 um 21:29 schrieb Chris Wilson:
> Quoting Chris Wilson (2019-08-15 20:03:13)
>> Quoting Daniel Vetter (2019-08-15 19:48:42)
>>> On Thu, Aug 15, 2019 at 8:46 PM Chris Wilson <chris@chris-wilson.co.uk> wrote:
>>>> Quoting Daniel Vetter (2019-08-14 18:20:53)
>>>>> On Sun, Aug 11, 2019 at 10:15:23AM +0100, Chris Wilson wrote:
>>>>>> Now that dma_fence_signal always takes the spinlock to flush the
>>>>>> cb_list, simply take the spinlock and call dma_fence_signal_locked() to
>>>>>> avoid code repetition.
>>>>>>
>>>>>> Suggested-by: Christian König <christian.koenig@amd.com>
>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>> Hm, I think this largely defeats the point of having the lockless signal
>>>>> enabling trickery in dma_fence. Maybe that part isn't needed by anyone,
>>>>> but feels like a thing that needs a notch more thought. And if we need it,
>>>>> maybe a bit more cleanup.
>>>> You mean dma_fence_enable_sw_signaling(). The only user appears to be to
>>>> flush fences, which is actually the intent of always notifying the signal
>>>> cb. By always doing the callbacks, we can avoid installing the interrupt
>>>> and completely saturating CPUs with irqs, instead doing a batch in a
>>>> leisurely timer callback if not flushed naturally.
>>> Yeah I'm not against ditching this,
>> I was just thinking aloud working out what the current use case in ttm
>> was for.
>>
>>> but can't we ditch a lot more if
>>> we just always take the spinlock in those paths now anyways? Kinda not
>>> worth having the complexity anymore.
>> You would be able to drop the was_set from dma_fence_add_callback. Say
>>
>> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
>> index 59ac96ec7ba8..e566445134a2 100644
>> --- a/drivers/dma-buf/dma-fence.c
>> +++ b/drivers/dma-buf/dma-fence.c
>> @@ -345,38 +345,31 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
>>                             dma_fence_func_t func)
>>   {
>>          unsigned long flags;
>> -       int ret = 0;
>> -       bool was_set;
>> +       int ret = -ENOENT;
>>
>>          if (WARN_ON(!fence || !func))
>>                  return -EINVAL;
>>
>> -       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
>> -               INIT_LIST_HEAD(&cb->node);
>> +       INIT_LIST_HEAD(&cb->node);
>> +       cb->func = func;
>> +
>> +       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
>>                  return -ENOENT;
>> -       }
>>
>>          spin_lock_irqsave(fence->lock, flags);
>> -
>> -       was_set = test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
>> -                                  &fence->flags);
>> -
>> -       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
>> -               ret = -ENOENT;
>> -       else if (!was_set && fence->ops->enable_signaling) {
>> +       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
>> +           !test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
>> +                             &fence->flags)) {
>>                  trace_dma_fence_enable_signal(fence);
>>
>> -               if (!fence->ops->enable_signaling(fence)) {
>> +               if (!fence->ops->enable_signaling ||
>> +                   fence->ops->enable_signaling(fence)) {
>> +                       list_add_tail(&cb->node, &fence->cb_list);
>> +                       ret = 0;
>> +               } else {
>>                          dma_fence_signal_locked(fence);
>> -                       ret = -ENOENT;
>>                  }
>>          }
>> -
>> -       if (!ret) {
>> -               cb->func = func;
>> -               list_add_tail(&cb->node, &fence->cb_list);
>> -       } else
>> -               INIT_LIST_HEAD(&cb->node);
>>          spin_unlock_irqrestore(fence->lock, flags);
>>
>>          return ret;
>>
>> Not a whole lot changes in terms of branches and serialising
>> instructions. One less baffling sequence to worry about.
> Fwiw,
> Function                                     old     new   delta
> dma_fence_add_callback                       338     302     -36

Well since the sequence number change didn't worked out I'm now working 
on something where I replaced the shared fences list with a reference 
counted version where we also have an active and staged view of the fences.

This removed the whole problem of keeping things alive while inside the 
RCU and also removes the retry looping etc.. Additional to that we can 
also get rid of most of the memory barriers while adding and 
manipulating fences.

The end result in a totally artificial command submission test case is a 
61% performance improvement. This is so much that I'm actually still 
searching if that is not caused by bug somewhere.

Will probably need some more weeks till this is done, but yeah there is 
a huge potential for optimization here,
Christian.

>
> Almost certainly more shaving if you stare.
> -Chris
diff mbox series

Patch

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index ab4a456bba04..367b71084d34 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -122,26 +122,18 @@  EXPORT_SYMBOL(dma_fence_context_alloc);
  */
 int dma_fence_signal_locked(struct dma_fence *fence)
 {
-	int ret = 0;
-
-	lockdep_assert_held(fence->lock);
-
 	if (WARN_ON(!fence))
 		return -EINVAL;
 
-	if (!__dma_fence_signal(fence)) {
-		ret = -EINVAL;
+	lockdep_assert_held(fence->lock);
 
-		/*
-		 * we might have raced with the unlocked dma_fence_signal,
-		 * still run through all callbacks
-		 */
-	} else {
-		__dma_fence_signal__timestamp(fence, ktime_get());
-	}
+	if (!__dma_fence_signal(fence))
+		return -EINVAL;
 
+	__dma_fence_signal__timestamp(fence, ktime_get());
 	__dma_fence_signal__notify(fence);
-	return ret;
+
+	return 0;
 }
 EXPORT_SYMBOL(dma_fence_signal_locked);
 
@@ -161,19 +153,19 @@  EXPORT_SYMBOL(dma_fence_signal_locked);
 int dma_fence_signal(struct dma_fence *fence)
 {
 	unsigned long flags;
+	int ret;
 
 	if (!fence)
 		return -EINVAL;
 
-	if (!__dma_fence_signal(fence))
-		return -EINVAL;
-
-	__dma_fence_signal__timestamp(fence, ktime_get());
+	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
+		return 0;
 
 	spin_lock_irqsave(fence->lock, flags);
-	__dma_fence_signal__notify(fence);
+	ret = dma_fence_signal_locked(fence);
 	spin_unlock_irqrestore(fence->lock, flags);
-	return 0;
+
+	return ret;
 }
 EXPORT_SYMBOL(dma_fence_signal);