
[6/7] drm/radeon: allow asynchronous waiting on foreign fences

Message ID 54085031.1080600@canonical.com
State New, archived

Commit Message

Maarten Lankhorst Sept. 4, 2014, 11:42 a.m. UTC
Use the semaphore mechanism to make this happen; this uses signaling
from the CPU instead of signaling by the GPU.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
 drivers/gpu/drm/radeon/radeon.h           |  17 ++-
 drivers/gpu/drm/radeon/radeon_cs.c        |  30 ++---
 drivers/gpu/drm/radeon/radeon_fence.c     |  13 ++-
 drivers/gpu/drm/radeon/radeon_semaphore.c | 184 ++++++++++++++++++++++++++++++
 4 files changed, 221 insertions(+), 23 deletions(-)
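
At its core, the patch attaches a fence callback to every foreign fence in
the reservation object; when the last of them signals, the CPU itself
writes the semaphore value that the waiting ring is blocked on. A condensed
sketch of that trigger path (mirroring radeon_semaphore_cpu_trigger() from
the patch below; the function name is invented and this is not a drop-in
replacement):

static void cpu_trigger_sketch(struct radeon_fence *fence)
{
	/* one callback fires per foreign fence; only the last
	 * one to signal does the actual work */
	if (!atomic_dec_and_test(&fence->num_cpu_cbs))
		return;

	/* the CPU, not the GPU, writes the value that releases
	 * the semaphore wait previously emitted on the ring */
	*fence->cpu_sema = ~0;

	kfree(fence->cpu_cbs);
	fence->cpu_cbs = NULL;
}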

Comments

Christian König Sept. 4, 2014, 11:54 a.m. UTC | #1
On 04.09.2014 at 13:42, Maarten Lankhorst wrote:
> Use the semaphore mechanism to make this happen; this uses signaling
> from the CPU instead of signaling by the GPU.

I'm not sure if this will work reliably when the semaphores are in
system memory. We might need to reserve some VRAM for them instead.

Regards,
Christian.

Maarten Lankhorst Sept. 4, 2014, 12:08 p.m. UTC | #2
Hey,

On 04-09-14 at 13:54, Christian König wrote:
> On 04.09.2014 at 13:42, Maarten Lankhorst wrote:
>> Use the semaphore mechanism to make this happen; this uses signaling
>> from the CPU instead of signaling by the GPU.
>
> I'm not sure if this will work reliably when the semaphores are in system memory. We might need to reserve some VRAM for them instead.
>
> Regards,
> Christian.
Why would it be unreliable? I mostly kept it in the semaphore for simplicity.

~Maarten
Christian König Sept. 4, 2014, 12:25 p.m. UTC | #3
On 04.09.2014 at 14:08, Maarten Lankhorst wrote:
> Hey,
>
> On 04-09-14 at 13:54, Christian König wrote:
>> On 04.09.2014 at 13:42, Maarten Lankhorst wrote:
>>> Use the semaphore mechanism to make this happen; this uses signaling
>>> from the CPU instead of signaling by the GPU.
>> I'm not sure if this will work reliably when the semaphores are in system memory. We might need to reserve some VRAM for them instead.
>>
>> Regards,
>> Christian.
> Why would it be unreliable? I mostly kept it in the semaphore for simplicity.

The semaphore block tries to avoid memory accesses whenever possible.

For example, when a signal for address A arrives, the block doesn't
necessarily write it to memory but instead tries to match it immediately
with a wait for address A. The same is true if a wait for address A
arrives and the semaphore block thinks it already knows the memory value
at address A.

Also, I'm not sure if the semaphore block really polls the memory address
for changes; it might instead just snoop the MC for writes to this
address. Since CPU writes to system memory aren't seen by the GPU MC, the
semaphore block would never know something changed.

I need to check the docs for how to do this correctly,
Christian.

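A rough way to picture the failure mode described above, if the semaphore
block snoops the GPU's memory controller rather than re-reading the
address (purely illustrative pseudo-model; every name and behavior here is
an assumption, not something the docs confirm):

/* hypothetical model of the semaphore block's wait handling */
static bool sema_block_wait_satisfied(uint64_t addr)
{
	/* a signal may be matched internally against the wait,
	 * without the value ever being written to memory */
	if (pending_gpu_signal_matches(addr))
		return true;

	/* otherwise the block may only snoop MC traffic for writes
	 * to addr -- a CPU store to system memory never goes through
	 * the GPU MC, so it would go unnoticed */
	return mc_write_snooped(addr);
}
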
Christian König Sept. 4, 2014, 1:34 p.m. UTC | #4
> I need to check the docs for how to do this correctly,
The docs don't really cover this case.

For the GPU waiting on an address there is an extra document just for
this case, which I don't have at hand right now. But IIRC it was
recommended to use the local memory of the device waiting on the
semaphore. I'm just not sure if that's purely for performance reasons,
to avoid accessing the bus, or if there's a hard and unavoidable
hardware reason to do so.

For the GPU signaling case there is a special bit in the semaphore 
instructions that you need to set if any user outside of the GPU should 
see the write.

In general it is explicitly supported to use semaphores for inter-device
synchronization on the bus (that's what the block is made for), but
they're not intended to be used for synchronization between the CPU and
the device. So I'm not sure if things like cache snooping are
implemented and correctly supported.

Well, I see the feature more as a nice-to-have that needs a bunch of
testing, so I would say either hold the patch back for now, make it
optional to use, or something like that.

Regards,
Christian.

Maarten Lankhorst Sept. 4, 2014, 1:57 p.m. UTC | #5
Hey,

On 04-09-14 at 15:34, Christian König wrote:
> Well, I see the feature more as a nice-to-have that needs a bunch of testing, so I would say either hold the patch back for now, make it optional to use, or something like that.
You're right, it's meant as something 'nice to have'. This is why it came after the patch that exports reservation_object to/from dma-buf. :-)

~Maarten
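
For reference, the per-buffer sync decision the patch makes boils down to
the following (condensed pseudocode of radeon_semaphore_sync_obj() from
the patch below; the helper names and the excl variable are invented for
illustration):

/* condensed from radeon_semaphore_sync_obj() */
if (resv_has_shared_fences(resv)) {
	/* shared fences always get CPU callbacks */
	for_each_shared_fence(resv, fence)
		radeon_semaphore_add_cpu_cb(semaphore, fence);
} else if ((rfence = to_radeon_fence(excl)) && rfence->rdev == rdev) {
	/* a radeon fence on the same device keeps using the
	 * existing GPU-to-GPU semaphore path */
	semaphore->sync_to[rfence->ring] =
		radeon_fence_later(rfence, semaphore->sync_to[rfence->ring]);
} else if (excl) {
	/* foreign exclusive fence: signal the semaphore from the CPU */
	radeon_semaphore_add_cpu_cb(semaphore, excl);
}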

Patch

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index dddb2b7dd752..cd18fa7f801c 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -359,6 +359,11 @@  struct radeon_fence_driver {
 	struct delayed_work		lockup_work;
 };
 
+struct radeon_fence_cb {
+	struct fence_cb base;
+	struct fence *fence;
+};
+
 struct radeon_fence {
 	struct fence base;
 
@@ -368,6 +373,10 @@  struct radeon_fence {
 	unsigned			ring;
 
 	wait_queue_t			fence_wake;
+
+	atomic_t			num_cpu_cbs;
+	struct radeon_fence_cb		*cpu_cbs;
+	uint32_t			*cpu_sema;
 };
 
 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring);
@@ -574,9 +583,11 @@  int radeon_mode_dumb_mmap(struct drm_file *filp,
  */
 struct radeon_semaphore {
 	struct radeon_sa_bo		*sa_bo;
-	signed				waiters;
+	signed				waiters, cpu_waiters, cpu_waiters_max;
 	uint64_t			gpu_addr;
 	struct radeon_fence		*sync_to[RADEON_NUM_RINGS];
+	uint32_t			*cpu_sema;
+	struct radeon_fence_cb		*cpu_cbs;
 };
 
 int radeon_semaphore_create(struct radeon_device *rdev,
@@ -587,6 +598,10 @@  bool radeon_semaphore_emit_wait(struct radeon_device *rdev, int ring,
 				struct radeon_semaphore *semaphore);
 void radeon_semaphore_sync_to(struct radeon_semaphore *semaphore,
 			      struct radeon_fence *fence);
+int radeon_semaphore_sync_obj(struct radeon_device *rdev,
+			      struct radeon_semaphore *semaphore,
+			      struct reservation_object *resv);
+
 int radeon_semaphore_sync_rings(struct radeon_device *rdev,
 				struct radeon_semaphore *semaphore,
 				int waiting_ring);
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 8ad4e2cfae15..b141f5bd029d 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -250,32 +250,16 @@  static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority
 
 static int radeon_cs_sync_rings(struct radeon_cs_parser *p)
 {
-	int i;
-
-	for (i = 0; i < p->nrelocs; i++) {
-		struct reservation_object *resv;
-		struct fence *fence;
-		struct radeon_fence *rfence;
-		int r;
+	int i, ret = 0;
 
+	for (i = 0; !ret && i < p->nrelocs; i++) {
 		if (!p->relocs[i].robj)
 			continue;
 
-		resv = p->relocs[i].robj->tbo.resv;
-		fence = reservation_object_get_excl(resv);
-		if (!fence)
-			continue;
-		rfence = to_radeon_fence(fence);
-		if (!rfence || rfence->rdev != p->rdev) {
-			r = fence_wait(fence, true);
-			if (r)
-				return r;
-			continue;
-		}
-
-		radeon_semaphore_sync_to(p->ib.semaphore, rfence);
+		ret = radeon_semaphore_sync_obj(p->rdev, p->ib.semaphore,
+						p->relocs[i].robj->tbo.resv);
 	}
-	return 0;
+	return ret;
 }
 
 /* XXX: note that this is called from the legacy UMS CS ioctl as well */
@@ -442,6 +426,10 @@  static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
 		 */
 		list_sort(NULL, &parser->validated, cmp_size_smaller_first);
 
+		/* must be called with all reservation_objects still held */
+		radeon_semaphore_free(parser->rdev, &parser->ib.semaphore,
+				      parser->ib.fence);
+
 		ttm_eu_fence_buffer_objects(&parser->ticket,
 					    &parser->validated,
 					    &parser->ib.fence->base);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 0262fe2580d2..7687a7f8f41b 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -142,6 +142,8 @@  int radeon_fence_emit(struct radeon_device *rdev,
 	(*fence)->ring = ring;
 	fence_init(&(*fence)->base, &radeon_fence_ops,
 		   &rdev->fence_queue.lock, rdev->fence_context + ring, seq);
+	(*fence)->cpu_cbs = NULL;
+	(*fence)->cpu_sema = NULL;
 	radeon_fence_ring_emit(rdev, ring, *fence);
 	trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
 	radeon_fence_schedule_check(rdev, ring);
@@ -1057,11 +1059,20 @@  static signed long radeon_fence_default_wait(struct fence *f, bool intr,
 	return t;
 }
 
+static void __radeon_fence_destroy(struct fence *f)
+{
+	struct radeon_fence *fence = to_radeon_fence(f);
+
+	WARN_ON(fence->cpu_cbs);
+	kfree(fence->cpu_cbs);
+	fence_free(f);
+}
+
 const struct fence_ops radeon_fence_ops = {
 	.get_driver_name = radeon_fence_get_driver_name,
 	.get_timeline_name = radeon_fence_get_timeline_name,
 	.enable_signaling = radeon_fence_enable_signaling,
 	.signaled = radeon_fence_is_signaled,
 	.wait = radeon_fence_default_wait,
-	.release = NULL,
+	.release = __radeon_fence_destroy,
 };
diff --git a/drivers/gpu/drm/radeon/radeon_semaphore.c b/drivers/gpu/drm/radeon/radeon_semaphore.c
index 56d9fd66d8ae..2e71463d11c5 100644
--- a/drivers/gpu/drm/radeon/radeon_semaphore.c
+++ b/drivers/gpu/drm/radeon/radeon_semaphore.c
@@ -30,6 +30,7 @@ 
 #include <drm/drmP.h>
 #include "radeon.h"
 #include "radeon_trace.h"
+#include <trace/events/fence.h>
 
 int radeon_semaphore_create(struct radeon_device *rdev,
 			    struct radeon_semaphore **semaphore)
@@ -49,7 +50,11 @@  int radeon_semaphore_create(struct radeon_device *rdev,
 		return r;
 	}
 	(*semaphore)->waiters = 0;
+	(*semaphore)->cpu_waiters = 0;
+	(*semaphore)->cpu_waiters_max = 0;
 	(*semaphore)->gpu_addr = radeon_sa_bo_gpu_addr((*semaphore)->sa_bo);
+	(*semaphore)->cpu_sema = NULL;
+	(*semaphore)->cpu_cbs = NULL;
 
 	cpu_addr = radeon_sa_bo_cpu_addr((*semaphore)->sa_bo);
 	for (i = 0; i < RADEON_NUM_SYNCS; ++i)
@@ -115,6 +120,101 @@  void radeon_semaphore_sync_to(struct radeon_semaphore *semaphore,
         semaphore->sync_to[fence->ring] = radeon_fence_later(fence, other);
 }
 
+int radeon_semaphore_reserve_cpu_waiters(struct radeon_semaphore *semaphore, int add)
+{
+	int max = 4;
+	struct radeon_fence_cb *cpu_cbs;
+
+	if (semaphore->cpu_waiters + add <= semaphore->cpu_waiters_max)
+		return 0;
+
+	if (semaphore->cpu_waiters_max)
+		max = semaphore->cpu_waiters_max * 2;
+
+	cpu_cbs = krealloc(semaphore->cpu_cbs, max * sizeof(*cpu_cbs), GFP_KERNEL);
+	if (!cpu_cbs)
+		return -ENOMEM;
+	semaphore->cpu_cbs = cpu_cbs;
+	semaphore->cpu_waiters_max = max;
+	return 0;
+}
+
+static void radeon_semaphore_add_cpu_cb(struct radeon_semaphore *semaphore,
+					struct fence *fence)
+{
+	unsigned i;
+	struct radeon_fence_cb *empty = NULL;
+
+	for (i = 0; i < semaphore->cpu_waiters; ++i) {
+		struct fence *other = semaphore->cpu_cbs[i].fence;
+
+		if (!other)
+			empty = &semaphore->cpu_cbs[i];
+		else if (other->context == fence->context) {
+			semaphore->cpu_cbs[i].fence = fence_later(other, fence);
+			return;
+		}
+	}
+
+	if (!empty)
+		empty = &semaphore->cpu_cbs[semaphore->cpu_waiters++];
+
+	empty->fence = fence;
+	return;
+}
+
+/**
+ * radeon_semaphore_sync_obj - use the semaphore to sync to a bo
+ *
+ * @semaphore: semaphore object to add fence to
+ * @resv: the reservation_object to sync to
+ *
+ * Sync the reservation_object using this semaphore.
+ *
+ * radeon_semaphore_free must be called with all reservation_object locks
+ * still held!!!
+ */
+int radeon_semaphore_sync_obj(struct radeon_device *rdev,
+			      struct radeon_semaphore *semaphore,
+			      struct reservation_object *resv)
+{
+	struct fence *fence;
+	struct radeon_fence *rfence;
+	struct reservation_object_list *fobj;
+	int ret, i;
+
+	fobj = reservation_object_get_list(resv);
+	if (fobj && fobj->shared_count) {
+		ret = radeon_semaphore_reserve_cpu_waiters(semaphore, fobj->shared_count);
+		if (ret)
+			return ret;
+		for (i = 0; i < fobj->shared_count; ++i) {
+			fence = rcu_dereference_protected(fobj->shared[i],
+						reservation_object_held(resv));
+
+			radeon_semaphore_add_cpu_cb(semaphore, fence);
+		}
+		return 0;
+	}
+
+	fence = reservation_object_get_excl(resv);
+	if (!fence)
+		return 0;
+
+	rfence = to_radeon_fence(fence);
+	if (rfence && rfence->rdev == rdev) {
+		struct radeon_fence *other = semaphore->sync_to[rfence->ring];
+
+		semaphore->sync_to[rfence->ring] =
+			radeon_fence_later(rfence, other);
+		return 0;
+	}
+	ret = radeon_semaphore_reserve_cpu_waiters(semaphore, 1);
+	if (!ret)
+		radeon_semaphore_add_cpu_cb(semaphore, fence);
+	return ret;
+}
+
 /**
  * radeon_semaphore_sync_rings - sync ring to all registered fences
  *
@@ -124,6 +224,8 @@  void radeon_semaphore_sync_to(struct radeon_semaphore *semaphore,
  *
  * Ensure that all registered fences are signaled before letting
  * the ring continue. The caller must hold the ring lock.
+ *
+ * This function may only be called once on a semaphore.
  */
 int radeon_semaphore_sync_rings(struct radeon_device *rdev,
 				struct radeon_semaphore *semaphore,
@@ -132,6 +234,16 @@  int radeon_semaphore_sync_rings(struct radeon_device *rdev,
 	unsigned count = 0;
 	int i, r;
 
+	if (semaphore->cpu_waiters) {
+		/* allocate enough space for sync command */
+		if (radeon_semaphore_emit_wait(rdev, ring, semaphore)) {
+			semaphore->cpu_sema = radeon_sa_bo_cpu_addr(semaphore->sa_bo);
+			semaphore->gpu_addr += 8;
+			++count;
+		} else
+			semaphore->cpu_waiters = -1;
+	}
+
         for (i = 0; i < RADEON_NUM_RINGS; ++i) {
 		struct radeon_fence *fence = semaphore->sync_to[i];
 
@@ -188,6 +300,68 @@  int radeon_semaphore_sync_rings(struct radeon_device *rdev,
 	return 0;
 }
 
+static void radeon_semaphore_cpu_trigger(struct fence *other_fence,
+					 struct fence_cb *fence_cb)
+{
+	struct radeon_fence_cb *cb = (struct radeon_fence_cb*)fence_cb;
+	struct radeon_fence *fence = (struct radeon_fence *)cb->fence;
+
+#ifdef CONFIG_FENCE_TRACE
+	int ret = atomic_dec_return(&fence->num_cpu_cbs);
+
+	if (ret)
+		FENCE_TRACE(&fence->base, "triggered from %u#%u, %i remaining\n",
+			    ret, other_fence->context, other_fence->seqno);
+	else
+#else
+	if (atomic_dec_and_test(&fence->num_cpu_cbs))
+#endif
+	{
+		FENCE_TRACE(&fence->base, "triggered from %u#%u, starting work\n",
+			    other_fence->context, other_fence->seqno);
+
+		*fence->cpu_sema = ~0;
+
+		kfree(fence->cpu_cbs);
+		fence->cpu_cbs = NULL;
+	}
+}
+
+static void radeon_semaphore_arm_cpu_cbs(struct radeon_semaphore *semaphore,
+					 struct radeon_fence *fence)
+{
+	unsigned i, skipped = 0;
+
+	fence->cpu_cbs = semaphore->cpu_cbs;
+	fence->cpu_sema = semaphore->cpu_sema;
+	atomic_set(&fence->num_cpu_cbs, semaphore->cpu_waiters);
+
+	for (i = 0; i < semaphore->cpu_waiters; ++i) {
+		struct fence *other = fence->cpu_cbs[i].fence;
+
+		if (other) {
+			fence->cpu_cbs[i].fence = &fence->base;
+			trace_fence_annotate_wait_on(&fence->base, other);
+
+			FENCE_TRACE(&fence->base, "queued wait on %u#%u\n",
+				    other->context, other->seqno);
+
+			if (!fence_add_callback(other, &fence->cpu_cbs[i].base,
+						radeon_semaphore_cpu_trigger))
+				continue;
+		}
+		skipped++;
+	}
+
+	if (skipped && atomic_sub_and_test(skipped, &fence->num_cpu_cbs)) {
+		FENCE_TRACE(&fence->base, "No triggers, starting..\n");
+
+		*fence->cpu_sema = ~0;
+		kfree(fence->cpu_cbs);
+		fence->cpu_cbs = NULL;
+	}
+}
+
 void radeon_semaphore_free(struct radeon_device *rdev,
 			   struct radeon_semaphore **semaphore,
 			   struct radeon_fence *fence)
@@ -195,6 +369,16 @@  void radeon_semaphore_free(struct radeon_device *rdev,
 	if (semaphore == NULL || *semaphore == NULL) {
 		return;
 	}
+	if ((*semaphore)->cpu_cbs) {
+		(*semaphore)->waiters--;
+
+		if (!fence) {
+			*(*semaphore)->cpu_sema = ~0U;
+			kfree((*semaphore)->cpu_cbs);
+		} else
+			radeon_semaphore_arm_cpu_cbs(*semaphore, fence);
+	}
+
 	if ((*semaphore)->waiters > 0) {
 		dev_err(rdev->dev, "semaphore %p has more waiters than signalers,"
 			" hardware lockup imminent!\n", *semaphore);