diff mbox

[7/7] drm/nouveau: allow asynchronous waiting using gart fences

Message ID 54085057.6070904@canonical.com (mailing list archive)
State New, archived
Headers show

Commit Message

Maarten Lankhorst Sept. 4, 2014, 11:43 a.m. UTC
This requires allocating the fence earlier, so that any cross-device
fences it has to wait on can be attached to it, and making sure that
enough memory is available before the fence is emitted.

The current seqno is written to the GART bo on completion,
and a list of finished fences is kept to allow arbitrary depth.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c      |  28 ++--
 drivers/gpu/drm/nouveau/nouveau_chan.c    |   6 +-
 drivers/gpu/drm/nouveau/nouveau_display.c |  45 ++++---
 drivers/gpu/drm/nouveau/nouveau_fence.c   | 212 ++++++++++++++++++++++++++----
 drivers/gpu/drm/nouveau/nouveau_fence.h   |  29 ++--
 drivers/gpu/drm/nouveau/nouveau_gem.c     |  25 ++--
 drivers/gpu/drm/nouveau/nv04_fence.c      |   9 +-
 drivers/gpu/drm/nouveau/nv10_fence.c      |   9 +-
 drivers/gpu/drm/nouveau/nv84_fence.c      |  31 +++--
 drivers/gpu/drm/nouveau/nvc0_fence.c      |   4 +-
 10 files changed, 305 insertions(+), 93 deletions(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index f89b4a7c93fe..24c941927926 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -970,21 +970,21 @@  nouveau_bo_move_m2mf(struct ttm_buffer_object *bo, int evict, bool intr,
 	}
 
 	mutex_lock_nested(&cli->mutex, SINGLE_DEPTH_NESTING);
-	ret = nouveau_fence_sync(nouveau_bo(bo), chan, true);
-	if (ret == 0) {
+	ret = nouveau_fence_new(chan, &fence);
+	if (ret)
+		goto out;
+
+	ret = nouveau_fence_sync(nouveau_bo(bo), fence, true);
+	if (ret == 0)
 		ret = drm->ttm.move(chan, bo, &bo->mem, new_mem);
-		if (ret == 0) {
-			ret = nouveau_fence_new(chan, false, &fence);
-			if (ret == 0) {
-				ret = ttm_bo_move_accel_cleanup(bo,
-								&fence->base,
-								evict,
-								no_wait_gpu,
-								new_mem);
-				nouveau_fence_unref(&fence);
-			}
-		}
-	}
+	if (ret == 0)
+		ret = nouveau_fence_emit(fence);
+	if (ret == 0)
+		ret = ttm_bo_move_accel_cleanup(bo, &fence->base, evict,
+						no_wait_gpu, new_mem);
+	nouveau_fence_unref(&fence);
+
+out:
 	mutex_unlock(&cli->mutex);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c
index d639750379d6..1e5c76dfed3a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_chan.c
+++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
@@ -46,9 +46,11 @@  nouveau_channel_idle(struct nouveau_channel *chan)
 	struct nouveau_fence *fence = NULL;
 	int ret;
 
-	ret = nouveau_fence_new(chan, false, &fence);
+	ret = nouveau_fence_new(chan, &fence);
 	if (!ret) {
-		ret = nouveau_fence_wait(fence, false, false);
+		ret = nouveau_fence_emit(fence);
+		if (!ret)
+			ret = nouveau_fence_wait(fence, false, false);
 		nouveau_fence_unref(&fence);
 	}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c
index a9ec525c0994..adbf870686aa 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.c
+++ b/drivers/gpu/drm/nouveau/nouveau_display.c
@@ -26,6 +26,7 @@ 
 
 #include <drm/drmP.h>
 #include <drm/drm_crtc_helper.h>
+#include <drm/ttm/ttm_execbuf_util.h>
 
 #include <nvif/class.h>
 
@@ -36,7 +37,6 @@ 
 #include "nouveau_gem.h"
 #include "nouveau_connector.h"
 #include "nv50_display.h"
-
 #include "nouveau_fence.h"
 
 #include <nvif/event.h>
@@ -644,7 +644,7 @@  nouveau_page_flip_emit(struct nouveau_channel *chan,
 		       struct nouveau_bo *old_bo,
 		       struct nouveau_bo *new_bo,
 		       struct nouveau_page_flip_state *s,
-		       struct nouveau_fence **pfence)
+		       struct nouveau_fence *fence)
 {
 	struct nouveau_fence_chan *fctx = chan->fence;
 	struct nouveau_drm *drm = chan->drm;
@@ -657,11 +657,6 @@  nouveau_page_flip_emit(struct nouveau_channel *chan,
 	list_add_tail(&s->head, &fctx->flip);
 	spin_unlock_irqrestore(&dev->event_lock, flags);
 
-	/* Synchronize with the old framebuffer */
-	ret = nouveau_fence_sync(old_bo, chan, false);
-	if (ret)
-		goto fail;
-
 	/* Emit the pageflip */
 	ret = RING_SPACE(chan, 2);
 	if (ret)
@@ -674,7 +669,7 @@  nouveau_page_flip_emit(struct nouveau_channel *chan,
 	OUT_RING  (chan, 0x00000000);
 	FIRE_RING (chan);
 
-	ret = nouveau_fence_new(chan, false, pfence);
+	ret = nouveau_fence_emit(fence);
 	if (ret)
 		goto fail;
 
@@ -700,6 +695,12 @@  nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb,
 	struct nouveau_cli *cli;
 	struct nouveau_fence *fence;
 	int ret;
+	struct ttm_validate_buffer resv[2] = {
+		{ .bo = &old_bo->bo },
+		{ .bo = &new_bo->bo },
+	};
+	struct ww_acquire_ctx ticket;
+	LIST_HEAD(res);
 
 	chan = drm->channel;
 	if (!chan)
@@ -714,28 +715,31 @@  nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb,
 		ret = nouveau_bo_pin(new_bo, TTM_PL_FLAG_VRAM);
 		if (ret)
 			goto fail_free;
+		list_add(&resv[1].head, &res);
 	}
+	list_add(&resv[0].head, &res);
 
 	mutex_lock(&cli->mutex);
-	ret = ttm_bo_reserve(&new_bo->bo, true, false, false, NULL);
+	ret = nouveau_fence_new(chan, &fence);
 	if (ret)
 		goto fail_unpin;
 
-	/* synchronise rendering channel with the kernel's channel */
-	ret = nouveau_fence_sync(new_bo, chan, false);
-	if (ret) {
-		ttm_bo_unreserve(&new_bo->bo);
+	ret = ttm_eu_reserve_buffers(&ticket, &res, true);
+	if (ret)
 		goto fail_unpin;
-	}
 
 	if (new_bo != old_bo) {
-		ttm_bo_unreserve(&new_bo->bo);
-
-		ret = ttm_bo_reserve(&old_bo->bo, true, false, false, NULL);
+		/* synchronise rendering channel with the kernel's channel */
+		ret = nouveau_fence_sync(new_bo, fence, false);
 		if (ret)
-			goto fail_unpin;
+			goto fail_unreserve;
 	}
 
+	/* Synchronize with the old framebuffer */
+	ret = nouveau_fence_sync(old_bo, fence, false);
+	if (ret)
+		goto fail_unreserve;
+
 	/* Initialize a page flip struct */
 	*s = (struct nouveau_page_flip_state)
 		{ { }, event, nouveau_crtc(crtc)->index,
@@ -772,7 +776,7 @@  nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb,
 		nouveau_bo_ref(new_bo, &dispnv04->image[head]);
 	}
 
-	ret = nouveau_page_flip_emit(chan, old_bo, new_bo, s, &fence);
+	ret = nouveau_page_flip_emit(chan, old_bo, new_bo, s, fence);
 	if (ret)
 		goto fail_unreserve;
 	mutex_unlock(&cli->mutex);
@@ -781,7 +785,7 @@  nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb,
 	crtc->primary->fb = fb;
 
 	nouveau_bo_fence(old_bo, fence, false);
-	ttm_bo_unreserve(&old_bo->bo);
+	ttm_eu_backoff_reservation(&ticket, &res);
 	if (old_bo != new_bo)
 		nouveau_bo_unpin(old_bo);
 	nouveau_fence_unref(&fence);
@@ -792,6 +796,7 @@  fail_unreserve:
 	ttm_bo_unreserve(&old_bo->bo);
 fail_unpin:
 	mutex_unlock(&cli->mutex);
+	nouveau_fence_unref(&fence);
 	if (old_bo != new_bo)
 		nouveau_bo_unpin(new_bo);
 fail_free:
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 574517a396fd..b1a1f0bfbe5a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -143,6 +143,8 @@  nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_cha
 	struct nouveau_fence_priv *priv = (void*)chan->drm->fence;
 	int ret;
 
+	spin_lock_init(&fctx->trigger_lock);
+	INIT_LIST_HEAD(&fctx->triggers);
 	INIT_LIST_HEAD(&fctx->flip);
 	INIT_LIST_HEAD(&fctx->pending);
 	spin_lock_init(&fctx->lock);
@@ -218,33 +220,128 @@  err:
 	func(data);
 }
 
+static void nouveau_fence_cpu_triggered(struct nouveau_fence *fence)
+{
+	struct nouveau_channel *chan = fence->channel;
+	struct nouveau_fence_chan *fctx = chan->fence;
+	unsigned long flags;
+	bool any_seq = false;
+	u32 seq = ~0U;
+
+	/* unblock fence: a NULL waiters array marks it ready to signal */
+	kfree(fence->waiters);
+	fence->waiters = NULL;
+
+	/* irqsave: also reached from nouveau_fence_emit_waiters(), irqs on */
+	spin_lock_irqsave(&fctx->trigger_lock, flags);
+
+	/*
+	 * signal all fences for which waiters == NULL until the
+	 * first entry is found for which this is not true.
+	 * This allows the wait >= seq op to work correctly on sysmem.
+	 */
+	while (!list_empty(&fctx->triggers)) {
+		struct nouveau_fence *chk = list_first_entry(&fctx->triggers,
+					struct nouveau_fence, trigger);
+
+		if (chk->waiters)
+			break;
+
+		any_seq = true;
+		seq = chk->base.seqno;
+		list_del(&chk->trigger);
+		fence_put(&chk->base);
+	}
+
+	if (any_seq)
+		fctx->signal_sysmem(chan, seq);
+
+	spin_unlock_irqrestore(&fctx->trigger_lock, flags);
+}
+
+static void nouveau_fence_cpu_trigger(struct fence *other_fence,
+				      struct fence_cb *fence_cb)
+{
+	struct nouveau_fence_cb *cb = (struct nouveau_fence_cb*)fence_cb;
+	struct nouveau_fence *fence = (struct nouveau_fence *)cb->fence;
+	/* fence_add_callback() hook on other_fence: one reader fewer */
+#ifdef CONFIG_FENCE_TRACE
+	int ret = atomic_dec_return(&fence->readers);
+
+	if (ret)
+		FENCE_TRACE(&fence->base, "triggered from %u#%u, %i remaining\n",
+			    other_fence->context, other_fence->seqno, ret);
+	else
+#else
+	if (atomic_dec_and_test(&fence->readers))
+#endif
+	{
+		FENCE_TRACE(&fence->base, "triggered from %u#%u, starting work\n",
+			    other_fence->context, other_fence->seqno);
+		/* readers hit zero: no registered callback is outstanding */
+		nouveau_fence_cpu_triggered(fence);
+	}
+}
+
+static void
+nouveau_fence_emit_waiters(struct nouveau_fence *fence,
+			   struct nouveau_fence_chan *fctx)
+{
+	unsigned idx, nr_done = 0;
+
+	/* one reader per waiter slot; nouveau_fence_cpu_trigger counts down */
+	atomic_set(&fence->readers, fence->num_waiters);
+
+	/* the triggers list keeps its own reference on the fence */
+	fence_get(&fence->base);
+	spin_lock_irq(&fctx->trigger_lock);
+	list_add_tail(&fence->trigger, &fctx->triggers);
+	spin_unlock_irq(&fctx->trigger_lock);
+
+	for (idx = 0; idx < fence->num_waiters; ++idx) {
+		struct nouveau_fence_cb *cb = &fence->waiters[idx];
+		struct fence *other = cb->fence;
+
+		if (!other) {
+			nr_done++;
+			continue;
+		}
+		cb->fence = &fence->base;
+		trace_fence_annotate_wait_on(&fence->base, other);
+		FENCE_TRACE(&fence->base, "queued wait on %u#%u\n",
+			    other->context, other->seqno);
+		if (fence_add_callback(other, &cb->base,
+				       nouveau_fence_cpu_trigger))
+			nr_done++;
+	}
+
+	if (nr_done && atomic_sub_and_test(nr_done, &fence->readers)) {
+		FENCE_TRACE(&fence->base, "No triggers, starting..\n");
+		nouveau_fence_cpu_triggered(fence);
+	}
+}
+
 int
-nouveau_fence_emit(struct nouveau_fence *fence, struct nouveau_channel *chan)
+nouveau_fence_emit(struct nouveau_fence *fence)
 {
+	struct nouveau_channel *chan = fence->channel;
 	struct nouveau_fence_chan *fctx = chan->fence;
-	struct nouveau_fence_priv *priv = (void*)chan->drm->fence;
 	int ret;
 
-	fence->channel  = chan;
+	WARN(fence->head.next, "fence is emitted twice!\n");
 	fence->timeout  = jiffies + (15 * HZ);
 
-	if (priv->uevent)
-		fence_init(&fence->base, &nouveau_fence_ops_uevent,
-			   &fctx->lock,
-			   priv->context_base + chan->chid, ++fctx->sequence);
-	else
-		fence_init(&fence->base, &nouveau_fence_ops_legacy,
-			   &fctx->lock,
-			   priv->context_base + chan->chid, ++fctx->sequence);
-
 	trace_fence_emit(&fence->base);
-	ret = fctx->emit(fence);
+	ret = fctx->emit(fence, false);
 	if (!ret) {
 		fence_get(&fence->base);
 		spin_lock_irq(&fctx->lock);
 		nouveau_fence_update(chan, fctx);
 		list_add_tail(&fence->head, &fctx->pending);
 		spin_unlock_irq(&fctx->lock);
+
+		if (fence->num_waiters)
+			nouveau_fence_emit_waiters(fence, fctx);
 	}
 
 	return ret;
@@ -345,9 +442,58 @@  nouveau_fence_wait(struct nouveau_fence *fence, bool lazy, bool intr)
 		return 0;
 }
 
+static int nouveau_fence_reserve_waiter(struct nouveau_fence *fence)
+{
+	struct nouveau_fence_cb *grown;
+	int cap;
+
+	/* room for one more entry already: nothing to allocate */
+	if (fence->num_waiters < fence->max_waiters)
+		return 0;
+
+	/* first allocation holds 8 entries, later ones double the array */
+	cap = fence->max_waiters ? fence->max_waiters * 2 : 8;
+	grown = krealloc(fence->waiters, cap * sizeof(*grown), GFP_KERNEL);
+	if (!grown)
+		return -ENOMEM;
+	fence->waiters = grown;
+	fence->max_waiters = cap;
+	return 0;
+}
+
+static int nouveau_fence_add_fence_list(struct nouveau_fence *fence,
+					 struct fence *victim)
+{
+	struct nouveau_fence_cb *slot = NULL;
+	unsigned idx;
+	int err;
+
+	/* one entry per fence context; reuse a NULL entry before appending */
+	for (idx = 0; idx < fence->num_waiters; ++idx) {
+		struct nouveau_fence_cb *cur = &fence->waiters[idx];
+
+		if (cur->fence && cur->fence->context == victim->context) {
+			cur->fence = fence_later(cur->fence, victim);
+			return 0;
+		}
+		if (!cur->fence)
+			slot = cur;
+	}
+
+	if (!slot) {
+		err = nouveau_fence_reserve_waiter(fence);
+		if (err)
+			return err;
+		slot = &fence->waiters[fence->num_waiters++];
+	}
+	slot->fence = victim;
+	return 0;
+}
+
 int
-nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool exclusive)
+nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_fence *nvfence, bool exclusive)
 {
+	struct nouveau_channel *chan = nvfence->channel;
 	struct nouveau_fence_chan *fctx = chan->fence;
 	struct fence *fence;
 	struct reservation_object *resv = nvbo->bo.resv;
@@ -371,6 +517,8 @@  nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		f = nouveau_local_fence(fence, chan->drm);
 		if (f)
 			prev = f->channel;
+		else if (fctx->signal_sysmem)
+			return nouveau_fence_add_fence_list(nvfence, fence);
 
 		if (!prev || (prev != chan && (ret = fctx->sync(f, prev, chan))))
 			ret = fence_wait(fence, true);
@@ -390,6 +538,11 @@  nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		f = nouveau_local_fence(fence, chan->drm);
 		if (f)
 			prev = f->channel;
+		else if (fctx->signal_sysmem) {
+			ret = nouveau_fence_add_fence_list(nvfence, fence);
+			if (ret)
+				break;
+		}
 
 		if (!prev || (ret = fctx->sync(f, prev, chan)))
 			ret = fence_wait(fence, true);
@@ -404,15 +557,22 @@  nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 void
 nouveau_fence_unref(struct nouveau_fence **pfence)
 {
-	if (*pfence)
-		fence_put(&(*pfence)->base);
+	struct nouveau_fence *fence = *pfence;
+
+	if (!fence)
+		return;
+
 	*pfence = NULL;
+	fence_put(&fence->base);
 }
 
 int
-nouveau_fence_new(struct nouveau_channel *chan, bool sysmem,
+nouveau_fence_new(struct nouveau_channel *chan,
 		  struct nouveau_fence **pfence)
 {
+	struct nouveau_fifo_chan *fifo = (void*)chan->object;
+	struct nouveau_fence_priv *priv = (void*)chan->drm->fence;
+	struct nouveau_fence_chan *fctx = chan->fence;
 	struct nouveau_fence *fence;
 	int ret = 0;
 
@@ -423,11 +583,11 @@  nouveau_fence_new(struct nouveau_channel *chan, bool sysmem,
 	if (!fence)
 		return -ENOMEM;
 
-	fence->sysmem = sysmem;
+	fence->channel = chan;
 
-	ret = nouveau_fence_emit(fence, chan);
-	if (ret)
-		nouveau_fence_unref(&fence);
+	fence_init(&fence->base, priv->uevent ? &nouveau_fence_ops_uevent :
+		     &nouveau_fence_ops_legacy, &fctx->lock,
+		     priv->context_base + fifo->chid, ++fctx->sequence);
 
 	*pfence = fence;
 	return ret;
@@ -486,13 +646,21 @@  static bool nouveau_fence_no_signaling(struct fence *f)
 	return true;
 }
 
+static void nouveau_fence_release(struct fence *f)
+{
+	struct nouveau_fence *nvfence = from_fence(f);
+	/* the waiters array is a separate allocation; free it first */
+	kfree(nvfence->waiters);
+	fence_free(f);
+}
+
 static const struct fence_ops nouveau_fence_ops_legacy = {
 	.get_driver_name = nouveau_fence_get_get_driver_name,
 	.get_timeline_name = nouveau_fence_get_timeline_name,
 	.enable_signaling = nouveau_fence_no_signaling,
 	.signaled = nouveau_fence_is_signaled,
 	.wait = nouveau_fence_wait_legacy,
-	.release = NULL
+	.release = nouveau_fence_release
 };
 
 static bool nouveau_fence_enable_signaling(struct fence *f)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 986c8135e564..f2a56c940a2c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -12,33 +12,41 @@  struct nouveau_fence {
 
 	struct list_head head;
 
-	bool sysmem;
-
 	struct nouveau_channel *channel;
 	unsigned long timeout;
+
+	atomic_t readers;
+	struct list_head trigger;
+	struct nouveau_fence_cb {
+		struct fence_cb base;
+		struct fence *fence;
+	} *waiters;
+	int num_waiters, max_waiters;
 };
 
-int  nouveau_fence_new(struct nouveau_channel *, bool sysmem,
+int  nouveau_fence_new(struct nouveau_channel *,
 		       struct nouveau_fence **);
 void nouveau_fence_unref(struct nouveau_fence **);
 
-int  nouveau_fence_emit(struct nouveau_fence *, struct nouveau_channel *);
+int  nouveau_fence_emit(struct nouveau_fence *);
 bool nouveau_fence_done(struct nouveau_fence *);
 void nouveau_fence_work(struct fence *, void (*)(void *), void *);
 int  nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
-int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive);
+int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_fence *fence, bool exclusive);
 
 struct nouveau_fence_chan {
 	spinlock_t lock;
 	struct list_head pending;
 	struct list_head flip;
 
-	int  (*emit)(struct nouveau_fence *);
+	spinlock_t trigger_lock;
+	struct list_head triggers;
+
+	int  (*emit)(struct nouveau_fence *, bool);
 	int  (*sync)(struct nouveau_fence *, struct nouveau_channel *,
 		     struct nouveau_channel *);
 	u32  (*read)(struct nouveau_channel *);
-	int  (*emit32)(struct nouveau_channel *, u64, u32);
-	int  (*sync32)(struct nouveau_channel *, u64, u32);
+	void (*signal_sysmem)(struct nouveau_channel *, u32 seq);
 
 	u32 sequence;
 	u32 context;
@@ -67,7 +75,7 @@  void nouveau_fence_context_del(struct nouveau_fence_chan *);
 int nv04_fence_create(struct nouveau_drm *);
 int nv04_fence_mthd(struct nouveau_channel *, u32, u32, u32);
 
-int  nv10_fence_emit(struct nouveau_fence *);
+int  nv10_fence_emit(struct nouveau_fence *, bool sysmem);
 int  nv17_fence_sync(struct nouveau_fence *, struct nouveau_channel *,
 		     struct nouveau_channel *);
 u32  nv10_fence_read(struct nouveau_channel *);
@@ -86,6 +94,9 @@  int nouveau_flip_complete(void *chan);
 
 struct nv84_fence_chan {
 	struct nouveau_fence_chan base;
+	int  (*emit32)(struct nouveau_channel *, u64, u32);
+	int  (*sync32)(struct nouveau_channel *, u64, u32);
+
 	struct nouveau_vma vma;
 	struct nouveau_vma vma_gart;
 	struct nouveau_vma dispc_vma[4];
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 1bc4eb33b60f..e6f11a60c453 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -433,7 +433,7 @@  retry:
 static int
 validate_list(struct nouveau_channel *chan, struct nouveau_cli *cli,
 	      struct list_head *list, struct drm_nouveau_gem_pushbuf_bo *pbbo,
-	      uint64_t user_pbbo_ptr)
+	      uint64_t user_pbbo_ptr, struct nouveau_fence *fence)
 {
 	struct nouveau_drm *drm = chan->drm;
 	struct drm_nouveau_gem_pushbuf_bo __user *upbbo =
@@ -459,7 +459,7 @@  validate_list(struct nouveau_channel *chan, struct nouveau_cli *cli,
 			return ret;
 		}
 
-		ret = nouveau_fence_sync(nvbo, chan, !!b->write_domains);
+		ret = nouveau_fence_sync(nvbo, fence, !!b->write_domains);
 		if (unlikely(ret)) {
 			if (ret != -ERESTARTSYS)
 				NV_PRINTK(error, cli, "fail post-validate sync\n");
@@ -496,7 +496,8 @@  nouveau_gem_pushbuf_validate(struct nouveau_channel *chan,
 			     struct drm_file *file_priv,
 			     struct drm_nouveau_gem_pushbuf_bo *pbbo,
 			     uint64_t user_buffers, int nr_buffers,
-			     struct validate_op *op, int *apply_relocs)
+			     struct validate_op *op, int *apply_relocs,
+			     struct nouveau_fence *fence)
 {
 	struct nouveau_cli *cli = nouveau_cli(file_priv);
 	int ret;
@@ -513,7 +514,7 @@  nouveau_gem_pushbuf_validate(struct nouveau_channel *chan,
 		return ret;
 	}
 
-	ret = validate_list(chan, cli, &op->list, pbbo, user_buffers);
+	ret = validate_list(chan, cli, &op->list, pbbo, user_buffers, fence);
 	if (unlikely(ret < 0)) {
 		if (ret != -ERESTARTSYS)
 			NV_PRINTK(error, cli, "validating bo list\n");
@@ -707,9 +708,14 @@  nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data,
 		}
 	}
 
+	ret = nouveau_fence_new(chan, &fence);
+	if (ret)
+		goto out_prevalid;
+
 	/* Validate buffer list */
 	ret = nouveau_gem_pushbuf_validate(chan, file_priv, bo, req->buffers,
-					   req->nr_buffers, &op, &do_reloc);
+					   req->nr_buffers, &op, &do_reloc,
+					   fence);
 	if (ret) {
 		if (ret != -ERESTARTSYS)
 			NV_PRINTK(error, cli, "validate: %d\n", ret);
@@ -793,18 +799,21 @@  nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data,
 		}
 	}
 
-	ret = nouveau_fence_new(chan, false, &fence);
+	ret = nouveau_fence_emit(fence);
 	if (ret) {
 		NV_PRINTK(error, cli, "error fencing pushbuf: %d\n", ret);
 		WIND_RING(chan);
 		goto out;
 	}
 
-out:
 	validate_fini(&op, fence, bo);
-	nouveau_fence_unref(&fence);
+
+out:
+	if (ret)
+		validate_fini(&op, NULL, bo);
 
 out_prevalid:
+	nouveau_fence_unref(&fence);
 	u_free(bo);
 	u_free(push);
 
diff --git a/drivers/gpu/drm/nouveau/nv04_fence.c b/drivers/gpu/drm/nouveau/nv04_fence.c
index 4484131d826a..de4d69166a37 100644
--- a/drivers/gpu/drm/nouveau/nv04_fence.c
+++ b/drivers/gpu/drm/nouveau/nv04_fence.c
@@ -35,10 +35,15 @@  struct nv04_fence_priv {
 };
 
 static int
-nv04_fence_emit(struct nouveau_fence *fence)
+nv04_fence_emit(struct nouveau_fence *fence, bool sysmem)
 {
 	struct nouveau_channel *chan = fence->channel;
-	int ret = RING_SPACE(chan, 2);
+	int ret;
+
+	if (sysmem)
+		return -ENODEV;
+
+	ret = RING_SPACE(chan, 2);
 	if (ret == 0) {
 		BEGIN_NV04(chan, NvSubSw, 0x0150, 1);
 		OUT_RING  (chan, fence->base.seqno);
diff --git a/drivers/gpu/drm/nouveau/nv10_fence.c b/drivers/gpu/drm/nouveau/nv10_fence.c
index 737d066ffc60..1608b0acfe0b 100644
--- a/drivers/gpu/drm/nouveau/nv10_fence.c
+++ b/drivers/gpu/drm/nouveau/nv10_fence.c
@@ -27,10 +27,15 @@ 
 #include "nv10_fence.h"
 
 int
-nv10_fence_emit(struct nouveau_fence *fence)
+nv10_fence_emit(struct nouveau_fence *fence, bool sysmem)
 {
 	struct nouveau_channel *chan = fence->channel;
-	int ret = RING_SPACE(chan, 2);
+	int ret;
+
+	if (sysmem)
+		return -ENODEV;
+
+	ret = RING_SPACE(chan, 2);
 	if (ret == 0) {
 		BEGIN_NV04(chan, 0, NV10_SUBCHAN_REF_CNT, 1);
 		OUT_RING  (chan, fence->base.seqno);
diff --git a/drivers/gpu/drm/nouveau/nv84_fence.c b/drivers/gpu/drm/nouveau/nv84_fence.c
index 7b372a68aa4e..84fc0c3c5c9a 100644
--- a/drivers/gpu/drm/nouveau/nv84_fence.c
+++ b/drivers/gpu/drm/nouveau/nv84_fence.c
@@ -71,18 +71,18 @@  nv84_fence_sync32(struct nouveau_channel *chan, u64 virtual, u32 sequence)
 }
 
 static int
-nv84_fence_emit(struct nouveau_fence *fence)
+nv84_fence_emit(struct nouveau_fence *fence, bool sysmem)
 {
 	struct nouveau_channel *chan = fence->channel;
 	struct nv84_fence_chan *fctx = chan->fence;
 	u64 addr = chan->chid * 16;
 
-	if (fence->sysmem)
+	if (sysmem)
 		addr += fctx->vma_gart.offset;
 	else
 		addr += fctx->vma.offset;
 
-	return fctx->base.emit32(chan, addr, fence->base.seqno);
+	return fctx->emit32(chan, addr, fence->base.seqno);
 }
 
 static int
@@ -92,12 +92,9 @@  nv84_fence_sync(struct nouveau_fence *fence,
 	struct nv84_fence_chan *fctx = chan->fence;
 	u64 addr = prev->chid * 16;
 
-	if (fence->sysmem)
-		addr += fctx->vma_gart.offset;
-	else
-		addr += fctx->vma.offset;
+	addr += fctx->vma.offset;
 
-	return fctx->base.sync32(chan, addr, fence->base.seqno);
+	return fctx->sync32(chan, addr, fence->base.seqno);
 }
 
 static u32
@@ -108,6 +105,15 @@  nv84_fence_read(struct nouveau_channel *chan)
 }
 
 static void
+nv84_fence_signal_sysmem(struct nouveau_channel *chan, u32 seq)
+{
+	struct nouveau_fifo_chan *fifo = (void *)chan->object;
+	struct nv84_fence_priv *priv = chan->drm->fence;
+	/* CPU-side write of seq into this channel's entry of priv->bo_gart */
+	nouveau_bo_wr32(priv->bo_gart, fifo->chid * 16/4, seq);
+}
+
+static void
 nv84_fence_context_del(struct nouveau_channel *chan)
 {
 	struct drm_device *dev = chan->drm->dev;
@@ -140,12 +146,15 @@  nv84_fence_context_new(struct nouveau_channel *chan)
 		return -ENOMEM;
 
 	nouveau_fence_context_new(chan, &fctx->base);
+
 	fctx->base.emit = nv84_fence_emit;
 	fctx->base.sync = nv84_fence_sync;
 	fctx->base.read = nv84_fence_read;
-	fctx->base.emit32 = nv84_fence_emit32;
-	fctx->base.sync32 = nv84_fence_sync32;
+	fctx->base.signal_sysmem = nv84_fence_signal_sysmem;
 	fctx->base.sequence = nv84_fence_read(chan);
+	nouveau_bo_wr32(priv->bo_gart, chan->chid * 16/4, fctx->base.sequence);
+	fctx->emit32 = nv84_fence_emit32;
+	fctx->sync32 = nv84_fence_sync32;
 
 	ret = nouveau_bo_vma_add(priv->bo, cli->vm, &fctx->vma);
 	if (ret == 0) {
@@ -159,8 +168,6 @@  nv84_fence_context_new(struct nouveau_channel *chan)
 		ret = nouveau_bo_vma_add(bo, cli->vm, &fctx->dispc_vma[i]);
 	}
 
-	nouveau_bo_wr32(priv->bo, chan->chid * 16/4, 0x00000000);
-
 	if (ret)
 		nv84_fence_context_del(chan);
 	return ret;
diff --git a/drivers/gpu/drm/nouveau/nvc0_fence.c b/drivers/gpu/drm/nouveau/nvc0_fence.c
index becf19abda2d..612689a5e35a 100644
--- a/drivers/gpu/drm/nouveau/nvc0_fence.c
+++ b/drivers/gpu/drm/nouveau/nvc0_fence.c
@@ -66,8 +66,8 @@  nvc0_fence_context_new(struct nouveau_channel *chan)
 	int ret = nv84_fence_context_new(chan);
 	if (ret == 0) {
 		struct nv84_fence_chan *fctx = chan->fence;
-		fctx->base.emit32 = nvc0_fence_emit32;
-		fctx->base.sync32 = nvc0_fence_sync32;
+		fctx->emit32 = nvc0_fence_emit32;
+		fctx->sync32 = nvc0_fence_sync32;
 	}
 	return ret;
 }