diff mbox

[PATCHv5,RESEND,3/8] gpu: host1x: Add channel support

Message ID 1358250244-9678-4-git-send-email-tbergstrom@nvidia.com (mailing list archive)
State New, archived
Headers show

Commit Message

Terje Bergstrom Jan. 15, 2013, 11:43 a.m. UTC
Add support for host1x client modules, and host1x channels to submit
work to the clients. The work is submitted in GEM CMA buffers, so
this patch adds support for them.

Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/host1x/Kconfig                  |   25 +-
 drivers/gpu/host1x/Makefile                 |    5 +
 drivers/gpu/host1x/cdma.c                   |  439 +++++++++++++++++++
 drivers/gpu/host1x/cdma.h                   |  107 +++++
 drivers/gpu/host1x/channel.c                |  140 ++++++
 drivers/gpu/host1x/channel.h                |   58 +++
 drivers/gpu/host1x/cma.c                    |  116 +++++
 drivers/gpu/host1x/cma.h                    |   43 ++
 drivers/gpu/host1x/dev.c                    |   13 +
 drivers/gpu/host1x/dev.h                    |   59 +++
 drivers/gpu/host1x/host1x.h                 |   29 ++
 drivers/gpu/host1x/hw/cdma_hw.c             |  475 +++++++++++++++++++++
 drivers/gpu/host1x/hw/cdma_hw.h             |   37 ++
 drivers/gpu/host1x/hw/channel_hw.c          |  148 +++++++
 drivers/gpu/host1x/hw/host1x01.c            |    6 +
 drivers/gpu/host1x/hw/host1x01_hardware.h   |  124 ++++++
 drivers/gpu/host1x/hw/hw_host1x01_channel.h |  102 +++++
 drivers/gpu/host1x/hw/hw_host1x01_sync.h    |   12 +
 drivers/gpu/host1x/hw/hw_host1x01_uclass.h  |  168 ++++++++
 drivers/gpu/host1x/hw/syncpt_hw.c           |   10 +
 drivers/gpu/host1x/intr.c                   |   29 +-
 drivers/gpu/host1x/intr.h                   |    6 +
 drivers/gpu/host1x/job.c                    |  612 +++++++++++++++++++++++++++
 drivers/gpu/host1x/job.h                    |  164 +++++++
 drivers/gpu/host1x/memmgr.c                 |  173 ++++++++
 drivers/gpu/host1x/memmgr.h                 |   72 ++++
 drivers/gpu/host1x/syncpt.c                 |   11 +
 drivers/gpu/host1x/syncpt.h                 |    4 +
 include/trace/events/host1x.h               |  211 +++++++++
 29 files changed, 3396 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/host1x/cdma.c
 create mode 100644 drivers/gpu/host1x/cdma.h
 create mode 100644 drivers/gpu/host1x/channel.c
 create mode 100644 drivers/gpu/host1x/channel.h
 create mode 100644 drivers/gpu/host1x/cma.c
 create mode 100644 drivers/gpu/host1x/cma.h
 create mode 100644 drivers/gpu/host1x/host1x.h
 create mode 100644 drivers/gpu/host1x/hw/cdma_hw.c
 create mode 100644 drivers/gpu/host1x/hw/cdma_hw.h
 create mode 100644 drivers/gpu/host1x/hw/channel_hw.c
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x01_channel.h
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x01_uclass.h
 create mode 100644 drivers/gpu/host1x/job.c
 create mode 100644 drivers/gpu/host1x/job.h
 create mode 100644 drivers/gpu/host1x/memmgr.c
 create mode 100644 drivers/gpu/host1x/memmgr.h

Comments

Thierry Reding Feb. 25, 2013, 3:24 p.m. UTC | #1
On Tue, Jan 15, 2013 at 01:43:59PM +0200, Terje Bergstrom wrote:
[...]
> diff --git a/drivers/gpu/host1x/Kconfig b/drivers/gpu/host1x/Kconfig
> index e89fb2b..57680a6 100644
> --- a/drivers/gpu/host1x/Kconfig
> +++ b/drivers/gpu/host1x/Kconfig
> @@ -3,4 +3,27 @@ config TEGRA_HOST1X
>  	help
>  	  Driver for the Tegra host1x hardware.
>  
> -	  Required for enabling tegradrm.
> +	  Required for enabling tegradrm and 2D acceleration.

I don't think I commented on this in the other patches, but I think this
could use a bit more information about what host1x is. Also mentioning
that it is a requirement for tegra-drm and 2D acceleration isn't very
useful because it can equally well be expressed in Kconfig. If you add
some description about what host1x is, people will know that they want
to enable it.

> +if TEGRA_HOST1X
> +
> +config TEGRA_HOST1X_CMA
> +	bool "Support DRM CMA buffers"
> +	depends on DRM
> +	default y
> +	select DRM_GEM_CMA_HELPER
> +	select DRM_KMS_CMA_HELPER
> +	help
> +	  Say yes if you wish to use DRM CMA buffers.
> +
> +	  If unsure, choose Y.

Perhaps make this not user-selectable (for now)? If somebody disables
this explicitly they won't get a working driver, right?

> diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
[...]
> +#include "cdma.h"
> +#include "channel.h"
> +#include "dev.h"
> +#include "memmgr.h"
> +#include "job.h"
> +#include <asm/cacheflush.h>
> +
> +#include <linux/slab.h>
> +#include <linux/kfifo.h>
> +#include <linux/interrupt.h>
> +#include <trace/events/host1x.h>
> +
> +#define TRACE_MAX_LENGTH 128U

"" includes generally follow <> ones.

> +/*
> + * Add an entry to the sync queue.
> + */
> +static void add_to_sync_queue(struct host1x_cdma *cdma,
> +			      struct host1x_job *job,
> +			      u32 nr_slots,
> +			      u32 first_get)
> +{
> +	if (job->syncpt_id == NVSYNCPT_INVALID) {
> +		dev_warn(&job->ch->dev->dev, "%s: Invalid syncpt\n",
> +				__func__);
> +		return;
> +	}
> +
> +	job->first_get = first_get;
> +	job->num_slots = nr_slots;
> +	host1x_job_get(job);
> +	list_add_tail(&job->list, &cdma->sync_queue);
> +}

It's a bit odd that you pass a job in here along with some parameters
that are then assigned to the job's fields. Couldn't you just assign
them to the job's fields before passing the job into this function?

I also see that you only use this function once, so maybe you could
open-code it instead.

> +/*
> + * Return the status of the cdma's sync queue or push buffer for the given event
> + *  - sq empty: returns 1 for empty, 0 for not empty (as in "1 empty queue" :-)
> + *  - pb space: returns the number of free slots in the channel's push buffer
> + * Must be called with the cdma lock held.
> + */
> +static unsigned int cdma_status_locked(struct host1x_cdma *cdma,
> +		enum cdma_event event)
> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	switch (event) {
> +	case CDMA_EVENT_SYNC_QUEUE_EMPTY:
> +		return list_empty(&cdma->sync_queue) ? 1 : 0;
> +	case CDMA_EVENT_PUSH_BUFFER_SPACE: {
> +		struct push_buffer *pb = &cdma->push_buffer;
> +		return host1x->cdma_pb_op.space(pb);
> +	}
> +	default:
> +		return 0;
> +	}
> +}

Similarly this function is only used in one place and it requires a
whole lot of documentation to define the meaning of the return value. If
you implement this functionality directly in host1x_cdma_wait_locked()
you have much more context and don't require all this "protocol".

> +/*
> + * Start timer for a buffer submition that has completed yet.

"submission". And I don't understand the "that has completed yet" part.

> + * Must be called with the cdma lock held.
> + */
> +static void cdma_start_timer_locked(struct host1x_cdma *cdma,
> +		struct host1x_job *job)

You use two different styles to indent the function parameters. You
might want to stick to one, preferably aligning them with the first
parameter on the first line.

> +{
> +	struct host1x *host = cdma_to_host1x(cdma);
> +
> +	if (cdma->timeout.clientid) {
> +		/* timer already started */
> +		return;
> +	}
> +
> +	cdma->timeout.clientid = job->clientid;
> +	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
> +	cdma->timeout.syncpt_val = job->syncpt_end;
> +	cdma->timeout.start_ktime = ktime_get();
> +
> +	schedule_delayed_work(&cdma->timeout.wq,
> +			msecs_to_jiffies(job->timeout));
> +}
> +
> +/*
> + * Stop timer when a buffer submition completes.

"submission"

> +/*
> + * For all sync queue entries that have already finished according to the
> + * current sync point registers:
> + *  - unpin & unref their mems
> + *  - pop their push buffer slots
> + *  - remove them from the sync queue
> + * This is normally called from the host code's worker thread, but can be
> + * called manually if necessary.
> + * Must be called with the cdma lock held.
> + */
> +static void update_cdma_locked(struct host1x_cdma *cdma)
> +{
> +	bool signal = false;
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	struct host1x_job *job, *n;
> +
> +	/* If CDMA is stopped, queue is cleared and we can return */
> +	if (!cdma->running)
> +		return;
> +
> +	/*
> +	 * Walk the sync queue, reading the sync point registers as necessary,
> +	 * to consume as many sync queue entries as possible without blocking
> +	 */
> +	list_for_each_entry_safe(job, n, &cdma->sync_queue, list) {
> +		struct host1x_syncpt *sp = host1x->syncpt + job->syncpt_id;

host1x_syncpt_get()?

> +
> +		/* Check whether this syncpt has completed, and bail if not */
> +		if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
> +			/* Start timer on next pending syncpt */
> +			if (job->timeout)
> +				cdma_start_timer_locked(cdma, job);
> +			break;
> +		}
> +
> +		/* Cancel timeout, when a buffer completes */
> +		if (cdma->timeout.clientid)
> +			stop_cdma_timer_locked(cdma);
> +
> +		/* Unpin the memory */
> +		host1x_job_unpin(job);
> +
> +		/* Pop push buffer slots */
> +		if (job->num_slots) {
> +			struct push_buffer *pb = &cdma->push_buffer;
> +			host1x->cdma_pb_op.pop_from(pb, job->num_slots);
> +			if (cdma->event == CDMA_EVENT_PUSH_BUFFER_SPACE)
> +				signal = true;
> +		}
> +
> +		list_del(&job->list);
> +		host1x_job_put(job);
> +	}
> +
> +	if (list_empty(&cdma->sync_queue) &&
> +				cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
> +			signal = true;

This looks funny, maybe:

	if (cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY &&
	    list_empty(&cdma->sync_queue))
		signal = true;

?

> +
> +	/* Wake up CdmaWait() if the requested event happened */

CdmaWait()? Where's that?

> +	if (signal) {
> +		cdma->event = CDMA_EVENT_NONE;
> +		up(&cdma->sem);
> +	}
> +}
> +
> +void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
> +		struct platform_device *dev)

There's nothing in this function that requires a platform_device, so
passing struct device should be enough. Or maybe host1x_cdma should get
a struct device * field?

> +{
> +	u32 get_restart;

Maybe just call this "restart" or "restart_addr". get_restart sounds
like a function name.

> +	u32 syncpt_incrs;
> +	struct host1x_job *job = NULL;
> +	u32 syncpt_val;
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +
> +	syncpt_val = host1x_syncpt_load_min(cdma->timeout.syncpt);
> +
> +	dev_dbg(&dev->dev,
> +		"%s: starting cleanup (thresh %d)\n",
> +		__func__, syncpt_val);

This fits on two lines.

> +
> +	/*
> +	 * Move the sync_queue read pointer to the first entry that hasn't
> +	 * completed based on the current HW syncpt value. It's likely there
> +	 * won't be any (i.e. we're still at the head), but covers the case
> +	 * where a syncpt incr happens just prior/during the teardown.
> +	 */
> +
> +	dev_dbg(&dev->dev,
> +		"%s: skip completed buffers still in sync_queue\n",
> +		__func__);

This too.

> +	list_for_each_entry(job, &cdma->sync_queue, list) {
> +		if (syncpt_val < job->syncpt_end)
> +			break;
> +
> +		host1x_job_dump(&dev->dev, job);
> +	}

That's potentially a lot of debug output. I wonder if it might make
sense to control parts of this via a module parameter. Then again, if
somebody really needs to debug this, maybe they really want *all* the
information.

> +	/*
> +	 * Walk the sync_queue, first incrementing with the CPU syncpts that
> +	 * are partially executed (the first buffer) or fully skipped while
> +	 * still in the current context (slots are also NOP-ed).
> +	 *
> +	 * At the point contexts are interleaved, syncpt increments must be
> +	 * done inline with the pushbuffer from a GATHER buffer to maintain
> +	 * the order (slots are modified to be a GATHER of syncpt incrs).
> +	 *
> +	 * Note: save in get_restart the location where the timed out buffer
> +	 * started in the PB, so we can start the refetch from there (with the
> +	 * modified NOP-ed PB slots). This lets things appear to have completed
> +	 * properly for this buffer and resources are freed.
> +	 */
> +
> +	dev_dbg(&dev->dev,
> +		"%s: perform CPU incr on pending same ctx buffers\n",
> +		__func__);

Can be collapsed to two lines.

> +
> +	get_restart = cdma->last_put;
> +	if (!list_empty(&cdma->sync_queue))
> +		get_restart = job->first_get;

Perhaps:

	if (list_empty(&cdma->sync_queue))
		restart = cdma->last_put;
	else
		restart = job->first_get;

?

> +	list_for_each_entry_from(job, &cdma->sync_queue, list)
> +		if (job->clientid == cdma->timeout.clientid)
> +			job->timeout = 500;

I think this warrants a comment.

> +/*
> + * Destroy a cdma
> + */
> +void host1x_cdma_deinit(struct host1x_cdma *cdma)
> +{
> +	struct push_buffer *pb = &cdma->push_buffer;
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +
> +	if (cdma->running) {
> +		pr_warn("%s: CDMA still running\n",
> +				__func__);
> +	} else {
> +		host1x->cdma_pb_op.destroy(pb);
> +		host1x->cdma_op.timeout_destroy(cdma);
> +	}
> +}

There's no way to recover from the situation where a cdma is still
running. Can this not return an error code (-EBUSY?) if the cdma can't
be destroyed?

> +/*
> + * End a cdma submit
> + * Kick off DMA, add job to the sync queue, and a number of slots to be freed
> + * from the pushbuffer. The handles for a submit must all be pinned at the same
> + * time, but they can be unpinned in smaller chunks.
> + */
> +void host1x_cdma_end(struct host1x_cdma *cdma,
> +		struct host1x_job *job)
> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	bool was_idle = list_empty(&cdma->sync_queue);

Maybe just "idle"? It reflects the current state of the CDMA, not any
old state.

> +
> +	host1x->cdma_op.kick(cdma);
> +
> +	add_to_sync_queue(cdma,
> +			job,
> +			cdma->slots_used,
> +			cdma->first_get);

No need to split this over so many lines. Also, shouldn't the order be
reversed here? I.e. first add to sync queue, then start DMA?

> +	/* start timer on idle -> active transitions */
> +	if (job->timeout && was_idle)
> +		cdma_start_timer_locked(cdma, job);

This could be part of add_to_sync_queue(), but if you open-code that as
I suggest earlier it should obviously stay.

> diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
[...]
> +struct platform_device;

No need for this if you pass struct device * instead.

> +/*
> + * cdma
> + *
> + * This is in charge of a host command DMA channel.
> + * Sends ops to a push buffer, and takes responsibility for unpinning
> + * (& possibly freeing) of memory after those ops have completed.
> + * Producer:
> + *	begin
> + *		push - send ops to the push buffer
> + *	end - start command DMA and enqueue handles to be unpinned
> + * Consumer:
> + *	update - call to update sync queue and push buffer, unpin memory
> + */

I find the name to be a bit confusing. For some reason I automatically
think of GSM when I read CDMA. This really is more of a job queue, so
maybe calling it host1x_job_queue might be more appropriate. But I've
already requested a lot of things to be renamed, so I think I can live
with this being called CDMA if you don't want to change it.

Alternatively all of these could be moved to the struct host1x_channel
given that there's only one of each of the push_buffer, buffer_timeout
and host1x_cma objects per channel.

> diff --git a/drivers/gpu/host1x/channel.c b/drivers/gpu/host1x/channel.c
[...]
> +#include "channel.h"
> +#include "dev.h"
> +#include "job.h"
> +
> +#include <linux/slab.h>
> +#include <linux/module.h>

Again the include ordering is strange.

> +/*
> + * Iterator function for host1x device list
> + * It takes a fptr as an argument and calls that function for each
> + * device in the list
> + */
> +void host1x_channel_for_all(struct host1x *host1x, void *data,
> +	int (*fptr)(struct host1x_channel *ch, void *fdata))
> +{
> +	struct host1x_channel *ch;
> +	int ret;
> +
> +	list_for_each_entry(ch, &host1x->chlist.list, list) {
> +		if (ch && fptr) {
> +			ret = fptr(ch, data);
> +			if (ret) {
> +				pr_info("%s: iterator error\n", __func__);
> +				break;
> +			}
> +		}
> +	}
> +}

Couldn't you rewrite this as a macro, similar to list_for_each_entry()
so that users could do something like:

	host1x_for_each_channel(channel, host1x) {
		...
	}

That's a bit friendlier than having each user write a separate function
to be called from this iterator.

> +int host1x_channel_submit(struct host1x_job *job)
> +{
> +	return host1x_get_host(job->ch->dev)->channel_op.submit(job);
> +}

I'd expect a function named host1x_channel_submit() to take a struct
host1x_channel *. Should this perhaps be called host1x_job_submit()?

> +struct host1x_channel *host1x_channel_get(struct host1x_channel *ch)
> +{
> +	int err = 0;
> +
> +	mutex_lock(&ch->reflock);
> +	if (ch->refcount == 0)
> +		err = host1x_cdma_init(&ch->cdma);
> +	if (!err)
> +		ch->refcount++;
> +
> +	mutex_unlock(&ch->reflock);
> +
> +	return err ? NULL : ch;
> +}

Why don't you use any of the kernel's reference counting mechanisms?

> +void host1x_channel_put(struct host1x_channel *ch)
> +{
> +	mutex_lock(&ch->reflock);
> +	if (ch->refcount == 1) {
> +		host1x_get_host(ch->dev)->cdma_op.stop(&ch->cdma);
> +		host1x_cdma_deinit(&ch->cdma);
> +	}
> +	ch->refcount--;
> +	mutex_unlock(&ch->reflock);
> +}

I think you can do all of this using a kref.

> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
> +{
> +	struct host1x_channel *ch = NULL;
> +	struct host1x *host1x = host1x_get_host(pdev);
> +	int chindex;
> +	int max_channels = host1x->info.nb_channels;
> +	int err;
> +
> +	mutex_lock(&host1x->chlist_mutex);
> +
> +	chindex = host1x->allocated_channels;
> +	if (chindex > max_channels)
> +		goto fail;
> +
> +	ch = kzalloc(sizeof(*ch), GFP_KERNEL);
> +	if (ch == NULL)
> +		goto fail;
> +
> +	/* Link platform_device to host1x_channel */
> +	err = host1x->channel_op.init(ch, host1x, chindex);
> +	if (err < 0)
> +		goto fail;
> +
> +	ch->dev = pdev;
> +
> +	/* Add to channel list */
> +	list_add_tail(&ch->list, &host1x->chlist.list);
> +
> +	host1x->allocated_channels++;
> +
> +	mutex_unlock(&host1x->chlist_mutex);
> +	return ch;
> +
> +fail:
> +	dev_err(&pdev->dev, "failed to init channel\n");
> +	kfree(ch);
> +	mutex_unlock(&host1x->chlist_mutex);
> +	return NULL;
> +}

I think the critical section could be shorter here. It's probably not
worth the extra trouble, though, given that channels are not often
allocated.

> +void host1x_channel_free(struct host1x_channel *ch)
> +{
> +	struct host1x *host1x = host1x_get_host(ch->dev);
> +	struct host1x_channel *chiter, *tmp;
> +	list_for_each_entry_safe(chiter, tmp, &host1x->chlist.list, list) {
> +		if (chiter == ch) {
> +			list_del(&chiter->list);
> +			kfree(ch);
> +			host1x->allocated_channels--;
> +
> +			return;
> +		}
> +	}
> +}

This doesn't free the channel if it happens to not be part of the host1x
channel list. Perhaps an easier way to write it would be:

	host1x = host1x_get_host(ch->dev);

	list_del(&ch->list);
	kfree(ch);

	host1x->allocated_channels--;

Looking at the rest of the code, it seems like a channel will never not
be part of the host1x channel list, so I don't think there's a need to
to scan the list.

On a side-note: generally if you break out of the loop right after
freeing the memory of a removed node, there's no need to use the _safe
variant since you won't be accessing the .next field of the freed node
anyway.

Maybe these should also adopt a similar naming as what we discussed for
the syncpoints. That is:

	struct host1x_channel *host1x_channel_request(struct device *dev);

?

> diff --git a/drivers/gpu/host1x/channel.h b/drivers/gpu/host1x/channel.h
[...]
> +
> +/*
> + * host1x device list in debug-fs dump of host1x and client device
> + * as well as channel state
> + */

I don't understand this comment.

> +struct host1x_channel {
> +	struct list_head list;
> +
> +	int refcount;
> +	int chid;

This can probably just be id. It is a field of host1x_channel, so the ch
prefix is redundant.

> +	struct mutex reflock;
> +	struct mutex submitlock;
> +	void __iomem *regs;
> +	struct device *node;

This is never used.

> +	struct platform_device *dev;

Can this be just struct device *?

> +	struct cdev cdev;

This is never used.

> +/* channel list operations */
> +void host1x_channel_list_init(struct host1x *);
> +void host1x_channel_for_all(struct host1x *, void *data,
> +	int (*fptr)(struct host1x_channel *ch, void *fdata));
> +
> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev);
> +void host1x_channel_free(struct host1x_channel *ch);

Is it a good idea to make host1x_channel_free() publicly available?
Shouldn't the host1x_channel_alloc()/host1x_channel_request() return a
host1x_channel with a reference count of 1 and everybody release their
reference using host1x_channel_put() to make sure the channel is freed
only after the last reference disappears?

Otherwise whoever calls host1x_channel_free() will confuse everybody
else that's still keeping a reference.

> diff --git a/drivers/gpu/host1x/cma.c b/drivers/gpu/host1x/cma.c
[...]

Various spurious blank lines in this file, and the alignment of function
parameters is off.

> +struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev)

I don't think this needs platform_device either.

> +{
> +	struct drm_gem_cma_object *obj = to_cma_obj((void *)id);
> +	struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
> +
> +	mutex_lock(struct_mutex);
> +	drm_gem_object_reference(&obj->base);
> +	mutex_unlock(struct_mutex);

I think it's more customary to obtain a pointer to struct drm_device and
then use mutex_{lock,unlock}(&drm->struct_mutex). Or you could just use
drm_gem_object_reference_unlocked(&obj->base) instead. Which doesn't
exist yet, apparently. But it could be added.

> +int host1x_cma_pin_array_ids(struct platform_device *dev,
> +		long unsigned *ids,
> +		long unsigned id_type_mask,
> +		long unsigned id_type,
> +		u32 count,
> +		struct host1x_job_unpin_data *unpin_data,
> +		dma_addr_t *phys_addr)

struct device * and unsigned long please. count can also doesn't need to
be a sized type. unsigned int will do just fine. The return value can
also be unsigned int if you don't expect to return any error conditions.

> +{
> +	int i;
> +	int pin_count = 0;

Both should be unsigned as well, and can go on one line:

	unsigned int pin_count = 0, i;

> diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
[...]
>  struct host1x;
> +struct host1x_intr;
>  struct host1x_syncpt;
> +struct host1x_channel;
> +struct host1x_cdma;
> +struct host1x_job;
> +struct push_buffer;
> +struct dentry;

I think this already belongs in a previous patch. The debugfs dentry
isn't added in this patch.

> +struct host1x_channel_ops {
> +	int (*init)(struct host1x_channel *,
> +		    struct host1x *,
> +		    int chid);

Please add the parameter names as well (the same goes for all ops
declared in this file). And "id" will be enough. Also the channel ID can
surely be unsigned, right?

> +struct host1x_cdma_ops {
> +	void (*start)(struct host1x_cdma *);
> +	void (*stop)(struct host1x_cdma *);
> +	void (*kick)(struct  host1x_cdma *);
> +	int (*timeout_init)(struct host1x_cdma *,
> +			    u32 syncpt_id);
> +	void (*timeout_destroy)(struct host1x_cdma *);
> +	void (*timeout_teardown_begin)(struct host1x_cdma *);
> +	void (*timeout_teardown_end)(struct host1x_cdma *,
> +				     u32 getptr);
> +	void (*timeout_cpu_incr)(struct host1x_cdma *,
> +				 u32 getptr,
> +				 u32 syncpt_incrs,
> +				 u32 syncval,
> +				 u32 nr_slots);
> +};

Can the timeout_ prefix not be dropped? The functions are generally
useful and not directly related to timeouts, even though they seem to
only be used during timeout handling.

Also, is it really necessary to abstract these into an ops structure? I
get that newer hardware revisions might require different ops for sync-
point handling because the register layout or number of syncpoints may
be different, but the CDMA and push buffer (below) concepts are pretty
much a software abstraction, and as such its implementation is unlikely
to change with some future hardware revision.

> +struct host1x_pushbuffer_ops {
> +	void (*reset)(struct push_buffer *);
> +	int (*init)(struct push_buffer *);
> +	void (*destroy)(struct push_buffer *);
> +	void (*push_to)(struct push_buffer *,
> +			struct mem_handle *,
> +			u32 op1, u32 op2);
> +	void (*pop_from)(struct push_buffer *,
> +			 unsigned int slots);

Maybe just push() and pop()?

> +	u32 (*space)(struct push_buffer *);
> +	u32 (*putptr)(struct push_buffer *);
> +};
>  
>  struct host1x_syncpt_ops {
>  	void (*reset)(struct host1x_syncpt *);
> @@ -64,9 +111,19 @@ struct host1x {
>  	struct host1x_device_info info;
>  	struct clk *clk;
>  
> +	/* Sync point dedicated to replacing waits for expired fences */
> +	struct host1x_syncpt *nop_sp;
> +
> +	struct host1x_channel_ops channel_op;
> +	struct host1x_cdma_ops cdma_op;
> +	struct host1x_pushbuffer_ops cdma_pb_op;
>  	struct host1x_syncpt_ops syncpt_op;
>  	struct host1x_intr_ops intr_op;
>  
> +	struct mutex chlist_mutex;
> +	struct host1x_channel chlist;

Shouldn't this just be struct list_head?

> +	int allocated_channels;

unsigned int? And maybe just "num_channels"?

> diff --git a/drivers/gpu/host1x/host1x.h b/drivers/gpu/host1x/host1x.h
[...]
> +enum host1x_class {
> +	NV_HOST1X_CLASS_ID		= 0x1,
> +	NV_GRAPHICS_2D_CLASS_ID		= 0x51,

This entry belongs in a later patch, right? And I find it convenient if
enumeration constants start with the enum name as prefix. Furthermore
it'd be nice to reuse the hardware module names, like so:

	enum host1x_class {
		HOST1X_CLASS_HOST1X,
		HOST1X_CLASS_GR2D,
		HOST1X_CLASS_GR3D,
	};

> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
[...]
> +#include <linux/slab.h>
> +#include <linux/scatterlist.h>
> +#include <linux/dma-mapping.h>
> +#include "cdma.h"
> +#include "channel.h"
> +#include "dev.h"
> +#include "memmgr.h"
> +
> +#include "cdma_hw.h"
> +
> +static inline u32 host1x_channel_dmactrl(int stop, int get_rst, int init_get)
> +{
> +	return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop)
> +		| HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst)
> +		| HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);

I think it is more customary to put the | at the end of the preceding
line:

	return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop) |
	       HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst) |
	       HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);

Also since these are all single bits, I'd prefer if you could drop the
_F suffix and not make them take a parameter. I think it'd even be
better not to have this function at all, but make the intent explicit
where the register is written. That is, have each call site set the bits
explicitly instead of calling this helper. Having a parameter list such
as (true, false, false) or (true, true, true) is confusing since you
have to keep looking up the meaning of the parameters.

> +}
> +
> +static void cdma_timeout_handler(struct work_struct *work);

Can this prototype be avoided?

> +/**
> + * Reset to empty push buffer
> + */
> +static void push_buffer_reset(struct push_buffer *pb)
> +{
> +	pb->fence = PUSH_BUFFER_SIZE - 8;
> +	pb->cur = 0;

Maybe position is a better name than cur.

> +/**
> + * Init push buffer resources
> + */
> +static void push_buffer_destroy(struct push_buffer *pb);

You should be careful with these comment blocks. If you start them with
/**, then you should make them proper kerneldoc comments. But you don't
really need that for static functions, so you could just make them /*-
style.

Also this particular comment is confusingly place on top of the proto-
type of push_buffer_destroy().

> +/*
> + * Push two words to the push buffer
> + * Caller must ensure push buffer is not full
> + */
> +static void push_buffer_push_to(struct push_buffer *pb,
> +		struct mem_handle *handle,
> +		u32 op1, u32 op2)
> +{
> +	u32 cur = pb->cur;
> +	u32 *p = (u32 *)((u32)pb->mapped + cur);

You do all this extra casting to make sure to increment by bytes and not
32-bit words. How about you change pb->cur to contain the word index, so
that you don't have to go through hoops each time around.

Alternatively you could make it a pointer to u32 and not have to index
or cast at all. So you'd end up with something like:

	struct push_buffer {
		u32 *start;
		u32 *end;
		u32 *ptr;
	};

> +/*
> + * Return the number of two word slots free in the push buffer
> + */
> +static u32 push_buffer_space(struct push_buffer *pb)
> +{
> +	return ((pb->fence - pb->cur) & (PUSH_BUFFER_SIZE - 1)) / 8;
> +}

Why & (PUSH_BUFFER_SIZE - 1) here? fence - cur can never be larger than
PUSH_BUFFER_SIZE, can it?

> +/*
> + * Init timeout resources
> + */
> +static int cdma_timeout_init(struct host1x_cdma *cdma,
> +				 u32 syncpt_id)
> +{
> +	if (syncpt_id == NVSYNCPT_INVALID)
> +		return -EINVAL;

Do we really need the syncpt_id check here? It is the only reason why we
need to pass the parameter in the first place, and if we get to this
point we should already have made sure that the syncpoint is actually
valid.

> +/*
> + * Increment timedout buffer's syncpt via CPU.

Nit: "timed out buffer's"

> + */
> +static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
> +				u32 syncpt_incrs, u32 syncval, u32 nr_slots)

The syncval parameter isn't used.

> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	struct push_buffer *pb = &cdma->push_buffer;
> +	u32 i, getidx;
> +
> +	for (i = 0; i < syncpt_incrs; i++)
> +		host1x_syncpt_cpu_incr(cdma->timeout.syncpt);
> +
> +	/* after CPU incr, ensure shadow is up to date */
> +	host1x_syncpt_load_min(cdma->timeout.syncpt);
> +
> +	/* NOP all the PB slots */
> +	getidx = getptr - pb->phys;
> +	while (nr_slots--) {
> +		u32 *p = (u32 *)((u32)pb->mapped + getidx);
> +		*(p++) = HOST1X_OPCODE_NOOP;
> +		*(p++) = HOST1X_OPCODE_NOOP;
> +		dev_dbg(&host1x->dev->dev, "%s: NOP at 0x%x\n",
> +			__func__, pb->phys + getidx);
> +		getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
> +	}
> +	wmb();

Why the memory barrier?

> +/*
> + * Similar to cdma_start(), but rather than starting from an idle
> + * state (where DMA GET is set to DMA PUT), on a timeout we restore
> + * DMA GET from an explicit value (so DMA may again be pending).
> + */
> +static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +
> +	if (cdma->running)
> +		return;
> +
> +	cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
> +
> +	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
> +		HOST1X_CHANNEL_DMACTRL);
> +
> +	/* set base, end pointer (all of memory) */
> +	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
> +	host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);

According to the TRM, writing to HOST1X_CHANNEL_DMASTART will start a
DMA transfer on the channel (if DMA_PUT != DMA_GET). Irrespective of
that, why set the valid range to all of physical memory? We know the
valid range of the push buffer, why not set the limits accordingly?

> +/*
> + * Kick channel DMA into action by writing its PUT offset (if it has changed)
> + */
> +static void cdma_kick(struct host1x_cdma *cdma)
> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +	u32 put;
> +
> +	put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
> +
> +	if (put != cdma->last_put) {
> +		host1x_ch_writel(ch, put, HOST1X_CHANNEL_DMAPUT);
> +		cdma->last_put = put;
> +	}
> +}

kick() sounds unusual. Maybe flush or commit or something similar would
be more accurate.

> +static void cdma_stop(struct host1x_cdma *cdma)
> +{
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +
> +	mutex_lock(&cdma->lock);
> +	if (cdma->running) {
> +		host1x_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_EMPTY);
> +		host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
> +			HOST1X_CHANNEL_DMACTRL);
> +		cdma->running = false;
> +	}
> +	mutex_unlock(&cdma->lock);
> +}

Perhaps this should be ranem cdma_stop_sync() or similar to make it
clear that it waits for the queue to run empty.

> +static void cdma_timeout_teardown_end(struct host1x_cdma *cdma, u32 getptr)

Maybe the last parameter should be called restart to match its purpose?

> +{
> +	struct host1x *host1x = cdma_to_host1x(cdma);
> +	struct host1x_channel *ch = cdma_to_channel(cdma);
> +	u32 cmdproc_stop;
> +
> +	dev_dbg(&host1x->dev->dev,
> +		"end channel teardown (id %d, DMAGET restart = 0x%x)\n",
> +		ch->chid, getptr);
> +
> +	cmdproc_stop = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
> +	cmdproc_stop &= ~(BIT(ch->chid));

No need for the extra parentheses.

> +	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
> +
> +	cdma->torndown = false;
> +	cdma_timeout_restart(cdma, getptr);
> +}

I find this a bit non-intuitive. We teardown a channel, and when we're
done tearing down, the torndown variable is set to false and the channel
is actually restarted. Maybe you could explain some more how this works
and what its purpose is.

> +/*
> + * If this timeout fires, it indicates the current sync_queue entry has
> + * exceeded its TTL and the userctx should be timed out and remaining
> + * submits already issued cleaned up (future submits return an error).
> + */

I can't seem to find what causes subsequent submits to return an error.
Also, how is the channel reset so that new jobs can be submitted?

> +static void cdma_timeout_handler(struct work_struct *work)
> +{
> +	struct host1x_cdma *cdma;
> +	struct host1x *host1x;
> +	struct host1x_channel *ch;
> +
> +	u32 syncpt_val;
> +
> +	u32 prev_cmdproc, cmdproc_stop;
> +
> +	cdma = container_of(to_delayed_work(work), struct host1x_cdma,
> +			    timeout.wq);
> +	host1x = cdma_to_host1x(cdma);
> +	ch = cdma_to_channel(cdma);
> +
> +	mutex_lock(&cdma->lock);
> +
> +	if (!cdma->timeout.clientid) {
> +		dev_dbg(&host1x->dev->dev,
> +			 "cdma_timeout: expired, but has no clientid\n");
> +		mutex_unlock(&cdma->lock);
> +		return;
> +	}

How can the CDMA not have a client?

> +
> +	/* stop processing to get a clean snapshot */
> +	prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
> +	cmdproc_stop = prev_cmdproc | BIT(ch->chid);
> +	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
> +
> +	dev_dbg(&host1x->dev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
> +		prev_cmdproc, cmdproc_stop);
> +
> +	syncpt_val = host1x_syncpt_load_min(host1x->syncpt);
> +
> +	/* has buffer actually completed? */
> +	if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
> +		dev_dbg(&host1x->dev->dev,
> +			 "cdma_timeout: expired, but buffer had completed\n");

Maybe this should really be a warning?

> +		/* restore */
> +		cmdproc_stop = prev_cmdproc & ~(BIT(ch->chid));

No need for the extra parentheses. Also, why not just use prev_cmdproc,
which shouldn't have the bit set anyway?

> diff --git a/drivers/gpu/host1x/hw/cdma_hw.h b/drivers/gpu/host1x/hw/cdma_hw.h
[...]
> +/*
> + * Size of the sync queue. If it is too small, we won't be able to queue up
> + * many command buffers. If it is too large, we waste memory.
> + */
> +#define HOST1X_SYNC_QUEUE_SIZE 512

I don't see this used anywhere.

> +/*
> + * Number of gathers we allow to be queued up per channel. Must be a
> + * power of two. Currently sized such that pushbuffer is 4KB (512*8B).
> + */
> +#define HOST1X_GATHER_QUEUE_SIZE 512

More pieces falling into place.

> diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
[...]
> +#include "host1x.h"
> +#include "channel.h"
> +#include "dev.h"
> +#include <linux/slab.h>
> +#include "intr.h"
> +#include "job.h"
> +#include <trace/events/host1x.h>

More include ordering issues.

> +static void submit_gathers(struct host1x_job *job)
> +{
> +	/* push user gathers */
> +	int i;

unsigned int?

> +	for (i = 0 ; i < job->num_gathers; i++) {
> +		struct host1x_job_gather *g = &job->gathers[i];
> +		u32 op1 = host1x_opcode_gather(g->words);
> +		u32 op2 = g->mem_base + g->offset;
> +		host1x_cdma_push_gather(&job->ch->cdma,
> +				job->gathers[i].ref,
> +				job->gathers[i].offset,
> +				op1, op2);
> +	}
> +}

Perhaps inline this into channel_submit()? I'm not sure how useful it
really is to split off smallish functions such as this which aren't
reused anywhere else. I don't have any major objection though, so you
can keep it separate if you want.

> +static inline void __iomem *host1x_channel_regs(void __iomem *p, int ndx)
> +{
> +	p += ndx * NV_HOST1X_CHANNEL_MAP_SIZE_BYTES;
> +	return p;
> +}
> +
> +static int host1x_channel_init(struct host1x_channel *ch,
> +	struct host1x *dev, int index)
> +{
> +	ch->chid = index;
> +	mutex_init(&ch->reflock);
> +	mutex_init(&ch->submitlock);
> +
> +	ch->regs = host1x_channel_regs(dev->regs, index);
> +	return 0;
> +}

You only use host1x_channel_regs() once, so I really don't think it buys
you anything to split it off. Both host1x_channel_regs() and
host1x_channel_init() are short enough that they can be collapsed.

> diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
[...]
>  #include "hw/host1x01.h"
>  #include "dev.h"
> +#include "channel.h"
>  #include "hw/host1x01_hardware.h"
>  
> +#include "hw/channel_hw.c"
> +#include "hw/cdma_hw.c"
>  #include "hw/syncpt_hw.c"
>  #include "hw/intr_hw.c"
>  
>  int host1x01_init(struct host1x *host)
>  {
> +	host->channel_op = host1x_channel_ops;
> +	host->cdma_op = host1x_cdma_ops;
> +	host->cdma_pb_op = host1x_pushbuffer_ops;
>  	host->syncpt_op = host1x_syncpt_ops;
>  	host->intr_op = host1x_intr_ops;

I think I mentioned this before, but I'd prefer not to have the .c files
included here, but rather reference the ops structures externally. But I
still think that especially CDMA and push buffer ops don't need to be in
separate structures since they aren't likely to change with new hardware
revisions.

> diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
[...]
> index c1d5324..03873c0 100644
> --- a/drivers/gpu/host1x/hw/host1x01_hardware.h
> +++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
> @@ -21,6 +21,130 @@
>  
>  #include <linux/types.h>
>  #include <linux/bitops.h>
> +#include "hw_host1x01_channel.h"
>  #include "hw_host1x01_sync.h"
> +#include "hw_host1x01_uclass.h"
> +
> +/* channel registers */
> +#define NV_HOST1X_CHANNEL_MAP_SIZE_BYTES 16384

The only user of this seems to be host1x_channel_regs(), so it could be
moved to that file. Also the name is overly long, why not something like
HOST1X_CHANNEL_SIZE?

> +#define HOST1X_OPCODE_NOOP host1x_opcode_nonincr(0, 0)

HOST1X_OPCODE_NOP would be more canonical in my opinion.


> +static inline u32 host1x_mask2(unsigned x, unsigned y)
> +{
> +	return 1 | (1 << (y - x));
> +}

What's this? I don't see it used anywhere.

> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
[...]
> +#define HOST1X_CHANNEL_DMACTRL_DMASTOP_F(v) \
> +	host1x_channel_dmactrl_dmastop_f(v)

I mentioned this elsewhere already, but I think the _F suffix (and _f
for that matter) along with the v parameter should go away.

> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
[...]

What does the "uclass" stand for? It seems a bit useless to me.

> diff --git a/drivers/gpu/host1x/hw/syncpt_hw.c b/drivers/gpu/host1x/hw/syncpt_hw.c
> index 16e3ada..ba48cee 100644
> --- a/drivers/gpu/host1x/hw/syncpt_hw.c
> +++ b/drivers/gpu/host1x/hw/syncpt_hw.c
> @@ -97,6 +97,15 @@ static void syncpt_cpu_incr(struct host1x_syncpt *sp)
>  	wmb();
>  }
>  
> +/* remove a wait pointed to by patch_addr */
> +static int syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
> +{
> +	u32 override = host1x_class_host_wait_syncpt(
> +			NVSYNCPT_GRAPHICS_HOST, 0);
> +	__raw_writel(override, patch_addr);

__raw_writel() isn't meant to be used for regular memory addresses, but
only for MMIO addresses. patch_addr will be a kernel virtual address to
an location in RAM, so you can just treat it as a normal pointer, so:

	*(u32 *)patch_addr = override;

A small optimization might be to make override a static const, so that
it doesn't have to be composed every time.

> diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c
[...]
> +static void action_submit_complete(struct host1x_waitlist *waiter)
> +{
> +	struct host1x_channel *channel = waiter->data;
> +	int nr_completed = waiter->count;

No need for this variable.

> diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
[...]
> +#ifdef CONFIG_TEGRA_HOST1X_FIREWALL
> +static int host1x_firewall = 1;
> +#else
> +static int host1x_firewall;
> +#endif

You could use IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) in the code,
which will have the nice side-effect of compiling code out if the symbol
isn't selected.

> +struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
> +		u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks)

Maybe make the parameters unsigned int instead of u32?

> +{
> +	struct host1x_job *job = NULL;
> +	int num_unpins = num_cmdbufs + num_relocs;

unsigned int?

> +	s64 total;

This doesn't need to be signed, u64 will be good enough. None of the
terms in the expression that assigns to total can be negative.

> +	void *mem;
> +
> +	/* Check that we're not going to overflow */
> +	total = sizeof(struct host1x_job)
> +			+ num_relocs * sizeof(struct host1x_reloc)
> +			+ num_unpins * sizeof(struct host1x_job_unpin_data)
> +			+ num_waitchks * sizeof(struct host1x_waitchk)
> +			+ num_cmdbufs * sizeof(struct host1x_job_gather)
> +			+ num_unpins * sizeof(dma_addr_t)
> +			+ num_unpins * sizeof(u32 *);

"+"s at the end of the preceding lines.

> +	if (total > ULONG_MAX)
> +		return NULL;
> +
> +	mem = job = kzalloc(total, GFP_KERNEL);
> +	if (!job)
> +		return NULL;
> +
> +	kref_init(&job->ref);
> +	job->ch = ch;
> +
> +	/* First init state to zero */
> +
> +	/*
> +	 * Redistribute memory to the structs.
> +	 * Overflows and negative conditions have
> +	 * already been checked in job_alloc().
> +	 */

The last two lines don't really apply here. The checks are in this same
function and they check only for overflow, not negative conditions,
which can't happen anyway since the counts are all unsigned.

> +void host1x_job_get(struct host1x_job *job)
> +{
> +	kref_get(&job->ref);
> +}

I think it is common for *_get() functions to return a pointer to the
referenced object.

> +void host1x_job_add_gather(struct host1x_job *job,
> +		u32 mem_id, u32 words, u32 offset)
> +{
> +	struct host1x_job_gather *cur_gather =
> +			&job->gathers[job->num_gathers];

Should this check for overflow?

> +/*
> + * Check driver supplied waitchk structs for syncpt thresholds
> + * that have already been satisfied and NULL the comparison (to
> + * avoid a wrap condition in the HW).
> + */
> +static int do_waitchks(struct host1x_job *job, struct host1x *host,
> +		u32 patch_mem, struct mem_handle *h)
> +{
> +	int i;
> +
> +	/* compare syncpt vs wait threshold */
> +	for (i = 0; i < job->num_waitchk; i++) {
> +		struct host1x_waitchk *wait = &job->waitchk[i];
> +		struct host1x_syncpt *sp =
> +			host1x_syncpt_get(host, wait->syncpt_id);
> +
> +		/* validate syncpt id */
> +		if (wait->syncpt_id > host1x_syncpt_nb_pts(host))
> +			continue;
> +
> +		/* skip all other gathers */
> +		if (patch_mem != wait->mem)
> +			continue;
> +
> +		trace_host1x_syncpt_wait_check(wait->mem, wait->offset,
> +				wait->syncpt_id, wait->thresh,
> +				host1x_syncpt_read_min(sp));
> +		if (host1x_syncpt_is_expired(
> +			host1x_syncpt_get(host, wait->syncpt_id),
> +			wait->thresh)) {

You already have the sp variable that you could use here to make it more
readable.

> +			struct host1x_syncpt *sp =
> +				host1x_syncpt_get(host, wait->syncpt_id);

And you don't need this then, since you already have sp pointing to the
same syncpoint.

> +			void *patch_addr = NULL;
> +
> +			/*
> +			 * NULL an already satisfied WAIT_SYNCPT host method,
> +			 * by patching its args in the command stream. The
> +			 * method data is changed to reference a reserved
> +			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
> +			 * syncpt with a matching threshold value of 0, so
> +			 * is guaranteed to be popped by the host HW.
> +			 */
> +			dev_dbg(&host->dev->dev,
> +			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
> +			    wait->syncpt_id, sp->name, wait->thresh,
> +			    host1x_syncpt_read_min(sp));
> +
> +			/* patch the wait */
> +			patch_addr = host1x_memmgr_kmap(h,
> +					wait->offset >> PAGE_SHIFT);
> +			if (patch_addr) {
> +				host1x_syncpt_patch_wait(sp,
> +					(patch_addr +
> +						(wait->offset & ~PAGE_MASK)));
> +				host1x_memmgr_kunmap(h,
> +						wait->offset >> PAGE_SHIFT,
> +						patch_addr);
> +			} else {
> +				pr_err("Couldn't map cmdbuf for wait check\n");
> +			}

This is a case where splitting out a small function would actually be
useful to make the code more readable since you can remove two levels of
indentation. You can just pass in the handle and the offset, let it do
the actual patching. Maybe

	host1x_syncpt_patch_offset(sp, h, wait->offset);

?

> +		}
> +
> +		wait->mem = 0;
> +	}
> +	return 0;
> +}
> +
> +

There's a gratuitous blank line.

> +static int pin_job_mem(struct host1x_job *job)
> +{
> +	int i;
> +	int count = 0;
> +	int result;

These (and the return value) can all be unsigned int.

> +static int do_relocs(struct host1x_job *job,
> +		u32 cmdbuf_mem, struct mem_handle *h)
> +{
> +	int i = 0;

This can also be unsigned int.

> +	int last_page = -1;

And this should match the type of cmdbuf_offset (u32). You can initially
set it to something like ~0 to make sure it doesn't match any valid
offset.

> +	void *cmdbuf_page_addr = NULL;
> +
> +	/* pin & patch the relocs for one gather */
> +	while (i < job->num_relocs) {
> +		struct host1x_reloc *reloc = &job->relocarray[i];
> +
> +		/* skip all other gathers */
> +		if (cmdbuf_mem != reloc->cmdbuf_mem) {
> +			i++;
> +			continue;
> +		}
> +
> +		if (last_page != reloc->cmdbuf_offset >> PAGE_SHIFT) {
> +			if (cmdbuf_page_addr)
> +				host1x_memmgr_kunmap(h,
> +						last_page, cmdbuf_page_addr);
> +
> +			cmdbuf_page_addr = host1x_memmgr_kmap(h,
> +					reloc->cmdbuf_offset >> PAGE_SHIFT);
> +			last_page = reloc->cmdbuf_offset >> PAGE_SHIFT;
> +
> +			if (unlikely(!cmdbuf_page_addr)) {
> +				pr_err("Couldn't map cmdbuf for relocation\n");
> +				return -ENOMEM;
> +			}
> +		}
> +
> +		__raw_writel(
> +			(job->reloc_addr_phys[i] +
> +				reloc->target_offset) >> reloc->shift,
> +			(cmdbuf_page_addr +
> +				(reloc->cmdbuf_offset & ~PAGE_MASK)));

Again, wrong __raw_writel() usage.

> +
> +		/* remove completed reloc from the job */
> +		if (i != job->num_relocs - 1) {
> +			struct host1x_reloc *reloc_last =
> +				&job->relocarray[job->num_relocs - 1];
> +			reloc->cmdbuf_mem	= reloc_last->cmdbuf_mem;
> +			reloc->cmdbuf_offset	= reloc_last->cmdbuf_offset;
> +			reloc->target		= reloc_last->target;
> +			reloc->target_offset	= reloc_last->target_offset;
> +			reloc->shift		= reloc_last->shift;
> +			job->reloc_addr_phys[i] =
> +				job->reloc_addr_phys[job->num_relocs - 1];
> +			job->num_relocs--;
> +		} else {
> +			break;
> +		}
> +	}
> +
> +	if (cmdbuf_page_addr)
> +		host1x_memmgr_kunmap(h, last_page, cmdbuf_page_addr);
> +
> +	return 0;
> +}

Also the algorithm seems a bit strange and hard to follow. Instead of
removing relocs from the job, replacing them with the last entry and
decrementing job->num_relocs, how much is the penalty for always
iterating over all relocs? This is one of the other cases where I'd
argue that simplicity is key. Furthermore you need to copy quite a bit
of data to replace the completed relocs, so I'm not sure it buys you
much.

It could always be optimized later on by just setting a bit in the reloc
to mark it as completed, or keep a bitmask of completed relocations or
whatever.

> +static int check_reloc(struct host1x_reloc *reloc,
> +		u32 cmdbuf_id, int offset)

offset can be unsigned int.

> +{
> +	int err = 0;
> +	if (reloc->cmdbuf_mem != cmdbuf_id
> +			|| reloc->cmdbuf_offset != offset * sizeof(u32))
> +		err = -EINVAL;
> +
> +	return err;
> +}

More canonically:

	offset *= sizeof(u32);

	if (reloc->cmdbuf_mem != cmdbuf_id || reloc->cmdbuf_offset != offset)
		return -EINVAL;

	return 0;

> +
> +static int check_mask(struct host1x_job *job,
> +		struct platform_device *pdev,
> +		struct host1x_reloc **reloc, int *num_relocs,
> +		u32 cmdbuf_id, int *offset,
> +		u32 *words, u32 class, u32 reg, u32 mask)

num_relocs and offset can be unsigned int *.

Same comment for the other check_*() functions. That said I think the
code would become a lot more readable if you were to wrap all of these
parameters into a structure, say host1x_firewall, and just pass that
into the functions.

> +static inline int copy_gathers(struct host1x_job *job,
> +		struct platform_device *pdev)

struct device *

> +{
> +	size_t size = 0;
> +	size_t offset = 0;
> +	int i;
> +
> +	for (i = 0; i < job->num_gathers; i++) {
> +		struct host1x_job_gather *g = &job->gathers[i];
> +		size += g->words * sizeof(u32);
> +	}
> +
> +	job->gather_copy_mapped = dma_alloc_writecombine(&pdev->dev,
> +			size, &job->gather_copy, GFP_KERNEL);
> +	if (IS_ERR(job->gather_copy_mapped)) {

dma_alloc_writecombine() returns NULL on failure, so this check is
wrong.

> +		int err = PTR_ERR(job->gather_copy_mapped);
> +		job->gather_copy_mapped = NULL;
> +		return err;
> +	}
> +
> +	job->gather_copy_size = size;
> +
> +	for (i = 0; i < job->num_gathers; i++) {
> +		struct host1x_job_gather *g = &job->gathers[i];
> +		void *gather = host1x_memmgr_mmap(g->ref);
> +		memcpy(job->gather_copy_mapped + offset,
> +				gather + g->offset,
> +				g->words * sizeof(u32));
> +
> +		g->mem_base = job->gather_copy;
> +		g->offset = offset;
> +		g->mem_id = 0;
> +		g->ref = 0;
> +
> +		host1x_memmgr_munmap(g->ref, gather);
> +		offset += g->words * sizeof(u32);
> +	}
> +
> +	return 0;
> +}

I wonder, where's this DMA buffer actually used? I can't find any use
between this copy and the corresponding dma_free_writecombine() call.

> +int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev)
> +{
> +	int err = 0, i = 0, j = 0;

No need to initialize these here. i and j can also be unsigned.

> +	struct host1x *host = host1x_get_host(pdev);
> +	DECLARE_BITMAP(waitchk_mask, host1x_syncpt_nb_pts(host));
> +
> +	bitmap_zero(waitchk_mask, host1x_syncpt_nb_pts(host));
> +	for (i = 0; i < job->num_waitchk; i++) {
> +		u32 syncpt_id = job->waitchk[i].syncpt_id;
> +		if (syncpt_id < host1x_syncpt_nb_pts(host))
> +			set_bit(syncpt_id, waitchk_mask);
> +	}
> +
> +	/* get current syncpt values for waitchk */
> +	for_each_set_bit(i, &waitchk_mask[0], sizeof(waitchk_mask))
> +		host1x_syncpt_load_min(host->syncpt + i);
> +
> +	/* pin memory */
> +	err = pin_job_mem(job);
> +	if (err <= 0)
> +		goto out;

pin_job_mem() never returns negative.

> +	/* patch gathers */
> +	for (i = 0; i < job->num_gathers; i++) {
> +		struct host1x_job_gather *g = &job->gathers[i];
> +
> +		/* process each gather mem only once */
> +		if (!g->ref) {
> +			g->ref = host1x_memmgr_get(g->mem_id, job->ch->dev);
> +			if (IS_ERR(g->ref)) {

host1x_memmgr_get() also seems to return NULL on error.

> +				err = PTR_ERR(g->ref);
> +				g->ref = NULL;
> +				break;
> +			}
> +
> +			g->mem_base = job->gather_addr_phys[i];
> +
> +			for (j = 0; j < job->num_gathers; j++) {
> +				struct host1x_job_gather *tmp =
> +					&job->gathers[j];
> +				if (!tmp->ref && tmp->mem_id == g->mem_id) {
> +					tmp->ref = g->ref;
> +					tmp->mem_base = g->mem_base;
> +				}
> +			}
> +			err = 0;
> +			if (host1x_firewall)

if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL))

> +				err = validate(job, pdev, g);
> +			if (err)
> +				dev_err(&pdev->dev,
> +					"Job validate returned %d\n", err);
> +			if (!err)
> +				err = do_relocs(job, g->mem_id,  g->ref);
> +			if (!err)
> +				err = do_waitchks(job, host,
> +						g->mem_id, g->ref);
> +			host1x_memmgr_put(g->ref);
> +			if (err)
> +				break;
> +		}
> +	}
> +
> +	if (host1x_firewall && !err) {

And here.

> +/*
> + * Debug routine used to dump job entries
> + */
> +void host1x_job_dump(struct device *dev, struct host1x_job *job)
> +{
> +	dev_dbg(dev, "    SYNCPT_ID   %d\n",
> +		job->syncpt_id);
> +	dev_dbg(dev, "    SYNCPT_VAL  %d\n",
> +		job->syncpt_end);
> +	dev_dbg(dev, "    FIRST_GET   0x%x\n",
> +		job->first_get);
> +	dev_dbg(dev, "    TIMEOUT     %d\n",
> +		job->timeout);
> +	dev_dbg(dev, "    NUM_SLOTS   %d\n",
> +		job->num_slots);
> +	dev_dbg(dev, "    NUM_HANDLES %d\n",
> +		job->num_unpins);
> +}

These don't need to be wrapped.

> diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h
[...]
> +struct host1x_job_gather {
> +	u32 words;
> +	dma_addr_t mem_base;
> +	u32 mem_id;
> +	int offset;
> +	struct mem_handle *ref;
> +};
> +
> +struct host1x_cmdbuf {
> +	__u32 mem;
> +	__u32 offset;
> +	__u32 words;
> +	__u32 pad;
> +};
> +
> +struct host1x_reloc {
> +	__u32 cmdbuf_mem;
> +	__u32 cmdbuf_offset;
> +	__u32 target;
> +	__u32 target_offset;
> +	__u32 shift;
> +	__u32 pad;
> +};
> +
> +struct host1x_waitchk {
> +	__u32 mem;
> +	__u32 offset;
> +	__u32 syncpt_id;
> +	__u32 thresh;
> +};

None of these are shared with userspace, so they shouldn't take the
__u32 types, but the regular u32 ones.

> +/*
> + * Each submit is tracked as a host1x_job.
> + */
> +struct host1x_job {
> +	/* When refcount goes to zero, job can be freed */
> +	struct kref ref;
> +
> +	/* List entry */
> +	struct list_head list;
> +
> +	/* Channel where job is submitted to */
> +	struct host1x_channel *ch;

Maybe write it out as "channel"?

> +
> +	int clientid;

Subsequent patches assign u32 to this field, so maybe the type should be
changed here. And maybe leave out the id suffix. It doesn't really add
any information.

> +	/* Gathers and their memory */
> +	struct host1x_job_gather *gathers;
> +	int num_gathers;

unsigned int

> +	/* Wait checks to be processed at submit time */
> +	struct host1x_waitchk *waitchk;
> +	int num_waitchk;

unsigned int

> +	u32 waitchk_mask;

This might need to be changed to a bitfield once future Tegra versions
start supporting more than 32 syncpoints.

> +	/* Array of handles to be pinned & unpinned */
> +	struct host1x_reloc *relocarray;
> +	int num_relocs;

unsigned int

> +	struct host1x_job_unpin_data *unpins;
> +	int num_unpins;

unsigned int

> +	dma_addr_t *addr_phys;
> +	dma_addr_t *gather_addr_phys;
> +	dma_addr_t *reloc_addr_phys;
> +
> +	/* Sync point id, number of increments and end related to the submit */
> +	u32 syncpt_id;
> +	u32 syncpt_incrs;
> +	u32 syncpt_end;
> +
> +	/* Maximum time to wait for this job */
> +	int timeout;

unsigned int. I think we discussed this already in a slightly different
context in patch 2.

> +	/* Null kickoff prevents submit from being sent to hardware */
> +	bool null_kickoff;

I don't think this is used anywhere.

> +	/* Index and number of slots used in the push buffer */
> +	int first_get;
> +	int num_slots;

unsigned int

> +
> +	/* Copy of gathers */
> +	size_t gather_copy_size;
> +	dma_addr_t gather_copy;
> +	u8 *gather_copy_mapped;

Are these really needed? They don't seem to be used anywhere except to
store a copy and free that copy sometime later.

> +
> +	/* Temporary space for unpin ids */
> +	long unsigned int *pin_ids;

unsigned long

> +	/* Check if register is marked as an address reg */
> +	int (*is_addr_reg)(struct platform_device *dev, u32 reg, u32 class);

is_addr_reg() sounds a bit unusual. Maybe match this to the name of the
main firewall routine, validate()?

> +	/* Request a SETCLASS to this class */
> +	u32 class;
> +
> +	/* Add a channel wait for previous ops to complete */
> +	u32 serialize;

This is used in code as a boolean. Why does it need to be 32 bits?

> diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
[...]
> +struct mem_handle;
> +struct platform_device;
> +
> +struct host1x_job_unpin_data {
> +	struct mem_handle *h;
> +	struct sg_table *mem;
> +};
> +
> +enum mem_mgr_flag {
> +	mem_mgr_flag_uncacheable = 0,
> +	mem_mgr_flag_write_combine = 1,
> +};

I'd like to see this use a more object-oriented approach and more common
terminology. All of these handles are essentially buffer objects, so
maybe something like host1x_bo would be a nice and short name.

To make this more object-oriented, I propose something like:

	struct host1x_bo_ops {
		int (*alloc)(struct host1x_bo *bo, size_t size, unsigned long align,
			     unsigned long flags);
		int (*free)(struct host1x_bo *bo);
		...
	};

	struct host1x_bo {
		const struct host1x_bo_ops *ops;
	};

	struct host1x_cma_bo {
		struct host1x_bo base;
		struct drm_gem_cma_object *obj;
	};

	static inline struct host1x_cma_bo *to_host1x_cma_bo(struct host1x_bo *bo)
	{
		return container_of(bo, struct host1x_cma_bo, base);
	}

	static inline int host1x_bo_alloc(struct host1x_bo *bo, size_t size,
					  unsigned long align, unsigned long flags)
	{
		return bo->ops->alloc(bo, size, align, flags);
	}

	...

That should be easy to extend with a new type of BO once the IOMMU-based
allocator is ready. And as I said it is much closer in terminology to
what other drivers do.

> diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
> index b46d044..255a3a3 100644
> --- a/drivers/gpu/host1x/syncpt.h
> +++ b/drivers/gpu/host1x/syncpt.h
> @@ -26,6 +26,7 @@
>  struct host1x;
>  
>  #define NVSYNCPT_INVALID			(-1)
> +#define NVSYNCPT_GRAPHICS_HOST			0

I think these should match other naming, so:

	#define HOST1X_SYNCPT_INVALID	-1
	#define HOST1X_SYNCPT_HOST1X	 0

There are a few more occurrences where platform_device is used but I
haven't commented on them. I don't think any of them won't work with
just a struct device instead. Also I may not have caught all of the
places where you should rather be using unsigned int instead of int,
so you might want to look out for some of those.

Generally I very much like where this is going. Are there any plans to
move the userspace binary driver to this interface at some point so we
can more actively test it? Also, is anything else blocking adding a
gr3d device similar to gr2d from this patch series?

Thierry
Terje Bergstrom Feb. 26, 2013, 9:48 a.m. UTC | #2
On 25.02.2013 17:24, Thierry Reding wrote:
> * PGP Signed by an unknown key
> 
> On Tue, Jan 15, 2013 at 01:43:59PM +0200, Terje Bergstrom wrote:
> [...]
>> diff --git a/drivers/gpu/host1x/Kconfig b/drivers/gpu/host1x/Kconfig
>> index e89fb2b..57680a6 100644
>> --- a/drivers/gpu/host1x/Kconfig
>> +++ b/drivers/gpu/host1x/Kconfig
>> @@ -3,4 +3,27 @@ config TEGRA_HOST1X
>>       help
>>         Driver for the Tegra host1x hardware.
>>
>> -       Required for enabling tegradrm.
>> +       Required for enabling tegradrm and 2D acceleration.
> 
> I don't think I commented on this in the other patches, but I think this
> could use a bit more information about what host1x is. Also mentioning
> that it is a requirement for tegra-drm and 2D acceleration isn't very
> useful because it can equally well be expressed in Kconfig. If you add
> some description about what host1x is, people will know that they want
> to enable it.

Ok, we'll rewrite that. I think we can reuse the text from commit msg
that I stole from Stephen's appnote.

> 
>> +if TEGRA_HOST1X
>> +
>> +config TEGRA_HOST1X_CMA
>> +     bool "Support DRM CMA buffers"
>> +     depends on DRM
>> +     default y
>> +     select DRM_GEM_CMA_HELPER
>> +     select DRM_KMS_CMA_HELPER
>> +     help
>> +       Say yes if you wish to use DRM CMA buffers.
>> +
>> +       If unsure, choose Y.
> 
> Perhaps make this not user-selectable (for now)? If somebody disables
> this explicitly they won't get a working driver, right?

True, there's no alternative, so it should not be user selectable.

> 
>> diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
> [...]
>> +#include "cdma.h"
>> +#include "channel.h"
>> +#include "dev.h"
>> +#include "memmgr.h"
>> +#include "job.h"
>> +#include <asm/cacheflush.h>
>> +
>> +#include <linux/slab.h>
>> +#include <linux/kfifo.h>
>> +#include <linux/interrupt.h>
>> +#include <trace/events/host1x.h>
>> +
>> +#define TRACE_MAX_LENGTH 128U
> 
> "" includes generally follow <> ones.

Will do.

> 
>> +/*
>> + * Add an entry to the sync queue.
>> + */
>> +static void add_to_sync_queue(struct host1x_cdma *cdma,
>> +                           struct host1x_job *job,
>> +                           u32 nr_slots,
>> +                           u32 first_get)
>> +{
>> +     if (job->syncpt_id == NVSYNCPT_INVALID) {
>> +             dev_warn(&job->ch->dev->dev, "%s: Invalid syncpt\n",
>> +                             __func__);
>> +             return;
>> +     }
>> +
>> +     job->first_get = first_get;
>> +     job->num_slots = nr_slots;
>> +     host1x_job_get(job);
>> +     list_add_tail(&job->list, &cdma->sync_queue);
>> +}
> 
> It's a bit odd that you pass a job in here along with some parameters
> that are then assigned to the job's fields. Couldn't you just assign
> them to the job's fields before passing the job into this function?
> 
> I also see that you only use this function once, so maybe you could
> open-code it instead.

I think open coding would be the best choice. There's no real reason to
have this as separate function. That'd solve the odd parameters
phenomenon, too.

> 
>> +/*
>> + * Return the status of the cdma's sync queue or push buffer for the given event
>> + *  - sq empty: returns 1 for empty, 0 for not empty (as in "1 empty queue" :-)
>> + *  - pb space: returns the number of free slots in the channel's push buffer
>> + * Must be called with the cdma lock held.
>> + */
>> +static unsigned int cdma_status_locked(struct host1x_cdma *cdma,
>> +             enum cdma_event event)
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     switch (event) {
>> +     case CDMA_EVENT_SYNC_QUEUE_EMPTY:
>> +             return list_empty(&cdma->sync_queue) ? 1 : 0;
>> +     case CDMA_EVENT_PUSH_BUFFER_SPACE: {
>> +             struct push_buffer *pb = &cdma->push_buffer;
>> +             return host1x->cdma_pb_op.space(pb);
>> +     }
>> +     default:
>> +             return 0;
>> +     }
>> +}
> 
> Similarly this function is only used in one place and it requires a
> whole lot of documentation to define the meaning of the return value. If
> you implement this functionality directly in host1x_cdma_wait_locked()
> you have much more context and don't require all this "protocol".

I agree, this function is confusing. For some future functionality, it's
going to be called from a second place with CDMA_EVENT_SYNC_QUEUE_EMPTY,
but it's better of both of those calls are just opened up to get rid of
the extra switch().

> 
>> +/*
>> + * Start timer for a buffer submition that has completed yet.
> 
> "submission". And I don't understand the "that has completed yet" part.

It should become "Start timer that tracks the time spent by the job".

> 
>> + * Must be called with the cdma lock held.
>> + */
>> +static void cdma_start_timer_locked(struct host1x_cdma *cdma,
>> +             struct host1x_job *job)
> 
> You use two different styles to indent the function parameters. You
> might want to stick to one, preferably aligning them with the first
> parameter on the first line.

I've generally favored "two tabs" indenting, but we'll anyway
standardize on one.

> 
>> +{
>> +     struct host1x *host = cdma_to_host1x(cdma);
>> +
>> +     if (cdma->timeout.clientid) {
>> +             /* timer already started */
>> +             return;
>> +     }
>> +
>> +     cdma->timeout.clientid = job->clientid;
>> +     cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
>> +     cdma->timeout.syncpt_val = job->syncpt_end;
>> +     cdma->timeout.start_ktime = ktime_get();
>> +
>> +     schedule_delayed_work(&cdma->timeout.wq,
>> +                     msecs_to_jiffies(job->timeout));
>> +}
>> +
>> +/*
>> + * Stop timer when a buffer submition completes.
> 
> "submission"

Will fix.

> 
>> +/*
>> + * For all sync queue entries that have already finished according to the
>> + * current sync point registers:
>> + *  - unpin & unref their mems
>> + *  - pop their push buffer slots
>> + *  - remove them from the sync queue
>> + * This is normally called from the host code's worker thread, but can be
>> + * called manually if necessary.
>> + * Must be called with the cdma lock held.
>> + */
>> +static void update_cdma_locked(struct host1x_cdma *cdma)
>> +{
>> +     bool signal = false;
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     struct host1x_job *job, *n;
>> +
>> +     /* If CDMA is stopped, queue is cleared and we can return */
>> +     if (!cdma->running)
>> +             return;
>> +
>> +     /*
>> +      * Walk the sync queue, reading the sync point registers as necessary,
>> +      * to consume as many sync queue entries as possible without blocking
>> +      */
>> +     list_for_each_entry_safe(job, n, &cdma->sync_queue, list) {
>> +             struct host1x_syncpt *sp = host1x->syncpt + job->syncpt_id;
> 
> host1x_syncpt_get()?

Yes, that should be used.

> 
>> +
>> +             /* Check whether this syncpt has completed, and bail if not */
>> +             if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
>> +                     /* Start timer on next pending syncpt */
>> +                     if (job->timeout)
>> +                             cdma_start_timer_locked(cdma, job);
>> +                     break;
>> +             }
>> +
>> +             /* Cancel timeout, when a buffer completes */
>> +             if (cdma->timeout.clientid)
>> +                     stop_cdma_timer_locked(cdma);
>> +
>> +             /* Unpin the memory */
>> +             host1x_job_unpin(job);
>> +
>> +             /* Pop push buffer slots */
>> +             if (job->num_slots) {
>> +                     struct push_buffer *pb = &cdma->push_buffer;
>> +                     host1x->cdma_pb_op.pop_from(pb, job->num_slots);
>> +                     if (cdma->event == CDMA_EVENT_PUSH_BUFFER_SPACE)
>> +                             signal = true;
>> +             }
>> +
>> +             list_del(&job->list);
>> +             host1x_job_put(job);
>> +     }
>> +
>> +     if (list_empty(&cdma->sync_queue) &&
>> +                             cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
>> +                     signal = true;
> 
> This looks funny, maybe:
> 
>         if (cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY &&
>             list_empty(&cdma->sync_queue))
>                 signal = true;
> 
> ?

Indenting at least is strange. I don't have a preference for the
ordering of conditions, so if you like the latter order, we can just use
that.

> 
>> +
>> +     /* Wake up CdmaWait() if the requested event happened */
> 
> CdmaWait()? Where's that?

host1x_cdma_wait_locked(). Will fix.

> 
>> +     if (signal) {
>> +             cdma->event = CDMA_EVENT_NONE;
>> +             up(&cdma->sem);
>> +     }
>> +}
>> +
>> +void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
>> +             struct platform_device *dev)
> 
> There's nothing in this function that requires a platform_device, so
> passing struct device should be enough. Or maybe host1x_cdma should get
> a struct device * field?

I think we'll just start using struct device * in general in code.
Arto's been already fixing a lot of these, so he might've already fixed
this.

> 
>> +{
>> +     u32 get_restart;
> 
> Maybe just call this "restart" or "restart_addr". get_restart sounds
> like a function name.

Ok, how about "restart_dmaget_addr"? That indicates what we're doing
with the restart address.

> 
>> +     u32 syncpt_incrs;
>> +     struct host1x_job *job = NULL;
>> +     u32 syncpt_val;
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +
>> +     syncpt_val = host1x_syncpt_load_min(cdma->timeout.syncpt);
>> +
>> +     dev_dbg(&dev->dev,
>> +             "%s: starting cleanup (thresh %d)\n",
>> +             __func__, syncpt_val);
> 
> This fits on two lines.

Will merge.

> 
>> +
>> +     /*
>> +      * Move the sync_queue read pointer to the first entry that hasn't
>> +      * completed based on the current HW syncpt value. It's likely there
>> +      * won't be any (i.e. we're still at the head), but covers the case
>> +      * where a syncpt incr happens just prior/during the teardown.
>> +      */
>> +
>> +     dev_dbg(&dev->dev,
>> +             "%s: skip completed buffers still in sync_queue\n",
>> +             __func__);
> 
> This too.

Ok.

> 
>> +     list_for_each_entry(job, &cdma->sync_queue, list) {
>> +             if (syncpt_val < job->syncpt_end)
>> +                     break;
>> +
>> +             host1x_job_dump(&dev->dev, job);
>> +     }
> 
> That's potentially a lot of debug output. I wonder if it might make
> sense to control parts of this via a module parameter. Then again, if
> somebody really needs to debug this, maybe they really want *all* the
> information.

host1x_job_dump() uses dev_dbg(), so it only dumps a lot if DEBUG has
been defined in that file.

> 
>> +     /*
>> +      * Walk the sync_queue, first incrementing with the CPU syncpts that
>> +      * are partially executed (the first buffer) or fully skipped while
>> +      * still in the current context (slots are also NOP-ed).
>> +      *
>> +      * At the point contexts are interleaved, syncpt increments must be
>> +      * done inline with the pushbuffer from a GATHER buffer to maintain
>> +      * the order (slots are modified to be a GATHER of syncpt incrs).
>> +      *
>> +      * Note: save in get_restart the location where the timed out buffer
>> +      * started in the PB, so we can start the refetch from there (with the
>> +      * modified NOP-ed PB slots). This lets things appear to have completed
>> +      * properly for this buffer and resources are freed.
>> +      */
>> +
>> +     dev_dbg(&dev->dev,
>> +             "%s: perform CPU incr on pending same ctx buffers\n",
>> +             __func__);
> 
> Can be collapsed to two lines.

Sure.

> 
>> +
>> +     get_restart = cdma->last_put;
>> +     if (!list_empty(&cdma->sync_queue))
>> +             get_restart = job->first_get;
> 
> Perhaps:
> 
>         if (list_empty(&cdma->sync_queue))
>                 restart = cdma->last_put;
>         else
>                 restart = job->first_get;
> 
> ?

That's equivalent in functionality, and there's one less assignment for
one path, so sounds good.

> 
>> +     list_for_each_entry_from(job, &cdma->sync_queue, list)
>> +             if (job->clientid == cdma->timeout.clientid)
>> +                     job->timeout = 500;
> 
> I think this warrants a comment.

Sure. We're accelerating timing out jobs for the client that submitted
the job that timed out. But we'll add a comment. And, in downstream, we
already changed this to "job->timeout = max(job->timeout, 500), so we
should use that.

> 
>> +/*
>> + * Destroy a cdma
>> + */
>> +void host1x_cdma_deinit(struct host1x_cdma *cdma)
>> +{
>> +     struct push_buffer *pb = &cdma->push_buffer;
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +
>> +     if (cdma->running) {
>> +             pr_warn("%s: CDMA still running\n",
>> +                             __func__);
>> +     } else {
>> +             host1x->cdma_pb_op.destroy(pb);
>> +             host1x->cdma_op.timeout_destroy(cdma);
>> +     }
>> +}
> 
> There's no way to recover from the situation where a cdma is still
> running. Can this not return an error code (-EBUSY?) if the cdma can't
> be destroyed?

It's called from close(), which cannot return an error code. It's
actually more of a power optimization. The effect is that if there are
no users for channel, we'll just not free up the push buffer.

I think the proper fix would actually be to check in host1x_cdma_init()
if push buffer is already allocated and cdma->running. In that case we
could skip most of initialization.

> 
>> +/*
>> + * End a cdma submit
>> + * Kick off DMA, add job to the sync queue, and a number of slots to be freed
>> + * from the pushbuffer. The handles for a submit must all be pinned at the same
>> + * time, but they can be unpinned in smaller chunks.
>> + */
>> +void host1x_cdma_end(struct host1x_cdma *cdma,
>> +             struct host1x_job *job)
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     bool was_idle = list_empty(&cdma->sync_queue);
> 
> Maybe just "idle"? It reflects the current state of the CDMA, not any
> old state.

Ok.

> 
>> +
>> +     host1x->cdma_op.kick(cdma);
>> +
>> +     add_to_sync_queue(cdma,
>> +                     job,
>> +                     cdma->slots_used,
>> +                     cdma->first_get);
> 
> No need to split this over so many lines. Also, shouldn't the order be
> reversed here? I.e. first add to sync queue, then start DMA?

Yeah, I think the order should be reversed. And, we're anyway moving the
code inline, so there's no function call.

> 
>> +     /* start timer on idle -> active transitions */
>> +     if (job->timeout && was_idle)
>> +             cdma_start_timer_locked(cdma, job);
> 
> This could be part of add_to_sync_queue(), but if you open-code that as
> I suggest earlier it should obviously stay.

Yep, let's open-code that.

> 
>> diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
> [...]
>> +struct platform_device;
> 
> No need for this if you pass struct device * instead.

Will change.

> 
>> +/*
>> + * cdma
>> + *
>> + * This is in charge of a host command DMA channel.
>> + * Sends ops to a push buffer, and takes responsibility for unpinning
>> + * (& possibly freeing) of memory after those ops have completed.
>> + * Producer:
>> + *   begin
>> + *           push - send ops to the push buffer
>> + *   end - start command DMA and enqueue handles to be unpinned
>> + * Consumer:
>> + *   update - call to update sync queue and push buffer, unpin memory
>> + */
> 
> I find the name to be a bit confusing. For some reason I automatically
> think of GSM when I read CDMA. This really is more of a job queue, so
> maybe calling it host1x_job_queue might be more appropriate. But I've
> already requested a lot of things to be renamed, so I think I can live
> with this being called CDMA if you don't want to change it.
> 
> Alternatively all of these could be moved to the struct host1x_channel
> given that there's only one of each of the push_buffer, buffer_timeout
> and host1x_cma objects per channel.

I did consider merging those two at a time. That should work, as they
both deal with channels essentially. I also saw that the resulting file
and data structures became quite large, so I have so far preferred to
keep them separate.

This way I can keep the "higher level" stuff (inserting setclass,
serializing, allocating sync point ranges, etc) in one file and lower
level stuff (write to hardware, deal with push buffer pointers, etc) in
another.

> 
>> diff --git a/drivers/gpu/host1x/channel.c b/drivers/gpu/host1x/channel.c
> [...]
>> +#include "channel.h"
>> +#include "dev.h"
>> +#include "job.h"
>> +
>> +#include <linux/slab.h>
>> +#include <linux/module.h>
> 
> Again the include ordering is strange.

Will fix.

> 
>> +/*
>> + * Iterator function for host1x device list
>> + * It takes a fptr as an argument and calls that function for each
>> + * device in the list
>> + */
>> +void host1x_channel_for_all(struct host1x *host1x, void *data,
>> +     int (*fptr)(struct host1x_channel *ch, void *fdata))
>> +{
>> +     struct host1x_channel *ch;
>> +     int ret;
>> +
>> +     list_for_each_entry(ch, &host1x->chlist.list, list) {
>> +             if (ch && fptr) {
>> +                     ret = fptr(ch, data);
>> +                     if (ret) {
>> +                             pr_info("%s: iterator error\n", __func__);
>> +                             break;
>> +                     }
>> +             }
>> +     }
>> +}
> 
> Couldn't you rewrite this as a macro, similar to list_for_each_entry()
> so that users could do something like:
> 
>         host1x_for_each_channel(channel, host1x) {
>                 ...
>         }
> 
> That's a bit friendlier than having each user write a separate function
> to be called from this iterator.

Sounds good, we'll try that. My macro magic is rusty, but I trust
list_for_each_entry() will give a template.

> 
>> +int host1x_channel_submit(struct host1x_job *job)
>> +{
>> +     return host1x_get_host(job->ch->dev)->channel_op.submit(job);
>> +}
> 
> I'd expect a function named host1x_channel_submit() to take a struct
> host1x_channel *. Should this perhaps be called host1x_job_submit()?

It calls into channel code directly, and the underlying op also just
takes a job. We could add channel as a parameter, and not pass it in
host1x_job_alloc(). but we actually need the channel data already in
host1x_job_pin(), which comes before submit. We need it so that we pin
the buffer to correct engine.

> 
>> +struct host1x_channel *host1x_channel_get(struct host1x_channel *ch)
>> +{
>> +     int err = 0;
>> +
>> +     mutex_lock(&ch->reflock);
>> +     if (ch->refcount == 0)
>> +             err = host1x_cdma_init(&ch->cdma);
>> +     if (!err)
>> +             ch->refcount++;
>> +
>> +     mutex_unlock(&ch->reflock);
>> +
>> +     return err ? NULL : ch;
>> +}
> 
> Why don't you use any of the kernel's reference counting mechanisms?
> 
>> +void host1x_channel_put(struct host1x_channel *ch)
>> +{
>> +     mutex_lock(&ch->reflock);
>> +     if (ch->refcount == 1) {
>> +             host1x_get_host(ch->dev)->cdma_op.stop(&ch->cdma);
>> +             host1x_cdma_deinit(&ch->cdma);
>> +     }
>> +     ch->refcount--;
>> +     mutex_unlock(&ch->reflock);
>> +}
> 
> I think you can do all of this using a kref.

I think the original reason was that there's no reason to use atomic
kref, as we anyway have to do mutual exclusion via mutex. But, using
kref won't be any problem, so we could use that.

> 
>> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
>> +{
>> +     struct host1x_channel *ch = NULL;
>> +     struct host1x *host1x = host1x_get_host(pdev);
>> +     int chindex;
>> +     int max_channels = host1x->info.nb_channels;
>> +     int err;
>> +
>> +     mutex_lock(&host1x->chlist_mutex);
>> +
>> +     chindex = host1x->allocated_channels;
>> +     if (chindex > max_channels)
>> +             goto fail;
>> +
>> +     ch = kzalloc(sizeof(*ch), GFP_KERNEL);
>> +     if (ch == NULL)
>> +             goto fail;
>> +
>> +     /* Link platform_device to host1x_channel */
>> +     err = host1x->channel_op.init(ch, host1x, chindex);
>> +     if (err < 0)
>> +             goto fail;
>> +
>> +     ch->dev = pdev;
>> +
>> +     /* Add to channel list */
>> +     list_add_tail(&ch->list, &host1x->chlist.list);
>> +
>> +     host1x->allocated_channels++;
>> +
>> +     mutex_unlock(&host1x->chlist_mutex);
>> +     return ch;
>> +
>> +fail:
>> +     dev_err(&pdev->dev, "failed to init channel\n");
>> +     kfree(ch);
>> +     mutex_unlock(&host1x->chlist_mutex);
>> +     return NULL;
>> +}
> 
> I think the critical section could be shorter here. It's probably not
> worth the extra trouble, though, given that channels are not often
> allocated.

Yeah, boot time isn't measured in microseconds. :-) But, if we just make
allocated_channels an atomic, we should be able to drop chlist_mutex
altogether and it could simplify the code.

> 
>> +void host1x_channel_free(struct host1x_channel *ch)
>> +{
>> +     struct host1x *host1x = host1x_get_host(ch->dev);
>> +     struct host1x_channel *chiter, *tmp;
>> +     list_for_each_entry_safe(chiter, tmp, &host1x->chlist.list, list) {
>> +             if (chiter == ch) {
>> +                     list_del(&chiter->list);
>> +                     kfree(ch);
>> +                     host1x->allocated_channels--;
>> +
>> +                     return;
>> +             }
>> +     }
>> +}
> 
> This doesn't free the channel if it happens to not be part of the host1x
> channel list. Perhaps an easier way to write it would be:
> 
>         host1x = host1x_get_host(ch->dev);
> 
>         list_del(&ch->list);
>         kfree(ch);
> 
>         host1x->allocated_channels--;
> 
> Looking at the rest of the code, it seems like a channel will never not
> be part of the host1x channel list, so I don't think there's a need to
> to scan the list.

I think you're right. This is just overprotective. Your variant does the
same thing with much less code.

> 
> On a side-note: generally if you break out of the loop right after
> freeing the memory of a removed node, there's no need to use the _safe
> variant since you won't be accessing the .next field of the freed node
> anyway.

That's true.

> 
> Maybe these should also adopt a similar naming as what we discussed for
> the syncpoints. That is:
> 
>         struct host1x_channel *host1x_channel_request(struct device *dev);
> 
> ?

Sounds good.

> 
>> diff --git a/drivers/gpu/host1x/channel.h b/drivers/gpu/host1x/channel.h
> [...]
>> +
>> +/*
>> + * host1x device list in debug-fs dump of host1x and client device
>> + * as well as channel state
>> + */
> 
> I don't understand this comment.

Probably because it's not a sentence and doesn't make sense. I think
it's just misplaced. We'll find its proper home.

> 
>> +struct host1x_channel {
>> +     struct list_head list;
>> +
>> +     int refcount;
>> +     int chid;
> 
> This can probably just be id. It is a field of host1x_channel, so the ch
> prefix is redundant.

Ok.

> 
>> +     struct mutex reflock;
>> +     struct mutex submitlock;
>> +     void __iomem *regs;
>> +     struct device *node;
> 
> This is never used.

Yep, let's remove "node".

> 
>> +     struct platform_device *dev;
> 
> Can this be just struct device *?

I think so. I'll let Arto look at all places where we could change
platform_device->device. He was already on it.

> 
>> +     struct cdev cdev;
> 
> This is never used.

Will remove.

> 
>> +/* channel list operations */
>> +void host1x_channel_list_init(struct host1x *);
>> +void host1x_channel_for_all(struct host1x *, void *data,
>> +     int (*fptr)(struct host1x_channel *ch, void *fdata));
>> +
>> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev);
>> +void host1x_channel_free(struct host1x_channel *ch);
> 
> Is it a good idea to make host1x_channel_free() publicly available?
> Shouldn't the host1x_channel_alloc()/host1x_channel_request() return a
> host1x_channel with a reference count of 1 and everybody release their
> reference using host1x_channel_put() to make sure the channel is freed
> only after the last reference disappears?
> 
> Otherwise whoever calls host1x_channel_free() will confuse everybody
> else that's still keeping a reference.

The difference is that _put and _get are called to indicate how many
user space processes there are for the channel. Even if there are no
processes, we won't free the channel structure - we just freeze the channel.

_alloc and _free are different in that they actually create the channel
structs and delete them and they follow the lifecycle of the driver.
Perhaps we should figure new naming, but refcounting and alloc/free
cannot be merged here.

> 
>> diff --git a/drivers/gpu/host1x/cma.c b/drivers/gpu/host1x/cma.c
> [...]
> 
> Various spurious blank lines in this file, and the alignment of function
> parameters is off.

Will fix.

> 
>> +struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev)
> 
> I don't think this needs platform_device either.

Will fix.

> 
>> +{
>> +     struct drm_gem_cma_object *obj = to_cma_obj((void *)id);
>> +     struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
>> +
>> +     mutex_lock(struct_mutex);
>> +     drm_gem_object_reference(&obj->base);
>> +     mutex_unlock(struct_mutex);
> 
> I think it's more customary to obtain a pointer to struct drm_device and
> then use mutex_{lock,unlock}(&drm->struct_mutex). Or you could just use
> drm_gem_object_reference_unlocked(&obj->base) instead. Which doesn't
> exist yet, apparently. But it could be added.

I think we could take the former path - just refer to mutex in a
different way.

>> +int host1x_cma_pin_array_ids(struct platform_device *dev,
>> +             long unsigned *ids,
>> +             long unsigned id_type_mask,
>> +             long unsigned id_type,
>> +             u32 count,
>> +             struct host1x_job_unpin_data *unpin_data,
>> +             dma_addr_t *phys_addr)
> 
> struct device * and unsigned long please. count can also doesn't need to
> be a sized type. unsigned int will do just fine. The return value can
> also be unsigned int if you don't expect to return any error conditions.

I think we'll need to check these. ids probably needs to be a u32 *, and
id_type_mask and id_type should be u32. They come like that from user space.

> 
>> +{
>> +     int i;
>> +     int pin_count = 0;
> 
> Both should be unsigned as well, and can go on one line:
> 
>         unsigned int pin_count = 0, i;

Ok.

> 
>> diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
> [...]
>>  struct host1x;
>> +struct host1x_intr;
>>  struct host1x_syncpt;
>> +struct host1x_channel;
>> +struct host1x_cdma;
>> +struct host1x_job;
>> +struct push_buffer;
>> +struct dentry;
> 
> I think this already belongs in a previous patch. The debugfs dentry
> isn't added in this patch.

Ok, that was a mistake I did when I re-split after one of the previous
rounds. I compiled (at least thought I did) after each patch, so it
might be that these aren't actually needed.

> 
>> +struct host1x_channel_ops {
>> +     int (*init)(struct host1x_channel *,
>> +                 struct host1x *,
>> +                 int chid);
> 
> Please add the parameter names as well (the same goes for all ops
> declared in this file). And "id" will be enough. Also the channel ID can
> surely be unsigned, right?

Sure to all of these.

> 
>> +struct host1x_cdma_ops {
>> +     void (*start)(struct host1x_cdma *);
>> +     void (*stop)(struct host1x_cdma *);
>> +     void (*kick)(struct  host1x_cdma *);
>> +     int (*timeout_init)(struct host1x_cdma *,
>> +                         u32 syncpt_id);
>> +     void (*timeout_destroy)(struct host1x_cdma *);
>> +     void (*timeout_teardown_begin)(struct host1x_cdma *);
>> +     void (*timeout_teardown_end)(struct host1x_cdma *,
>> +                                  u32 getptr);
>> +     void (*timeout_cpu_incr)(struct host1x_cdma *,
>> +                              u32 getptr,
>> +                              u32 syncpt_incrs,
>> +                              u32 syncval,
>> +                              u32 nr_slots);
>> +};
> 
> Can the timeout_ prefix not be dropped? The functions are generally
> useful and not directly related to timeouts, even though they seem to
> only be used during timeout handling.

All the timeout functions actually access the timeout struct, so they're
not generic. Teardown functions are the only ones which don't access
timeout.

> 
> Also, is it really necessary to abstract these into an ops structure? I
> get that newer hardware revisions might require different ops for sync-
> point handling because the register layout or number of syncpoints may
> be different, but the CDMA and push buffer (below) concepts are pretty
> much a software abstraction, and as such its implementation is unlikely
> to change with some future hardware revision.

Pushbuffer ops can become generic. There's only one catch - init uses
the restart opcode. But the opcode is not going to change, so we can
generalize that.

> 
>> +struct host1x_pushbuffer_ops {
>> +     void (*reset)(struct push_buffer *);
>> +     int (*init)(struct push_buffer *);
>> +     void (*destroy)(struct push_buffer *);
>> +     void (*push_to)(struct push_buffer *,
>> +                     struct mem_handle *,
>> +                     u32 op1, u32 op2);
>> +     void (*pop_from)(struct push_buffer *,
>> +                      unsigned int slots);
> 
> Maybe just push() and pop()?

Can do.

> 
>> +     u32 (*space)(struct push_buffer *);
>> +     u32 (*putptr)(struct push_buffer *);
>> +};
>>
>>  struct host1x_syncpt_ops {
>>       void (*reset)(struct host1x_syncpt *);
>> @@ -64,9 +111,19 @@ struct host1x {
>>       struct host1x_device_info info;
>>       struct clk *clk;
>>
>> +     /* Sync point dedicated to replacing waits for expired fences */
>> +     struct host1x_syncpt *nop_sp;
>> +
>> +     struct host1x_channel_ops channel_op;
>> +     struct host1x_cdma_ops cdma_op;
>> +     struct host1x_pushbuffer_ops cdma_pb_op;
>>       struct host1x_syncpt_ops syncpt_op;
>>       struct host1x_intr_ops intr_op;
>>
>> +     struct mutex chlist_mutex;
>> +     struct host1x_channel chlist;
> 
> Shouldn't this just be struct list_head?

I think you're right, to follow the normal kernel conventions.

> 
>> +     int allocated_channels;
> 
> unsigned int? And maybe just "num_channels"?

num_channels could be thought as "number of available channels", so I'd
like to use num_allocated_channels here.

> 
>> diff --git a/drivers/gpu/host1x/host1x.h b/drivers/gpu/host1x/host1x.h
> [...]
>> +enum host1x_class {
>> +     NV_HOST1X_CLASS_ID              = 0x1,
>> +     NV_GRAPHICS_2D_CLASS_ID         = 0x51,
> 
> This entry belongs in a later patch, right? And I find it convenient if
> enumeration constants start with the enum name as prefix. Furthermore
> it'd be nice to reuse the hardware module names, like so:
> 
>         enum host1x_class {
>                 HOST1X_CLASS_HOST1X,
>                 HOST1X_CLASS_GR2D,
>                 HOST1X_CLASS_GR3D,
>         };

The naming sounds good. We already use HOST1X_CLASS_HOST1X in code to
insert a wait. If you'd prefer, we can move the definition of
HOST1X_CLASS_GR2D to the later patch.

> 
>> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
> [...]
>> +#include <linux/slab.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/dma-mapping.h>
>> +#include "cdma.h"
>> +#include "channel.h"
>> +#include "dev.h"
>> +#include "memmgr.h"
>> +
>> +#include "cdma_hw.h"
>> +
>> +static inline u32 host1x_channel_dmactrl(int stop, int get_rst, int init_get)
>> +{
>> +     return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop)
>> +             | HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst)
>> +             | HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
> 
> I think it is more customary to put the | at the end of the preceding
> line:
> 
>         return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop) |
>                HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst) |
>                HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
> 
> Also since these are all single bits, I'd prefer if you could drop the
> _F suffix and not make them take a parameter. I think it'd even be
> better not to have this function at all, but make the intent explicit
> where the register is written. That is, have each call site set the bits
> explicitly instead of calling this helper. Having a parameter list such
> as (true, false, false) or (true, true, true) is confusing since you
> have to keep looking up the meaning of the parameters.

The operation that the _F macros do is masking and bit shifting the
fields correctly. Without that, we'd need to expose several macros to
mask and shift, and I'd rather just have one macro to take care of that.

But, we can open code the function to wherever it's used if that's more
readable.

> 
>> +}
>> +
>> +static void cdma_timeout_handler(struct work_struct *work);
> 
> Can this prototype be avoided?

We could try shuffling the code. There might be some dependency problems
that forced this ordering, but we'll try.

> 
>> +/**
>> + * Reset to empty push buffer
>> + */
>> +static void push_buffer_reset(struct push_buffer *pb)
>> +{
>> +     pb->fence = PUSH_BUFFER_SIZE - 8;
>> +     pb->cur = 0;
> 
> Maybe position is a better name than cur.

Sure.

> 
>> +/**
>> + * Init push buffer resources
>> + */
>> +static void push_buffer_destroy(struct push_buffer *pb);
> 
> You should be careful with these comment blocks. If you start them with
> /**, then you should make them proper kerneldoc comments. But you don't
> really need that for static functions, so you could just make them /*-
> style.
> 
> Also this particular comment is confusingly place on top of the proto-
> type of push_buffer_destroy().

You're right. We'll just remove the /** */ notation and use normal
comments. And the comment is just misplaced, so we'll move it.

> 
>> +/*
>> + * Push two words to the push buffer
>> + * Caller must ensure push buffer is not full
>> + */
>> +static void push_buffer_push_to(struct push_buffer *pb,
>> +             struct mem_handle *handle,
>> +             u32 op1, u32 op2)
>> +{
>> +     u32 cur = pb->cur;
>> +     u32 *p = (u32 *)((u32)pb->mapped + cur);
> 
> You do all this extra casting to make sure to increment by bytes and not
> 32-bit words. How about you change pb->cur to contain the word index, so
> that you don't have to go through hoops each time around.
> 
> Alternatively you could make it a pointer to u32 and not have to index
> or cast at all. So you'd end up with something like:
> 
>         struct push_buffer {
>                 u32 *start;
>                 u32 *end;
>                 u32 *ptr;
>         };

The complexity comes from the fact that we deal both with device virtual
addresses and CPU addresses to the same buffer. We'll need the indexes
so that we can convert between the two address spaces, but we might be
able to use word indexes. We'll check this.

> 
>> +/*
>> + * Return the number of two word slots free in the push buffer
>> + */
>> +static u32 push_buffer_space(struct push_buffer *pb)
>> +{
>> +     return ((pb->fence - pb->cur) & (PUSH_BUFFER_SIZE - 1)) / 8;
>> +}
> 
> Why & (PUSH_BUFFER_SIZE - 1) here? fence - cur can never be larger than
> PUSH_BUFFER_SIZE, can it?

You're right, this function doesn't need to worry about wrapping.

> 
>> +/*
>> + * Init timeout resources
>> + */
>> +static int cdma_timeout_init(struct host1x_cdma *cdma,
>> +                              u32 syncpt_id)
>> +{
>> +     if (syncpt_id == NVSYNCPT_INVALID)
>> +             return -EINVAL;
> 
> Do we really need the syncpt_id check here? It is the only reason why we
> need to pass the parameter in the first place, and if we get to this
> point we should already have made sure that the syncpoint is actually
> valid.

True, we can drop this.

> 
>> +/*
>> + * Increment timedout buffer's syncpt via CPU.
> 
> Nit: "timed out buffer's"

Will fix.

> 
>> + */
>> +static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
>> +                             u32 syncpt_incrs, u32 syncval, u32 nr_slots)
> 
> The syncval parameter isn't used.

True, that'd be used only with wait base support, as we need to
synchronize wait base with the sync point. Will remove.

> 
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     struct push_buffer *pb = &cdma->push_buffer;
>> +     u32 i, getidx;
>> +
>> +     for (i = 0; i < syncpt_incrs; i++)
>> +             host1x_syncpt_cpu_incr(cdma->timeout.syncpt);
>> +
>> +     /* after CPU incr, ensure shadow is up to date */
>> +     host1x_syncpt_load_min(cdma->timeout.syncpt);
>> +
>> +     /* NOP all the PB slots */
>> +     getidx = getptr - pb->phys;
>> +     while (nr_slots--) {
>> +             u32 *p = (u32 *)((u32)pb->mapped + getidx);
>> +             *(p++) = HOST1X_OPCODE_NOOP;
>> +             *(p++) = HOST1X_OPCODE_NOOP;
>> +             dev_dbg(&host1x->dev->dev, "%s: NOP at 0x%x\n",
>> +                     __func__, pb->phys + getidx);
>> +             getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
>> +     }
>> +     wmb();
> 
> Why the memory barrier?

Can't think of any good reason. Will try removing.

> 
>> +/*
>> + * Similar to cdma_start(), but rather than starting from an idle
>> + * state (where DMA GET is set to DMA PUT), on a timeout we restore
>> + * DMA GET from an explicit value (so DMA may again be pending).
>> + */
>> +static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     struct host1x_channel *ch = cdma_to_channel(cdma);
>> +
>> +     if (cdma->running)
>> +             return;
>> +
>> +     cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
>> +
>> +     host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
>> +             HOST1X_CHANNEL_DMACTRL);
>> +
>> +     /* set base, end pointer (all of memory) */
>> +     host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
>> +     host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
> 
> According to the TRM, writing to HOST1X_CHANNEL_DMASTART will start a
> DMA transfer on the channel (if DMA_PUT != DMA_GET). Irrespective of
> that, why set the valid range to all of physical memory? We know the
> valid range of the push buffer, why not set the limits accordingly?

That'd make sense. Currently we use the RESTART as the barrier, but
having hardware check against DMAEND is a good idea, too.

> 
>> +/*
>> + * Kick channel DMA into action by writing its PUT offset (if it has changed)
>> + */
>> +static void cdma_kick(struct host1x_cdma *cdma)
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     struct host1x_channel *ch = cdma_to_channel(cdma);
>> +     u32 put;
>> +
>> +     put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
>> +
>> +     if (put != cdma->last_put) {
>> +             host1x_ch_writel(ch, put, HOST1X_CHANNEL_DMAPUT);
>> +             cdma->last_put = put;
>> +     }
>> +}
> 
> kick() sounds unusual. Maybe flush or commit or something similar would
> be more accurate.

We could use flush.

> 
>> +static void cdma_stop(struct host1x_cdma *cdma)
>> +{
>> +     struct host1x_channel *ch = cdma_to_channel(cdma);
>> +
>> +     mutex_lock(&cdma->lock);
>> +     if (cdma->running) {
>> +             host1x_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_EMPTY);
>> +             host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
>> +                     HOST1X_CHANNEL_DMACTRL);
>> +             cdma->running = false;
>> +     }
>> +     mutex_unlock(&cdma->lock);
>> +}
> 
> Perhaps this should be ranem cdma_stop_sync() or similar to make it
> clear that it waits for the queue to run empty.

Ok, sounds good.

> 
>> +static void cdma_timeout_teardown_end(struct host1x_cdma *cdma, u32 getptr)
> 
> Maybe the last parameter should be called restart to match its purpose?

Makes sense, will do.

> 
>> +{
>> +     struct host1x *host1x = cdma_to_host1x(cdma);
>> +     struct host1x_channel *ch = cdma_to_channel(cdma);
>> +     u32 cmdproc_stop;
>> +
>> +     dev_dbg(&host1x->dev->dev,
>> +             "end channel teardown (id %d, DMAGET restart = 0x%x)\n",
>> +             ch->chid, getptr);
>> +
>> +     cmdproc_stop = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
>> +     cmdproc_stop &= ~(BIT(ch->chid));
> 
> No need for the extra parentheses.

Ok, will remove.

> 
>> +     host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
>> +
>> +     cdma->torndown = false;
>> +     cdma_timeout_restart(cdma, getptr);
>> +}
> 
> I find this a bit non-intuitive. We teardown a channel, and when we're
> done tearing down, the torndown variable is set to false and the channel
> is actually restarted. Maybe you could explain some more how this works
> and what its purpose is.

Actually, teardown_begin freezes the channel, then we manipulate the
queue, and in the end teardown_end restarts the channel. So these should
be named freeze and resume. We could even drop the timeout from the
names of these functions.

> 
>> +/*
>> + * If this timeout fires, it indicates the current sync_queue entry has
>> + * exceeded its TTL and the userctx should be timed out and remaining
>> + * submits already issued cleaned up (future submits return an error).
>> + */
> 
> I can't seem to find what causes subsequent submits to return an error.
> Also, how is the channel reset so that new jobs can be submitted?

That comment actually applies only downstream. We blacklist contexts for
channels that carry state across submits (=have hardware contexts
implemented). 2D has atomic jobs, so it doesn't need blacklisting.

host1x_cdma_update_sync_queue() purges the failed job, finds the DMAGET
for the next job, and sets sync points correctly. It'll call
teardown_end (which we'll rename) to resume the channel with the new
DMAGET pointer.

> 
>> +static void cdma_timeout_handler(struct work_struct *work)
>> +{
>> +     struct host1x_cdma *cdma;
>> +     struct host1x *host1x;
>> +     struct host1x_channel *ch;
>> +
>> +     u32 syncpt_val;
>> +
>> +     u32 prev_cmdproc, cmdproc_stop;
>> +
>> +     cdma = container_of(to_delayed_work(work), struct host1x_cdma,
>> +                         timeout.wq);
>> +     host1x = cdma_to_host1x(cdma);
>> +     ch = cdma_to_channel(cdma);
>> +
>> +     mutex_lock(&cdma->lock);
>> +
>> +     if (!cdma->timeout.clientid) {
>> +             dev_dbg(&host1x->dev->dev,
>> +                      "cdma_timeout: expired, but has no clientid\n");
>> +             mutex_unlock(&cdma->lock);
>> +             return;
>> +     }
> 
> How can the CDMA not have a client?

I don't think that's possible. :-) We should just remove the check. It
might be that we were just protecting some kind of race between timeout
code triggering and something else, but I can't really think of a scenario.

> 
>> +
>> +     /* stop processing to get a clean snapshot */
>> +     prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
>> +     cmdproc_stop = prev_cmdproc | BIT(ch->chid);
>> +     host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
>> +
>> +     dev_dbg(&host1x->dev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
>> +             prev_cmdproc, cmdproc_stop);
>> +
>> +     syncpt_val = host1x_syncpt_load_min(host1x->syncpt);
>> +
>> +     /* has buffer actually completed? */
>> +     if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
>> +             dev_dbg(&host1x->dev->dev,
>> +                      "cdma_timeout: expired, but buffer had completed\n");
> 
> Maybe this should really be a warning?

Not really - it's actually just a normal state. We got a timeout event,
but before we process it, it might be that the job manages to complete.
This can happen, and is not an error case.

> 
>> +             /* restore */
>> +             cmdproc_stop = prev_cmdproc & ~(BIT(ch->chid));
> 
> No need for the extra parentheses. Also, why not just use prev_cmdproc,
> which shouldn't have the bit set anyway?

Yeah, prev_cmdproc is the one we should use directly.

> 
>> diff --git a/drivers/gpu/host1x/hw/cdma_hw.h b/drivers/gpu/host1x/hw/cdma_hw.h
> [...]
>> +/*
>> + * Size of the sync queue. If it is too small, we won't be able to queue up
>> + * many command buffers. If it is too large, we waste memory.
>> + */
>> +#define HOST1X_SYNC_QUEUE_SIZE 512
> 
> I don't see this used anywhere.

Sync queue used to be an array. It hasn't been for a long time, but this
remained. Will remove.

> 
>> +/*
>> + * Number of gathers we allow to be queued up per channel. Must be a
>> + * power of two. Currently sized such that pushbuffer is 4KB (512*8B).
>> + */
>> +#define HOST1X_GATHER_QUEUE_SIZE 512
> 
> More pieces falling into place.

Great. :-)

> 
>> diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
> [...]
>> +#include "host1x.h"
>> +#include "channel.h"
>> +#include "dev.h"
>> +#include <linux/slab.h>
>> +#include "intr.h"
>> +#include "job.h"
>> +#include <trace/events/host1x.h>
> 
> More include ordering issues.

Will fix.

> 
>> +static void submit_gathers(struct host1x_job *job)
>> +{
>> +     /* push user gathers */
>> +     int i;
> 
> unsigned int?
> 
>> +     for (i = 0 ; i < job->num_gathers; i++) {
>> +             struct host1x_job_gather *g = &job->gathers[i];
>> +             u32 op1 = host1x_opcode_gather(g->words);
>> +             u32 op2 = g->mem_base + g->offset;
>> +             host1x_cdma_push_gather(&job->ch->cdma,
>> +                             job->gathers[i].ref,
>> +                             job->gathers[i].offset,
>> +                             op1, op2);
>> +     }
>> +}
> 
> Perhaps inline this into channel_submit()? I'm not sure how useful it
> really is to split off smallish functions such as this which aren't
> reused anywhere else. I don't have any major objection though, so you
> can keep it separate if you want.

I split these out because channel_submit() became so long that I
couldn't understand it anymore. I'd prefer keeping separate just to keep
myself (semi-)sane.

> 
>> +static inline void __iomem *host1x_channel_regs(void __iomem *p, int ndx)
>> +{
>> +     p += ndx * NV_HOST1X_CHANNEL_MAP_SIZE_BYTES;
>> +     return p;
>> +}
>> +
>> +static int host1x_channel_init(struct host1x_channel *ch,
>> +     struct host1x *dev, int index)
>> +{
>> +     ch->chid = index;
>> +     mutex_init(&ch->reflock);
>> +     mutex_init(&ch->submitlock);
>> +
>> +     ch->regs = host1x_channel_regs(dev->regs, index);
>> +     return 0;
>> +}
> 
> You only use host1x_channel_regs() once, so I really don't think it buys
> you anything to split it off. Both host1x_channel_regs() and
> host1x_channel_init() are short enough that they can be collapsed.

True, will merge.

> 
>> diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
> [...]
>>  #include "hw/host1x01.h"
>>  #include "dev.h"
>> +#include "channel.h"
>>  #include "hw/host1x01_hardware.h"
>>
>> +#include "hw/channel_hw.c"
>> +#include "hw/cdma_hw.c"
>>  #include "hw/syncpt_hw.c"
>>  #include "hw/intr_hw.c"
>>
>>  int host1x01_init(struct host1x *host)
>>  {
>> +     host->channel_op = host1x_channel_ops;
>> +     host->cdma_op = host1x_cdma_ops;
>> +     host->cdma_pb_op = host1x_pushbuffer_ops;
>>       host->syncpt_op = host1x_syncpt_ops;
>>       host->intr_op = host1x_intr_ops;
> 
> I think I mentioned this before, but I'd prefer not to have the .c files
> included here, but rather reference the ops structures externally. But I
> still think that especially CDMA and push buffer ops don't need to be in
> separate structures since they aren't likely to change with new hardware
> revisions.

The C files need to be included here so that they pick up the hardware
defs for the correct SoC. Pushbuffer is probably something we can
generalize, but channel registers can change, so they need to be per SoC.

> 
>> diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
> [...]
>> index c1d5324..03873c0 100644
>> --- a/drivers/gpu/host1x/hw/host1x01_hardware.h
>> +++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
>> @@ -21,6 +21,130 @@
>>
>>  #include <linux/types.h>
>>  #include <linux/bitops.h>
>> +#include "hw_host1x01_channel.h"
>>  #include "hw_host1x01_sync.h"
>> +#include "hw_host1x01_uclass.h"
>> +
>> +/* channel registers */
>> +#define NV_HOST1X_CHANNEL_MAP_SIZE_BYTES 16384
> 
> The only user of this seems to be host1x_channel_regs(), so it could be
> moved to that file. Also the name is overly long, why not something like
> HOST1X_CHANNEL_SIZE?

Sounds good.

> 
>> +#define HOST1X_OPCODE_NOOP host1x_opcode_nonincr(0, 0)
> 
> HOST1X_OPCODE_NOP would be more canonical in my opinion.

Ok, can change.

> 
> 
>> +static inline u32 host1x_mask2(unsigned x, unsigned y)
>> +{
>> +     return 1 | (1 << (y - x));
>> +}
> 
> What's this? I don't see it used anywhere.

It's a shortcut to add two register writes to one MASK opcode, but we'll
remove the def as it's not used.

> 
>> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
> [...]
>> +#define HOST1X_CHANNEL_DMACTRL_DMASTOP_F(v) \
>> +     host1x_channel_dmactrl_dmastop_f(v)
> 
> I mentioned this elsewhere already, but I think the _F suffix (and _f
> for that matter) along with the v parameter should go away.

I'd prefer keeping so that I don't have to use two #defines to replace
one. That IMO makes the usage harder and more error prone.

> 
>> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
> [...]
> 
> What does the "uclass" stand for? It seems a bit useless to me.

It means host1x class, i.e. the host1x registers that can be written to
from push buffers.

> 
>> diff --git a/drivers/gpu/host1x/hw/syncpt_hw.c b/drivers/gpu/host1x/hw/syncpt_hw.c
>> index 16e3ada..ba48cee 100644
>> --- a/drivers/gpu/host1x/hw/syncpt_hw.c
>> +++ b/drivers/gpu/host1x/hw/syncpt_hw.c
>> @@ -97,6 +97,15 @@ static void syncpt_cpu_incr(struct host1x_syncpt *sp)
>>       wmb();
>>  }
>>
>> +/* remove a wait pointed to by patch_addr */
>> +static int syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
>> +{
>> +     u32 override = host1x_class_host_wait_syncpt(
>> +                     NVSYNCPT_GRAPHICS_HOST, 0);
>> +     __raw_writel(override, patch_addr);
> 
> __raw_writel() isn't meant to be used for regular memory addresses, but
> only for MMIO addresses. patch_addr will be a kernel virtual address to
> an location in RAM, so you can just treat it as a normal pointer, so:
> 
>         *(u32 *)patch_addr = override;

Sure, you mentioned it earlier, but I've just forgotten that. Sorry
about that.

> 
> A small optimization might be to make override a static const, so that
> it doesn't have to be composed every time.

Can do.

> 
>> diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c
> [...]
>> +static void action_submit_complete(struct host1x_waitlist *waiter)
>> +{
>> +     struct host1x_channel *channel = waiter->data;
>> +     int nr_completed = waiter->count;
> 
> No need for this variable.

I'm using it for tracing in a follow-up patch. It can be used in traces
for checking the queue length at each point of time.

> 
>> diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
> [...]
>> +#ifdef CONFIG_TEGRA_HOST1X_FIREWALL
>> +static int host1x_firewall = 1;
>> +#else
>> +static int host1x_firewall;
>> +#endif
> 
> You could use IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) in the code,
> which will have the nice side-effect of compiling code out if the symbol
> isn't selected.

Sure, I just wasn't aware of IS_ENABLED.

> 
>> +struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
>> +             u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks)
> 
> Maybe make the parameters unsigned int instead of u32?

I'll check this, but we're getting them from user space, and that API
has a fixed length field. That's why I'm carrying that type over.

> 
>> +{
>> +     struct host1x_job *job = NULL;
>> +     int num_unpins = num_cmdbufs + num_relocs;
> 
> unsigned int?

Sounds good.

> 
>> +     s64 total;
> 
> This doesn't need to be signed, u64 will be good enough. None of the
> terms in the expression that assigns to total can be negative.

True, will change.

> 
>> +     void *mem;
>> +
>> +     /* Check that we're not going to overflow */
>> +     total = sizeof(struct host1x_job)
>> +                     + num_relocs * sizeof(struct host1x_reloc)
>> +                     + num_unpins * sizeof(struct host1x_job_unpin_data)
>> +                     + num_waitchks * sizeof(struct host1x_waitchk)
>> +                     + num_cmdbufs * sizeof(struct host1x_job_gather)
>> +                     + num_unpins * sizeof(dma_addr_t)
>> +                     + num_unpins * sizeof(u32 *);
> 
> "+"s at the end of the preceding lines.

Ok.

> 
>> +     if (total > ULONG_MAX)
>> +             return NULL;
>> +
>> +     mem = job = kzalloc(total, GFP_KERNEL);
>> +     if (!job)
>> +             return NULL;
>> +
>> +     kref_init(&job->ref);
>> +     job->ch = ch;
>> +
>> +     /* First init state to zero */
>> +
>> +     /*
>> +      * Redistribute memory to the structs.
>> +      * Overflows and negative conditions have
>> +      * already been checked in job_alloc().
>> +      */
> 
> The last two lines don't really apply here. The checks are in this same
> function and they check only for overflow, not negative conditions,
> which can't happen anyway since the counts are all unsigned.

Actually overflow and negative in this case meant the same thing. Will
fix comment.

> 
>> +void host1x_job_get(struct host1x_job *job)
>> +{
>> +     kref_get(&job->ref);
>> +}
> 
> I think it is common for *_get() functions to return a pointer to the
> referenced object.

Ok, can do.

> 
>> +void host1x_job_add_gather(struct host1x_job *job,
>> +             u32 mem_id, u32 words, u32 offset)
>> +{
>> +     struct host1x_job_gather *cur_gather =
>> +                     &job->gathers[job->num_gathers];
> 
> Should this check for overflow?

As defensive measure, could do, but this is not exploitable.

> 
>> +/*
>> + * Check driver supplied waitchk structs for syncpt thresholds
>> + * that have already been satisfied and NULL the comparison (to
>> + * avoid a wrap condition in the HW).
>> + */
>> +static int do_waitchks(struct host1x_job *job, struct host1x *host,
>> +             u32 patch_mem, struct mem_handle *h)
>> +{
>> +     int i;
>> +
>> +     /* compare syncpt vs wait threshold */
>> +     for (i = 0; i < job->num_waitchk; i++) {
>> +             struct host1x_waitchk *wait = &job->waitchk[i];
>> +             struct host1x_syncpt *sp =
>> +                     host1x_syncpt_get(host, wait->syncpt_id);
>> +
>> +             /* validate syncpt id */
>> +             if (wait->syncpt_id > host1x_syncpt_nb_pts(host))
>> +                     continue;
>> +
>> +             /* skip all other gathers */
>> +             if (patch_mem != wait->mem)
>> +                     continue;
>> +
>> +             trace_host1x_syncpt_wait_check(wait->mem, wait->offset,
>> +                             wait->syncpt_id, wait->thresh,
>> +                             host1x_syncpt_read_min(sp));
>> +             if (host1x_syncpt_is_expired(
>> +                     host1x_syncpt_get(host, wait->syncpt_id),
>> +                     wait->thresh)) {
> 
> You already have the sp variable that you could use here to make it more
> readable.

True, will use that.

> 
>> +                     struct host1x_syncpt *sp =
>> +                             host1x_syncpt_get(host, wait->syncpt_id);
> 
> And you don't need this then, since you already have sp pointing to the
> same syncpoint.

Ok.

> 
>> +                     void *patch_addr = NULL;
>> +
>> +                     /*
>> +                      * NULL an already satisfied WAIT_SYNCPT host method,
>> +                      * by patching its args in the command stream. The
>> +                      * method data is changed to reference a reserved
>> +                      * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
>> +                      * syncpt with a matching threshold value of 0, so
>> +                      * is guaranteed to be popped by the host HW.
>> +                      */
>> +                     dev_dbg(&host->dev->dev,
>> +                         "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
>> +                         wait->syncpt_id, sp->name, wait->thresh,
>> +                         host1x_syncpt_read_min(sp));
>> +
>> +                     /* patch the wait */
>> +                     patch_addr = host1x_memmgr_kmap(h,
>> +                                     wait->offset >> PAGE_SHIFT);
>> +                     if (patch_addr) {
>> +                             host1x_syncpt_patch_wait(sp,
>> +                                     (patch_addr +
>> +                                             (wait->offset & ~PAGE_MASK)));
>> +                             host1x_memmgr_kunmap(h,
>> +                                             wait->offset >> PAGE_SHIFT,
>> +                                             patch_addr);
>> +                     } else {
>> +                             pr_err("Couldn't map cmdbuf for wait check\n");
>> +                     }
> 
> This is a case where splitting out a small function would actually be
> useful to make the code more readable since you can remove two levels of
> indentation. You can just pass in the handle and the offset, let it do
> the actual patching. Maybe
> 
>         host1x_syncpt_patch_offset(sp, h, wait->offset);
> 
> ?

Sounds good, for readability point of view.

> 
>> +             }
>> +
>> +             wait->mem = 0;
>> +     }
>> +     return 0;
>> +}
>> +
>> +
> 
> There's a gratuitous blank line.

Will remove.

> 
>> +static int pin_job_mem(struct host1x_job *job)
>> +{
>> +     int i;
>> +     int count = 0;
>> +     int result;
> 
> These (and the return value) can all be unsigned int.

True. will fix.

> 
>> +static int do_relocs(struct host1x_job *job,
>> +             u32 cmdbuf_mem, struct mem_handle *h)
>> +{
>> +     int i = 0;
> 
> This can also be unsigned int.

True, will fix.

> 
>> +     int last_page = -1;
> 
> And this should match the type of cmdbuf_offset (u32). You can initially
> set it to something like ~0 to make sure it doesn't match any valid
> offset.

You're right, will change.

> 
>> +     void *cmdbuf_page_addr = NULL;
>> +
>> +     /* pin & patch the relocs for one gather */
>> +     while (i < job->num_relocs) {
>> +             struct host1x_reloc *reloc = &job->relocarray[i];
>> +
>> +             /* skip all other gathers */
>> +             if (cmdbuf_mem != reloc->cmdbuf_mem) {
>> +                     i++;
>> +                     continue;
>> +             }
>> +
>> +             if (last_page != reloc->cmdbuf_offset >> PAGE_SHIFT) {
>> +                     if (cmdbuf_page_addr)
>> +                             host1x_memmgr_kunmap(h,
>> +                                             last_page, cmdbuf_page_addr);
>> +
>> +                     cmdbuf_page_addr = host1x_memmgr_kmap(h,
>> +                                     reloc->cmdbuf_offset >> PAGE_SHIFT);
>> +                     last_page = reloc->cmdbuf_offset >> PAGE_SHIFT;
>> +
>> +                     if (unlikely(!cmdbuf_page_addr)) {
>> +                             pr_err("Couldn't map cmdbuf for relocation\n");
>> +                             return -ENOMEM;
>> +                     }
>> +             }
>> +
>> +             __raw_writel(
>> +                     (job->reloc_addr_phys[i] +
>> +                             reloc->target_offset) >> reloc->shift,
>> +                     (cmdbuf_page_addr +
>> +                             (reloc->cmdbuf_offset & ~PAGE_MASK)));
> 
> Again, wrong __raw_writel() usage.

Yes, sorry, I forgot about this.

> 
>> +
>> +             /* remove completed reloc from the job */
>> +             if (i != job->num_relocs - 1) {
>> +                     struct host1x_reloc *reloc_last =
>> +                             &job->relocarray[job->num_relocs - 1];
>> +                     reloc->cmdbuf_mem       = reloc_last->cmdbuf_mem;
>> +                     reloc->cmdbuf_offset    = reloc_last->cmdbuf_offset;
>> +                     reloc->target           = reloc_last->target;
>> +                     reloc->target_offset    = reloc_last->target_offset;
>> +                     reloc->shift            = reloc_last->shift;
>> +                     job->reloc_addr_phys[i] =
>> +                             job->reloc_addr_phys[job->num_relocs - 1];
>> +                     job->num_relocs--;
>> +             } else {
>> +                     break;
>> +             }
>> +     }
>> +
>> +     if (cmdbuf_page_addr)
>> +             host1x_memmgr_kunmap(h, last_page, cmdbuf_page_addr);
>> +
>> +     return 0;
>> +}
> 
> Also the algorithm seems a bit strange and hard to follow. Instead of
> removing relocs from the job, replacing them with the last entry and
> decrementing job->num_relocs, how much is the penalty for always
> iterating over all relocs? This is one of the other cases where I'd
> argue that simplicity is key. Furthermore you need to copy quite a bit
> of data to replace the completed relocs, so I'm not sure it buys you
> much.
> 
> It could always be optimized later on by just setting a bit in the reloc
> to mark it as completed, or keep a bitmask of completed relocations or
> whatever.

This was done in a big optimization patch, but we'll check if we could
remove this. Previously we just set cmdbuf_mem for the completed reloc
to 0, and that should work in this case.

> 
>> +static int check_reloc(struct host1x_reloc *reloc,
>> +             u32 cmdbuf_id, int offset)
> 
> offset can be unsigned int.

Yep, will change.

> 
>> +{
>> +     int err = 0;
>> +     if (reloc->cmdbuf_mem != cmdbuf_id
>> +                     || reloc->cmdbuf_offset != offset * sizeof(u32))
>> +             err = -EINVAL;
>> +
>> +     return err;
>> +}
> 
> More canonically:
> 
>         offset *= sizeof(u32);
> 
>         if (reloc->cmdbuf_mem != cmdbuf_id || reloc->cmdbuf_offset != offset)
>                 return -EINVAL;
> 
>         return 0;

Ok, both do the same thing, so can change.

> 
>> +
>> +static int check_mask(struct host1x_job *job,
>> +             struct platform_device *pdev,
>> +             struct host1x_reloc **reloc, int *num_relocs,
>> +             u32 cmdbuf_id, int *offset,
>> +             u32 *words, u32 class, u32 reg, u32 mask)
> 
> num_relocs and offset can be unsigned int *.
> 
> Same comment for the other check_*() functions. That said I think the
> code would become a lot more readable if you were to wrap all of these
> parameters into a structure, say host1x_firewall, and just pass that
> into the functions.

True, might improve performance, too. We'll do that.

> 
>> +static inline int copy_gathers(struct host1x_job *job,
>> +             struct platform_device *pdev)
> 
> struct device *

Will do.

> 
>> +{
>> +     size_t size = 0;
>> +     size_t offset = 0;
>> +     int i;
>> +
>> +     for (i = 0; i < job->num_gathers; i++) {
>> +             struct host1x_job_gather *g = &job->gathers[i];
>> +             size += g->words * sizeof(u32);
>> +     }
>> +
>> +     job->gather_copy_mapped = dma_alloc_writecombine(&pdev->dev,
>> +                     size, &job->gather_copy, GFP_KERNEL);
>> +     if (IS_ERR(job->gather_copy_mapped)) {
> 
> dma_alloc_writecombine() returns NULL on failure, so this check is
> wrong.

Oops, will fix.

> 
>> +             int err = PTR_ERR(job->gather_copy_mapped);
>> +             job->gather_copy_mapped = NULL;
>> +             return err;
>> +     }
>> +
>> +     job->gather_copy_size = size;
>> +
>> +     for (i = 0; i < job->num_gathers; i++) {
>> +             struct host1x_job_gather *g = &job->gathers[i];
>> +             void *gather = host1x_memmgr_mmap(g->ref);
>> +             memcpy(job->gather_copy_mapped + offset,
>> +                             gather + g->offset,
>> +                             g->words * sizeof(u32));
>> +
>> +             g->mem_base = job->gather_copy;
>> +             g->offset = offset;
>> +             g->mem_id = 0;
>> +             g->ref = 0;
>> +
>> +             host1x_memmgr_munmap(g->ref, gather);
>> +             offset += g->words * sizeof(u32);
>> +     }
>> +
>> +     return 0;
>> +}
> 
> I wonder, where's this DMA buffer actually used? I can't find any use
> between this copy and the corresponding dma_free_writecombine() call.

We replace the gathers in host1x_job with the ones we allocate here, so
they are used when pushing the gather's to hardware.

This is done so that user space cannot tamper with the gathers once
they've been checked by firewall.

> 
>> +int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev)
>> +{
>> +     int err = 0, i = 0, j = 0;
> 
> No need to initialize these here. i and j can also be unsigned.

Ok.

> 
>> +     struct host1x *host = host1x_get_host(pdev);
>> +     DECLARE_BITMAP(waitchk_mask, host1x_syncpt_nb_pts(host));
>> +
>> +     bitmap_zero(waitchk_mask, host1x_syncpt_nb_pts(host));
>> +     for (i = 0; i < job->num_waitchk; i++) {
>> +             u32 syncpt_id = job->waitchk[i].syncpt_id;
>> +             if (syncpt_id < host1x_syncpt_nb_pts(host))
>> +                     set_bit(syncpt_id, waitchk_mask);
>> +     }
>> +
>> +     /* get current syncpt values for waitchk */
>> +     for_each_set_bit(i, &waitchk_mask[0], sizeof(waitchk_mask))
>> +             host1x_syncpt_load_min(host->syncpt + i);
>> +
>> +     /* pin memory */
>> +     err = pin_job_mem(job);
>> +     if (err <= 0)
>> +             goto out;
> 
> pin_job_mem() never returns negative.

Ok, will fix.

> 
>> +     /* patch gathers */
>> +     for (i = 0; i < job->num_gathers; i++) {
>> +             struct host1x_job_gather *g = &job->gathers[i];
>> +
>> +             /* process each gather mem only once */
>> +             if (!g->ref) {
>> +                     g->ref = host1x_memmgr_get(g->mem_id, job->ch->dev);
>> +                     if (IS_ERR(g->ref)) {
> 
> host1x_memmgr_get() also seems to return NULL on error.

I think I'll change memmgr_get() to return an ERR_PTR().

> 
>> +                             err = PTR_ERR(g->ref);
>> +                             g->ref = NULL;
>> +                             break;
>> +                     }
>> +
>> +                     g->mem_base = job->gather_addr_phys[i];
>> +
>> +                     for (j = 0; j < job->num_gathers; j++) {
>> +                             struct host1x_job_gather *tmp =
>> +                                     &job->gathers[j];
>> +                             if (!tmp->ref && tmp->mem_id == g->mem_id) {
>> +                                     tmp->ref = g->ref;
>> +                                     tmp->mem_base = g->mem_base;
>> +                             }
>> +                     }
>> +                     err = 0;
>> +                     if (host1x_firewall)
> 
> if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL))

Will fix.

> 
>> +                             err = validate(job, pdev, g);
>> +                     if (err)
>> +                             dev_err(&pdev->dev,
>> +                                     "Job validate returned %d\n", err);
>> +                     if (!err)
>> +                             err = do_relocs(job, g->mem_id,  g->ref);
>> +                     if (!err)
>> +                             err = do_waitchks(job, host,
>> +                                             g->mem_id, g->ref);
>> +                     host1x_memmgr_put(g->ref);
>> +                     if (err)
>> +                             break;
>> +             }
>> +     }
>> +
>> +     if (host1x_firewall && !err) {
> 
> And here.

Here, too.

> 
>> +/*
>> + * Debug routine used to dump job entries
>> + */
>> +void host1x_job_dump(struct device *dev, struct host1x_job *job)
>> +{
>> +     dev_dbg(dev, "    SYNCPT_ID   %d\n",
>> +             job->syncpt_id);
>> +     dev_dbg(dev, "    SYNCPT_VAL  %d\n",
>> +             job->syncpt_end);
>> +     dev_dbg(dev, "    FIRST_GET   0x%x\n",
>> +             job->first_get);
>> +     dev_dbg(dev, "    TIMEOUT     %d\n",
>> +             job->timeout);
>> +     dev_dbg(dev, "    NUM_SLOTS   %d\n",
>> +             job->num_slots);
>> +     dev_dbg(dev, "    NUM_HANDLES %d\n",
>> +             job->num_unpins);
>> +}
> 
> These don't need to be wrapped.

True, will merge lines.

> 
>> diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h
> [...]
>> +struct host1x_job_gather {
>> +     u32 words;
>> +     dma_addr_t mem_base;
>> +     u32 mem_id;
>> +     int offset;
>> +     struct mem_handle *ref;
>> +};
>> +
>> +struct host1x_cmdbuf {
>> +     __u32 mem;
>> +     __u32 offset;
>> +     __u32 words;
>> +     __u32 pad;
>> +};
>> +
>> +struct host1x_reloc {
>> +     __u32 cmdbuf_mem;
>> +     __u32 cmdbuf_offset;
>> +     __u32 target;
>> +     __u32 target_offset;
>> +     __u32 shift;
>> +     __u32 pad;
>> +};
>> +
>> +struct host1x_waitchk {
>> +     __u32 mem;
>> +     __u32 offset;
>> +     __u32 syncpt_id;
>> +     __u32 thresh;
>> +};
> 
> None of these are shared with userspace, so they shouldn't take the
> __u32 types, but the regular u32 ones.

True. We copy stuff from user space types to these, but we don't use
these directly in user space API.

> 
>> +/*
>> + * Each submit is tracked as a host1x_job.
>> + */
>> +struct host1x_job {
>> +     /* When refcount goes to zero, job can be freed */
>> +     struct kref ref;
>> +
>> +     /* List entry */
>> +     struct list_head list;
>> +
>> +     /* Channel where job is submitted to */
>> +     struct host1x_channel *ch;
> 
> Maybe write it out as "channel"?

Ok.

> 
>> +
>> +     int clientid;
> 
> Subsequent patches assign u32 to this field, so maybe the type should be
> changed here. And maybe leave out the id suffix. It doesn't really add
> any information.

Good catch, will change.

> 
>> +     /* Gathers and their memory */
>> +     struct host1x_job_gather *gathers;
>> +     int num_gathers;
> 
> unsigned int

Will change.

> 
>> +     /* Wait checks to be processed at submit time */
>> +     struct host1x_waitchk *waitchk;
>> +     int num_waitchk;
> 
> unsigned int

Ok.

> 
>> +     u32 waitchk_mask;
> 
> This might need to be changed to a bitfield once future Tegra versions
> start supporting more than 32 syncpoints.

True, I think we'll need to get this changed already now. We actually
drop the usage of waitchk_mask in downstream because of this. It's
basically just an optimization that doesn't gain any real world speed
advantage.

> 
>> +     /* Array of handles to be pinned & unpinned */
>> +     struct host1x_reloc *relocarray;
>> +     int num_relocs;
> 
> unsigned int

Will change.

> 
>> +     struct host1x_job_unpin_data *unpins;
>> +     int num_unpins;
> 
> unsigned int

Will change.

> 
>> +     dma_addr_t *addr_phys;
>> +     dma_addr_t *gather_addr_phys;
>> +     dma_addr_t *reloc_addr_phys;
>> +
>> +     /* Sync point id, number of increments and end related to the submit */
>> +     u32 syncpt_id;
>> +     u32 syncpt_incrs;
>> +     u32 syncpt_end;
>> +
>> +     /* Maximum time to wait for this job */
>> +     int timeout;
> 
> unsigned int. I think we discussed this already in a slightly different
> context in patch 2.

Sure, will change. I think timeouts were discussed wrt syncpt wait timeout.

> 
>> +     /* Null kickoff prevents submit from being sent to hardware */
>> +     bool null_kickoff;
> 
> I don't think this is used anywhere.

True, we can remove this as we haven't posted the code for null kickoff.

> 
>> +     /* Index and number of slots used in the push buffer */
>> +     int first_get;
>> +     int num_slots;
> 
> unsigned int

Ok.

> 
>> +
>> +     /* Copy of gathers */
>> +     size_t gather_copy_size;
>> +     dma_addr_t gather_copy;
>> +     u8 *gather_copy_mapped;
> 
> Are these really needed? They don't seem to be used anywhere except to
> store a copy and free that copy sometime later.

They're needed so that kernel can take a copy of the gathers so that
user space cannot tamper with them post-submit.

> 
>> +
>> +     /* Temporary space for unpin ids */
>> +     long unsigned int *pin_ids;
> 
> unsigned long

Will change.

> 
>> +     /* Check if register is marked as an address reg */
>> +     int (*is_addr_reg)(struct platform_device *dev, u32 reg, u32 class);
> 
> is_addr_reg() sounds a bit unusual. Maybe match this to the name of the
> main firewall routine, validate()?

The point of this op is to just tell if a register for a class is
pointing to a buffer. validate then uses this information. But both
answers (yes/no) and both types of registers are still valid, so
validate() wouldn't be the proper name.

validation is then done by checking that there's a reloc corresponding
to each register write to a register that can hold an address.

> 
>> +     /* Request a SETCLASS to this class */
>> +     u32 class;
>> +
>> +     /* Add a channel wait for previous ops to complete */
>> +     u32 serialize;
> 
> This is used in code as a boolean. Why does it need to be 32 bits?

No need, will change to bool.

> 
>> diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
> [...]
>> +struct mem_handle;
>> +struct platform_device;
>> +
>> +struct host1x_job_unpin_data {
>> +     struct mem_handle *h;
>> +     struct sg_table *mem;
>> +};
>> +
>> +enum mem_mgr_flag {
>> +     mem_mgr_flag_uncacheable = 0,
>> +     mem_mgr_flag_write_combine = 1,
>> +};
> 
> I'd like to see this use a more object-oriented approach and more common
> terminology. All of these handles are essentially buffer objects, so
> maybe something like host1x_bo would be a nice and short name.
> 
> To make this more object-oriented, I propose something like:
> 
>         struct host1x_bo_ops {
>                 int (*alloc)(struct host1x_bo *bo, size_t size, unsigned long align,
>                              unsigned long flags);
>                 int (*free)(struct host1x_bo *bo);
>                 ...
>         };
> 
>         struct host1x_bo {
>                 const struct host1x_bo_ops *ops;
>         };
> 
>         struct host1x_cma_bo {
>                 struct host1x_bo base;
>                 struct drm_gem_cma_object *obj;
>         };
> 
>         static inline struct host1x_cma_bo *to_host1x_cma_bo(struct host1x_bo *bo)
>         {
>                 return container_of(bo, struct host1x_cma_bo, base);
>         }
> 
>         static inline int host1x_bo_alloc(struct host1x_bo *bo, size_t size,
>                                           unsigned long align, unsigned long flags)
>         {
>                 return bo->ops->alloc(bo, size, align, flags);
>         }
> 
>         ...
> 
> That should be easy to extend with a new type of BO once the IOMMU-based
> allocator is ready. And as I said it is much closer in terminology to
> what other drivers do.

One complexity is that we're using the same type for communicating with
user space. Each buffer carries with it a flag indicating its allocator.
We might be able to model the internal structure to be more like what
you propose, but for the API we still need the flag.

> 
>> diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
>> index b46d044..255a3a3 100644
>> --- a/drivers/gpu/host1x/syncpt.h
>> +++ b/drivers/gpu/host1x/syncpt.h
>> @@ -26,6 +26,7 @@
>>  struct host1x;
>>
>>  #define NVSYNCPT_INVALID                     (-1)
>> +#define NVSYNCPT_GRAPHICS_HOST                       0
> 
> I think these should match other naming, so:
> 
>         #define HOST1X_SYNCPT_INVALID   -1
>         #define HOST1X_SYNCPT_HOST1X     0

Sure, sounds good.

> There are a few more occurrences where platform_device is used but I
> haven't commented on them. I don't think any of them won't work with
> just a struct device instead. Also I may not have caught all of the
> places where you should rather be using unsigned int instead of int,
> so you might want to look out for some of those.

Yes, we'll go through the code with this in mind.

> Generally I very much like where this is going. Are there any plans to
> move the userspace binary driver to this interface at some point so we
> can more actively test it? Also, is anything else blocking adding a
> gr3d device similar to gr2d from this patch series?

We're doing this in stages. I don't want to change the code base and
APIs both in one step, because big moves in both user and kernel space
tend to fail easily.

First we upstream code, and try to get feature parity. Then we
re-engineer our downstream driver delta on top of the upstream code, but
in this phase we keep the downstream kernel API.

In the next step, we'll start moving to the DRM APIs.

So, there's quite a few steps still before we're on DRM APIs, but we'll
reach it at some point. :-)

3D driver should work on top of this. I don't see anything blocking that.

Terje
Thierry Reding Feb. 27, 2013, 8:56 a.m. UTC | #3
On Tue, Feb 26, 2013 at 11:48:18AM +0200, Terje Bergström wrote:
> On 25.02.2013 17:24, Thierry Reding wrote:
> > On Tue, Jan 15, 2013 at 01:43:59PM +0200, Terje Bergstrom wrote:
[...]
> >> +/*
> >> + * Start timer for a buffer submition that has completed yet.
> > 
> > "submission". And I don't understand the "that has completed yet" part.
> 
> It should become "Start timer that tracks the time spent by the job".

Yes, that's a lot better.

> >> +     if (list_empty(&cdma->sync_queue) &&
> >> +                             cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
> >> +                     signal = true;
> > 
> > This looks funny, maybe:
> > 
> >         if (cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY &&
> >             list_empty(&cdma->sync_queue))
> >                 signal = true;
> > 
> > ?
> 
> Indenting at least is strange. I don't have a preference for the
> ordering of conditions, so if you like the latter order, we can just use
> that.

I just happen to find it easier to read that way. If you want to keep
the ordering that's fine, but the indentation needs to be fixed.

> >> +{
> >> +     u32 get_restart;
> > 
> > Maybe just call this "restart" or "restart_addr". get_restart sounds
> > like a function name.
> 
> Ok, how about "restart_dmaget_addr"? That indicates what we're doing
> with the restart address.

Sounds good.

> >> +     list_for_each_entry(job, &cdma->sync_queue, list) {
> >> +             if (syncpt_val < job->syncpt_end)
> >> +                     break;
> >> +
> >> +             host1x_job_dump(&dev->dev, job);
> >> +     }
> > 
> > That's potentially a lot of debug output. I wonder if it might make
> > sense to control parts of this via a module parameter. Then again, if
> > somebody really needs to debug this, maybe they really want *all* the
> > information.
> 
> host1x_job_dump() uses dev_dbg(), so it only dumps a lot if DEBUG has
> been defined in that file.

Okay, let's leave it like that then.

> >> +/*
> >> + * Destroy a cdma
> >> + */
> >> +void host1x_cdma_deinit(struct host1x_cdma *cdma)
> >> +{
> >> +     struct push_buffer *pb = &cdma->push_buffer;
> >> +     struct host1x *host1x = cdma_to_host1x(cdma);
> >> +
> >> +     if (cdma->running) {
> >> +             pr_warn("%s: CDMA still running\n",
> >> +                             __func__);
> >> +     } else {
> >> +             host1x->cdma_pb_op.destroy(pb);
> >> +             host1x->cdma_op.timeout_destroy(cdma);
> >> +     }
> >> +}
> > 
> > There's no way to recover from the situation where a cdma is still
> > running. Can this not return an error code (-EBUSY?) if the cdma can't
> > be destroyed?
> 
> It's called from close(), which cannot return an error code. It's
> actually more of a power optimization. The effect is that if there are
> no users for channel, we'll just not free up the push buffer.
> 
> I think the proper fix would actually be to check in host1x_cdma_init()
> if push buffer is already allocated and cdma->running. In that case we
> could skip most of initialization.

Yes, in that case it might be useful to do this. I still think it's
worth to return an error code to the caller, even if it can't be
propagated. That way the caller at least has the possibility to react.

I'm still not quite sure I understand the necessity for this, though.
Maybe you can give an example of when this will actually happen?

> >> +/*
> >> + * cdma
> >> + *
> >> + * This is in charge of a host command DMA channel.
> >> + * Sends ops to a push buffer, and takes responsibility for unpinning
> >> + * (& possibly freeing) of memory after those ops have completed.
> >> + * Producer:
> >> + *   begin
> >> + *           push - send ops to the push buffer
> >> + *   end - start command DMA and enqueue handles to be unpinned
> >> + * Consumer:
> >> + *   update - call to update sync queue and push buffer, unpin memory
> >> + */
> > 
> > I find the name to be a bit confusing. For some reason I automatically
> > think of GSM when I read CDMA. This really is more of a job queue, so
> > maybe calling it host1x_job_queue might be more appropriate. But I've
> > already requested a lot of things to be renamed, so I think I can live
> > with this being called CDMA if you don't want to change it.
> > 
> > Alternatively all of these could be moved to the struct host1x_channel
> > given that there's only one of each of the push_buffer, buffer_timeout
> > and host1x_cma objects per channel.
> 
> I did consider merging those two at a time. That should work, as they
> both deal with channels essentially. I also saw that the resulting file
> and data structures became quite large, so I have so far preferred to
> keep them separate.
> 
> This way I can keep the "higher level" stuff (inserting setclass,
> serializing, allocating sync point ranges, etc) in one file and lower
> level stuff (write to hardware, deal with push buffer pointers, etc) in
> another.

Alright. I can live with that.

> >> +int host1x_channel_submit(struct host1x_job *job)
> >> +{
> >> +     return host1x_get_host(job->ch->dev)->channel_op.submit(job);
> >> +}
> > 
> > I'd expect a function named host1x_channel_submit() to take a struct
> > host1x_channel *. Should this perhaps be called host1x_job_submit()?
> 
> It calls into channel code directly, and the underlying op also just
> takes a job. We could add channel as a parameter, and not pass it in
> host1x_job_alloc(). but we actually need the channel data already in
> host1x_job_pin(), which comes before submit. We need it so that we pin
> the buffer to correct engine.

That's all fine. My point was that this operates on a job object, so I'd
find it more intuitive if the function name reflected that. There's
nothing wrong with submitting a job without explicitly specifying the
channel if it is tied to one channel anyway.

host1x_channel_submit() would imply "submit channel", which doesn't make
sense, so the next best alternative is "submit job to channel", but that
isn't reflected in the parameters. So host1x_job_submit() fits pretty
well. There's no reason why it has to be prefixed host1x_channel_*,
right?

> >> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
> >> +{
> >> +     struct host1x_channel *ch = NULL;
> >> +     struct host1x *host1x = host1x_get_host(pdev);
> >> +     int chindex;
> >> +     int max_channels = host1x->info.nb_channels;
> >> +     int err;
> >> +
> >> +     mutex_lock(&host1x->chlist_mutex);
> >> +
> >> +     chindex = host1x->allocated_channels;
> >> +     if (chindex > max_channels)
> >> +             goto fail;
> >> +
> >> +     ch = kzalloc(sizeof(*ch), GFP_KERNEL);
> >> +     if (ch == NULL)
> >> +             goto fail;
> >> +
> >> +     /* Link platform_device to host1x_channel */
> >> +     err = host1x->channel_op.init(ch, host1x, chindex);
> >> +     if (err < 0)
> >> +             goto fail;
> >> +
> >> +     ch->dev = pdev;
> >> +
> >> +     /* Add to channel list */
> >> +     list_add_tail(&ch->list, &host1x->chlist.list);
> >> +
> >> +     host1x->allocated_channels++;
> >> +
> >> +     mutex_unlock(&host1x->chlist_mutex);
> >> +     return ch;
> >> +
> >> +fail:
> >> +     dev_err(&pdev->dev, "failed to init channel\n");
> >> +     kfree(ch);
> >> +     mutex_unlock(&host1x->chlist_mutex);
> >> +     return NULL;
> >> +}
> > 
> > I think the critical section could be shorter here. It's probably not
> > worth the extra trouble, though, given that channels are not often
> > allocated.
> 
> Yeah, boot time isn't measured in microseconds. :-) But, if we just make
> allocated_channels an atomic, we should be able to drop chlist_mutex
> altogether and it could simplify the code.

You still need to protect the list from concurrent modification.

> >> +/* channel list operations */
> >> +void host1x_channel_list_init(struct host1x *);
> >> +void host1x_channel_for_all(struct host1x *, void *data,
> >> +     int (*fptr)(struct host1x_channel *ch, void *fdata));
> >> +
> >> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev);
> >> +void host1x_channel_free(struct host1x_channel *ch);
> > 
> > Is it a good idea to make host1x_channel_free() publicly available?
> > Shouldn't the host1x_channel_alloc()/host1x_channel_request() return a
> > host1x_channel with a reference count of 1 and everybody release their
> > reference using host1x_channel_put() to make sure the channel is freed
> > only after the last reference disappears?
> > 
> > Otherwise whoever calls host1x_channel_free() will confuse everybody
> > else that's still keeping a reference.
> 
> The difference is that _put and _get are called to indicate how many
> user space processes there are for the channel. Even if there are no
> processes, we won't free the channel structure - we just freeze the channel.
> 
> _alloc and _free are different in that they actually create the channel
> structs and delete them and they follow the lifecycle of the driver.
> Perhaps we should figure new naming, but refcounting and alloc/free
> cannot be merged here.

I understand. Perhaps better names would be host1x_channel_setup() and
host1x_channel_teardown()?

> >> +{
> >> +     struct drm_gem_cma_object *obj = to_cma_obj((void *)id);
> >> +     struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
> >> +
> >> +     mutex_lock(struct_mutex);
> >> +     drm_gem_object_reference(&obj->base);
> >> +     mutex_unlock(struct_mutex);
> > 
> > I think it's more customary to obtain a pointer to struct drm_device and
> > then use mutex_{lock,unlock}(&drm->struct_mutex). Or you could just use
> > drm_gem_object_reference_unlocked(&obj->base) instead. Which doesn't
> > exist yet, apparently. But it could be added.
> 
> I think we could take the former path - just refer to mutex in a
> different way.

You'll get extra points if you add the function =). The documentation in
Documentation/DocBook/drm.tmpl says that it exists, but it doesn't, so
you'd even be fixing a bug along the way.

> >> +int host1x_cma_pin_array_ids(struct platform_device *dev,
> >> +             long unsigned *ids,
> >> +             long unsigned id_type_mask,
> >> +             long unsigned id_type,
> >> +             u32 count,
> >> +             struct host1x_job_unpin_data *unpin_data,
> >> +             dma_addr_t *phys_addr)
> > 
> > struct device * and unsigned long please. count can also doesn't need to
> > be a sized type. unsigned int will do just fine. The return value can
> > also be unsigned int if you don't expect to return any error conditions.
> 
> I think we'll need to check these. ids probably needs to be a u32 *, and
> id_type_mask and id_type should be u32. They come like that from user space.

Okay. My main point was that it's more usual to use "unsigned long" than
"long unsigned":

	linux.git $ git grep -n 'long unsigned' | wc -l
	72
	linux.git $ git grep -n 'unsigned long' | wc -l
	106575

Also the more I think about it, the more I have doubts that passing
around IDs like this (or using ID types and masks) is the right thing to
do. I'll get back to that later.

> > 
> >> +     int allocated_channels;
> > 
> > unsigned int? And maybe just "num_channels"?
> 
> num_channels could be thought as "number of available channels", so I'd
> like to use num_allocated_channels here.

Okay.

> >> diff --git a/drivers/gpu/host1x/host1x.h b/drivers/gpu/host1x/host1x.h
> > [...]
> >> +enum host1x_class {
> >> +     NV_HOST1X_CLASS_ID              = 0x1,
> >> +     NV_GRAPHICS_2D_CLASS_ID         = 0x51,
> > 
> > This entry belongs in a later patch, right? And I find it convenient if
> > enumeration constants start with the enum name as prefix. Furthermore
> > it'd be nice to reuse the hardware module names, like so:
> > 
> >         enum host1x_class {
> >                 HOST1X_CLASS_HOST1X,
> >                 HOST1X_CLASS_GR2D,
> >                 HOST1X_CLASS_GR3D,
> >         };
> 
> The naming sounds good. We already use HOST1X_CLASS_HOST1X in code to
> insert a wait. If you'd prefer, we can move the definition of
> HOST1X_CLASS_GR2D to the later patch.

Yes, it's better to introduce it in the patch that first uses it.

> >> diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
> > [...]
> >> +#include <linux/slab.h>
> >> +#include <linux/scatterlist.h>
> >> +#include <linux/dma-mapping.h>
> >> +#include "cdma.h"
> >> +#include "channel.h"
> >> +#include "dev.h"
> >> +#include "memmgr.h"
> >> +
> >> +#include "cdma_hw.h"
> >> +
> >> +static inline u32 host1x_channel_dmactrl(int stop, int get_rst, int init_get)
> >> +{
> >> +     return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop)
> >> +             | HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst)
> >> +             | HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
> > 
> > I think it is more customary to put the | at the end of the preceding
> > line:
> > 
> >         return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop) |
> >                HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst) |
> >                HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
> > 
> > Also since these are all single bits, I'd prefer if you could drop the
> > _F suffix and not make them take a parameter. I think it'd even be
> > better not to have this function at all, but make the intent explicit
> > where the register is written. That is, have each call site set the bits
> > explicitly instead of calling this helper. Having a parameter list such
> > as (true, false, false) or (true, true, true) is confusing since you
> > have to keep looking up the meaning of the parameters.
> 
> The operation that the _F macros do is masking and bit shifting the
> fields correctly. Without that, we'd need to expose several macros to
> mask and shift, and I'd rather just have one macro to take care of that.
> 
> But, we can open code the function to wherever it's used if that's more
> readable.

I wasn't arguing against masking and shifting, but rather in favour of
treating these like normal bit definitions. So instead of passing a
boolean parameter to the macro, you just don't use it if the bit isn't
supposed to be set. And if you want to set the bit you or in the value.

So:

	static inline u32 host1x_channel_dmactrl_dmastop(void)
	{
		return 1 << 0;
	}

	#define HOST1X_CHANNEL_DMACTRL_DMASTOP \
		host1x_channel_dmactrl_dmastop()

> >> +/*
> >> + * Similar to cdma_start(), but rather than starting from an idle
> >> + * state (where DMA GET is set to DMA PUT), on a timeout we restore
> >> + * DMA GET from an explicit value (so DMA may again be pending).
> >> + */
> >> +static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
> >> +{
> >> +     struct host1x *host1x = cdma_to_host1x(cdma);
> >> +     struct host1x_channel *ch = cdma_to_channel(cdma);
> >> +
> >> +     if (cdma->running)
> >> +             return;
> >> +
> >> +     cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
> >> +
> >> +     host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
> >> +             HOST1X_CHANNEL_DMACTRL);
> >> +
> >> +     /* set base, end pointer (all of memory) */
> >> +     host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
> >> +     host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
> > 
> > According to the TRM, writing to HOST1X_CHANNEL_DMASTART will start a
> > DMA transfer on the channel (if DMA_PUT != DMA_GET). Irrespective of
> > that, why set the valid range to all of physical memory? We know the
> > valid range of the push buffer, why not set the limits accordingly?
> 
> That'd make sense. Currently we use the RESTART as the barrier, but
> having hardware check against DMAEND is a good idea, too.

Any reason why DMASTART shouldn't be used to restrict the range as well?

> >> +/*
> >> + * Kick channel DMA into action by writing its PUT offset (if it has changed)
> >> + */
> >> +static void cdma_kick(struct host1x_cdma *cdma)
> >> +{
> >> +     struct host1x *host1x = cdma_to_host1x(cdma);
> >> +     struct host1x_channel *ch = cdma_to_channel(cdma);
> >> +     u32 put;
> >> +
> >> +     put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
> >> +
> >> +     if (put != cdma->last_put) {
> >> +             host1x_ch_writel(ch, put, HOST1X_CHANNEL_DMAPUT);
> >> +             cdma->last_put = put;
> >> +     }
> >> +}
> > 
> > kick() sounds unusual. Maybe flush or commit or something similar would
> > be more accurate.
> 
> We could use flush.

Great.

> >> +     host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
> >> +
> >> +     cdma->torndown = false;
> >> +     cdma_timeout_restart(cdma, getptr);
> >> +}
> > 
> > I find this a bit non-intuitive. We teardown a channel, and when we're
> > done tearing down, the torndown variable is set to false and the channel
> > is actually restarted. Maybe you could explain some more how this works
> > and what its purpose is.
> 
> Actually, teardown_begin freezes the channel, then we manipulate the
> queue, and in the end teardown_end restarts the channel. So these should
> be named freeze and resume. We could even drop the timeout from the
> names of these functions.

Sounds good.

> >> +     /* stop processing to get a clean snapshot */
> >> +     prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
> >> +     cmdproc_stop = prev_cmdproc | BIT(ch->chid);
> >> +     host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
> >> +
> >> +     dev_dbg(&host1x->dev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
> >> +             prev_cmdproc, cmdproc_stop);
> >> +
> >> +     syncpt_val = host1x_syncpt_load_min(host1x->syncpt);
> >> +
> >> +     /* has buffer actually completed? */
> >> +     if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
> >> +             dev_dbg(&host1x->dev->dev,
> >> +                      "cdma_timeout: expired, but buffer had completed\n");
> > 
> > Maybe this should really be a warning?
> 
> Not really - it's actually just a normal state. We got a timeout event,
> but before we process it, it might be that the job manages to complete.
> This can happen, and is not an error case.

Okay, I see. That's fine then.

> >> +     for (i = 0 ; i < job->num_gathers; i++) {
> >> +             struct host1x_job_gather *g = &job->gathers[i];
> >> +             u32 op1 = host1x_opcode_gather(g->words);
> >> +             u32 op2 = g->mem_base + g->offset;
> >> +             host1x_cdma_push_gather(&job->ch->cdma,
> >> +                             job->gathers[i].ref,
> >> +                             job->gathers[i].offset,
> >> +                             op1, op2);
> >> +     }
> >> +}
> > 
> > Perhaps inline this into channel_submit()? I'm not sure how useful it
> > really is to split off smallish functions such as this which aren't
> > reused anywhere else. I don't have any major objection though, so you
> > can keep it separate if you want.
> 
> I split these out because channel_submit() became so long that I
> couldn't understand it anymore. I'd prefer keeping separate just to keep
> myself (semi-)sane.

Okay. =)

> >> diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
> > [...]
> >>  #include "hw/host1x01.h"
> >>  #include "dev.h"
> >> +#include "channel.h"
> >>  #include "hw/host1x01_hardware.h"
> >>
> >> +#include "hw/channel_hw.c"
> >> +#include "hw/cdma_hw.c"
> >>  #include "hw/syncpt_hw.c"
> >>  #include "hw/intr_hw.c"
> >>
> >>  int host1x01_init(struct host1x *host)
> >>  {
> >> +     host->channel_op = host1x_channel_ops;
> >> +     host->cdma_op = host1x_cdma_ops;
> >> +     host->cdma_pb_op = host1x_pushbuffer_ops;
> >>       host->syncpt_op = host1x_syncpt_ops;
> >>       host->intr_op = host1x_intr_ops;
> > 
> > I think I mentioned this before, but I'd prefer not to have the .c files
> > included here, but rather reference the ops structures externally. But I
> > still think that especially CDMA and push buffer ops don't need to be in
> > separate structures since they aren't likely to change with new hardware
> > revisions.
> 
> The C files need to be included here so that they pick up the hardware
> defs for the correct SoC. Pushbuffer is probably something we can
> generalize, but channel registers can change, so they need to be per SoC.

We can do the same using extern variables, can't we? If you're concerned
about the definitions that come from the headers, we can probably make
that work by parameterizing more.

I think we can live with this way for now and clean it up later, though.

> >> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
> > [...]
> >> +#define HOST1X_CHANNEL_DMACTRL_DMASTOP_F(v) \
> >> +     host1x_channel_dmactrl_dmastop_f(v)
> > 
> > I mentioned this elsewhere already, but I think the _F suffix (and _f
> > for that matter) along with the v parameter should go away.
> 
> I'd prefer keeping so that I don't have to use two #defines to replace
> one. That IMO makes the usage harder and more error prone.

That's precisely my point. This actually makes it harder to use. If you
don't want to set the bit, just don't or it in. It's completely
pointless to shift and mask an unset bit.

> >> diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
> > [...]
> > 
> > What does the "uclass" stand for? It seems a bit useless to me.
> 
> It means host1x class, i.e. the host1x registers that can be written to
> from push buffers.

I still don't understand why we need uclass. It seems redundant.

> >> diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c
> > [...]
> >> +static void action_submit_complete(struct host1x_waitlist *waiter)
> >> +{
> >> +     struct host1x_channel *channel = waiter->data;
> >> +     int nr_completed = waiter->count;
> > 
> > No need for this variable.
> 
> I'm using it for tracing in a follow-up patch. It can be used in traces
> for checking the queue length at each point of time.

Any reason why it can't be introduced in the follow-up patch?

> >> +struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
> >> +             u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks)
> > 
> > Maybe make the parameters unsigned int instead of u32?
> 
> I'll check this, but we're getting them from user space, and that API
> has a fixed length field. That's why I'm carrying that type over.

Okay, it isn't that important.

> >> +void host1x_job_add_gather(struct host1x_job *job,
> >> +             u32 mem_id, u32 words, u32 offset)
> >> +{
> >> +     struct host1x_job_gather *cur_gather =
> >> +                     &job->gathers[job->num_gathers];
> > 
> > Should this check for overflow?
> 
> As defensive measure, could do, but this is not exploitable.

Alright then.

> >> +
> >> +             /* remove completed reloc from the job */
> >> +             if (i != job->num_relocs - 1) {
> >> +                     struct host1x_reloc *reloc_last =
> >> +                             &job->relocarray[job->num_relocs - 1];
> >> +                     reloc->cmdbuf_mem       = reloc_last->cmdbuf_mem;
> >> +                     reloc->cmdbuf_offset    = reloc_last->cmdbuf_offset;
> >> +                     reloc->target           = reloc_last->target;
> >> +                     reloc->target_offset    = reloc_last->target_offset;
> >> +                     reloc->shift            = reloc_last->shift;
> >> +                     job->reloc_addr_phys[i] =
> >> +                             job->reloc_addr_phys[job->num_relocs - 1];
> >> +                     job->num_relocs--;
> >> +             } else {
> >> +                     break;
> >> +             }
> >> +     }
> >> +
> >> +     if (cmdbuf_page_addr)
> >> +             host1x_memmgr_kunmap(h, last_page, cmdbuf_page_addr);
> >> +
> >> +     return 0;
> >> +}
> > 
> > Also the algorithm seems a bit strange and hard to follow. Instead of
> > removing relocs from the job, replacing them with the last entry and
> > decrementing job->num_relocs, how much is the penalty for always
> > iterating over all relocs? This is one of the other cases where I'd
> > argue that simplicity is key. Furthermore you need to copy quite a bit
> > of data to replace the completed relocs, so I'm not sure it buys you
> > much.
> > 
> > It could always be optimized later on by just setting a bit in the reloc
> > to mark it as completed, or keep a bitmask of completed relocations or
> > whatever.
> 
> This was done in a big optimization patch, but we'll check if we could
> remove this. Previously we just set cmdbuf_mem for the completed reloc
> to 0, and that should work in this case.

That certainly sounds simpler.

> >> +             int err = PTR_ERR(job->gather_copy_mapped);
> >> +             job->gather_copy_mapped = NULL;
> >> +             return err;
> >> +     }
> >> +
> >> +     job->gather_copy_size = size;
> >> +
> >> +     for (i = 0; i < job->num_gathers; i++) {
> >> +             struct host1x_job_gather *g = &job->gathers[i];
> >> +             void *gather = host1x_memmgr_mmap(g->ref);
> >> +             memcpy(job->gather_copy_mapped + offset,
> >> +                             gather + g->offset,
> >> +                             g->words * sizeof(u32));
> >> +
> >> +             g->mem_base = job->gather_copy;
> >> +             g->offset = offset;
> >> +             g->mem_id = 0;
> >> +             g->ref = 0;
> >> +
> >> +             host1x_memmgr_munmap(g->ref, gather);
> >> +             offset += g->words * sizeof(u32);
> >> +     }
> >> +
> >> +     return 0;
> >> +}
> > 
> > I wonder, where's this DMA buffer actually used? I can't find any use
> > between this copy and the corresponding dma_free_writecombine() call.
> 
> We replace the gathers in host1x_job with the ones we allocate here, so
> they are used when pushing the gather's to hardware.
> 
> This is done so that user space cannot tamper with the gathers once
> they've been checked by firewall.

Oh, I had missed how g->mem_base is assigned job->gather_copy, so I had
thought the memory wasn't used anywhere. I wonder if it wouldn't be more
efficient to pre-allocate this buffer. We number of gathers is limited
by HOST1X_GATHER_QUEUE_SIZE, right? So we could allocate a buffer of the
appropriate size for each job to avoid continuously reallocating and
freeing everytime the job in pinned or unpinned.

Also jobs are allocated for each submit and allocating them is quite
expensive, so eventually we may want to pool them. Which will not be
trivial though, given that it requires the number of command buffers
and relocs to match. Some clever checks can probably make this work,
though.

> >> +     /* Null kickoff prevents submit from being sent to hardware */
> >> +     bool null_kickoff;
> > 
> > I don't think this is used anywhere.
> 
> True, we can remove this as we haven't posted the code for null kickoff.

Make sure to explain what this is used for when you post. The one
comment above is a bit vague.

> >> +     /* Check if register is marked as an address reg */
> >> +     int (*is_addr_reg)(struct platform_device *dev, u32 reg, u32 class);
> > 
> > is_addr_reg() sounds a bit unusual. Maybe match this to the name of the
> > main firewall routine, validate()?
> 
> The point of this op is to just tell if a register for a class is
> pointing to a buffer. validate then uses this information. But both
> answers (yes/no) and both types of registers are still valid, so
> validate() wouldn't be the proper name.
> 
> validation is then done by checking that there's a reloc corresponding
> to each register write to a register that can hold an address.

I just remembered that we discussed this already and I think we agreed
that a table lookup might be a better implementation. That'd get rid of
the naming issue altogether, since you can just name the table something
like address_registers, which is quite unambiguous.

> >> diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
> > [...]
> >> +struct mem_handle;
> >> +struct platform_device;
> >> +
> >> +struct host1x_job_unpin_data {
> >> +     struct mem_handle *h;
> >> +     struct sg_table *mem;
> >> +};
> >> +
> >> +enum mem_mgr_flag {
> >> +     mem_mgr_flag_uncacheable = 0,
> >> +     mem_mgr_flag_write_combine = 1,
> >> +};
> > 
> > I'd like to see this use a more object-oriented approach and more common
> > terminology. All of these handles are essentially buffer objects, so
> > maybe something like host1x_bo would be a nice and short name.
> > 
> > To make this more object-oriented, I propose something like:
> > 
> >         struct host1x_bo_ops {
> >                 int (*alloc)(struct host1x_bo *bo, size_t size, unsigned long align,
> >                              unsigned long flags);
> >                 int (*free)(struct host1x_bo *bo);
> >                 ...
> >         };
> > 
> >         struct host1x_bo {
> >                 const struct host1x_bo_ops *ops;
> >         };
> > 
> >         struct host1x_cma_bo {
> >                 struct host1x_bo base;
> >                 struct drm_gem_cma_object *obj;
> >         };
> > 
> >         static inline struct host1x_cma_bo *to_host1x_cma_bo(struct host1x_bo *bo)
> >         {
> >                 return container_of(bo, struct host1x_cma_bo, base);
> >         }
> > 
> >         static inline int host1x_bo_alloc(struct host1x_bo *bo, size_t size,
> >                                           unsigned long align, unsigned long flags)
> >         {
> >                 return bo->ops->alloc(bo, size, align, flags);
> >         }
> > 
> >         ...
> > 
> > That should be easy to extend with a new type of BO once the IOMMU-based
> > allocator is ready. And as I said it is much closer in terminology to
> > what other drivers do.
> 
> One complexity is that we're using the same type for communicating with
> user space. Each buffer carries with it a flag indicating its allocator.
> We might be able to model the internal structure to be more like what
> you propose, but for the API we still need the flag.

I disagree. I don't see any need for passing around the type at all.
We've discussed this a few times already, and correct me if I'm wrong,
but I think we agreed that we don't want to mix handle/buffer types.

We only support CMA for now, so all buffers will be allocated from CMA.
Once the IOMMU-based allocator is ready we'll want to switch to that for
Tegra30 and later, but stick to CMA for Tegra20 since the GART isn't
very usable.

So the way I see it, the decision about which allocator to use is done
once at driver probe time. So all that's really needed is a function
that allocates a buffer object and returns the proper one for the given
Tegra SoC. Once a host1x_bo object is returned it can be used throughout
and we get rid of the additional memmgr abstraction. I think it'll make
things much simpler.

Thierry
Terje Bergstrom March 8, 2013, 4:16 p.m. UTC | #4
On 26.02.2013 11:48, Terje Bergström wrote:
> On 25.02.2013 17:24, Thierry Reding wrote:
>> You use two different styles to indent the function parameters. You
>> might want to stick to one, preferably aligning them with the first
>> parameter on the first line.
> 
> I've generally favored "two tabs" indenting, but we'll anyway
> standardize on one.

We standardized on the convention used in tegradrm, i.e. aligning with
first parameter.

>> There's nothing in this function that requires a platform_device, so
>> passing struct device should be enough. Or maybe host1x_cdma should get
>> a struct device * field?
> 
> I think we'll just start using struct device * in general in code.
> Arto's been already fixing a lot of these, so he might've already fixed
> this.

We did a sweep in the code and now I hope everything that can, uses
struct device *. The side effect was getting rid of a lot of casting,
which is good.

>> Why don't you use any of the kernel's reference counting mechanisms?
>>
>>> +void host1x_channel_put(struct host1x_channel *ch)
>>> +{
>>> +     mutex_lock(&ch->reflock);
>>> +     if (ch->refcount == 1) {
>>> +             host1x_get_host(ch->dev)->cdma_op.stop(&ch->cdma);
>>> +             host1x_cdma_deinit(&ch->cdma);
>>> +     }
>>> +     ch->refcount--;
>>> +     mutex_unlock(&ch->reflock);
>>> +}
>>
>> I think you can do all of this using a kref.
> 
> I think the original reason was that there's no reason to use atomic
> kref, as we anyway have to do mutual exclusion via mutex. But, using
> kref won't be any problem, so we could use that.

Actually, we ended up with a problem with this. kref assumes that once
refcount goes to zero, it gets destroyed. In ch->refcount, going to zero
is just fine and just indicates that we need to initialize. And, we
anyway need to do locking, so we didn't do the conversion to kref.

>>> +struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
>>> +{
(...)
>>> +}
>>
>> I think the critical section could be shorter here. It's probably not
>> worth the extra trouble, though, given that channels are not often
>> allocated.
> 
> Yeah, boot time isn't measured in microseconds. :-) But, if we just make
> allocated_channels an atomic, we should be able to drop chlist_mutex
> altogether and it could simplify the code.

There wasn't much we could have moved outside the critical section, so
we didn't touch this area.

>> Also, is it really necessary to abstract these into an ops structure? I
>> get that newer hardware revisions might require different ops for sync-
>> point handling because the register layout or number of syncpoints may
>> be different, but the CDMA and push buffer (below) concepts are pretty
>> much a software abstraction, and as such its implementation is unlikely
>> to change with some future hardware revision.
> 
> Pushbuffer ops can become generic. There's only one catch - init uses
> the restart opcode. But the opcode is not going to change, so we can
> generalize that.

We ended up keeping the init as an operation, but rest of push buffer
ops became generic.

>>
>>> +/*
>>> + * Push two words to the push buffer
>>> + * Caller must ensure push buffer is not full
>>> + */
>>> +static void push_buffer_push_to(struct push_buffer *pb,
>>> +             struct mem_handle *handle,
>>> +             u32 op1, u32 op2)
>>> +{
>>> +     u32 cur = pb->cur;
>>> +     u32 *p = (u32 *)((u32)pb->mapped + cur);
>>
>> You do all this extra casting to make sure to increment by bytes and not
>> 32-bit words. How about you change pb->cur to contain the word index, so
>> that you don't have to go through hoops each time around.

When we changed DMASTART and DMAEND to actually denote the push buffer
area, we noticed that DMAGET and DMAPUT are actually relative to
DMASTART and DMAEND. This and the need to access both CPU and device
virtual addresses coupled with changing to word indexes didn't actually
simplify the code, so we kept still using byte indexes.

>>
>>> +/*
>>> + * Return the number of two word slots free in the push buffer
>>> + */
>>> +static u32 push_buffer_space(struct push_buffer *pb)
>>> +{
>>> +     return ((pb->fence - pb->cur) & (PUSH_BUFFER_SIZE - 1)) / 8;
>>> +}
>>
>> Why & (PUSH_BUFFER_SIZE - 1) here? fence - cur can never be larger than
>> PUSH_BUFFER_SIZE, can it?
> 
> You're right, this function doesn't need to worry about wrapping.

Arto noticed this, but actually I was wrong - the wrapping is very
possible. We just have to remember that if we're processing something at
the end of push buffer, cur might be in the end, and fence in the beginning.

>>> diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
>> [...]
>>> +struct mem_handle;
>>> +struct platform_device;
>>> +
>>> +struct host1x_job_unpin_data {
>>> +     struct mem_handle *h;
>>> +     struct sg_table *mem;
>>> +};
>>> +
>>> +enum mem_mgr_flag {
>>> +     mem_mgr_flag_uncacheable = 0,
>>> +     mem_mgr_flag_write_combine = 1,
>>> +};
>>
>> I'd like to see this use a more object-oriented approach and more common
>> terminology. All of these handles are essentially buffer objects, so
>> maybe something like host1x_bo would be a nice and short name.

We did this a bit differently, but following pretty much the same
principles. We have host1x_mem_handle, which contains an ops pointer.
The handle gets encapsulated inside drm_gem_cma_object.

_bo structs seem to usually contains a drm_gem_object, so we thought
it's better not to reuse that term.

Please check the code and let us know what you think. This pretty much
follows what Lucas proposed a while ago, and keeps neatly the DRM
specific parts inside the drm directory.

Other than these, we should have implemented all changes that we agreed
to include. If something's missing, it's because there were so many that
we just dropped the ball.

Terje
Thierry Reding March 8, 2013, 8:43 p.m. UTC | #5
On Fri, Mar 08, 2013 at 06:16:16PM +0200, Terje Bergström wrote:
> On 26.02.2013 11:48, Terje Bergström wrote:
> > On 25.02.2013 17:24, Thierry Reding wrote:
[...]
> >>> +struct mem_handle;
> >>> +struct platform_device;
> >>> +
> >>> +struct host1x_job_unpin_data {
> >>> +     struct mem_handle *h;
> >>> +     struct sg_table *mem;
> >>> +};
> >>> +
> >>> +enum mem_mgr_flag {
> >>> +     mem_mgr_flag_uncacheable = 0,
> >>> +     mem_mgr_flag_write_combine = 1,
> >>> +};
> >>
> >> I'd like to see this use a more object-oriented approach and more common
> >> terminology. All of these handles are essentially buffer objects, so
> >> maybe something like host1x_bo would be a nice and short name.
> 
> We did this a bit differently, but following pretty much the same
> principles. We have host1x_mem_handle, which contains an ops pointer.
> The handle gets encapsulated inside drm_gem_cma_object.
> 
> _bo structs seem to usually contains a drm_gem_object, so we thought
> it's better not to reuse that term.
> 
> Please check the code and let us know what you think. This pretty much
> follows what Lucas proposed a while ago, and keeps neatly the DRM
> specific parts inside the drm directory.

A bo is just a buffer object, so I don't see why the name shouldn't be
used. The name is in no way specific to DRM or GEM. But the point that I
was trying to make was that there is nothing to suggest that we couldn't
use drm_gem_object as the underlying scaffold to base all host1x buffer
objects on.

Furthermore I don't understand why you've chosen this approach. It is
completely different from what other drivers do and therefore makes it
more difficult to comprehend. That alone I could live with if there were
any advantages to that approach, but as far as I can tell there are
none.

Thierry
Terje Bergstrom March 11, 2013, 6:29 a.m. UTC | #6
On 08.03.2013 22:43, Thierry Reding wrote:
> A bo is just a buffer object, so I don't see why the name shouldn't
> be used. The name is in no way specific to DRM or GEM. But the point
> that I was trying to make was that there is nothing to suggest that
> we couldn't use drm_gem_object as the underlying scaffold to base all
> host1x buffer objects on.
> 
> Furthermore I don't understand why you've chosen this approach. It
> is completely different from what other drivers do and therefore
> makes it more difficult to comprehend. That alone I could live with
> if there were any advantages to that approach, but as far as I can
> tell there are none.

I was following the plan we agreed on earlier in email discussion with
you and Lucas:

On 29.11.2012 11:09, Lucas Stach wrote:
> We should aim for a clean split here. GEM handles are something which
> is really specific to how DRM works and as such should be constructed
> by tegradrm. nvhost should really just manage allocations/virtual
> address space and provide something that is able to back all the GEM
> handle operations.
> 
> nvhost has really no reason at all to even know about GEM handles.
> If you back a GEM object by a nvhost object you can just peel out
> the nvhost handles from the GEM wrapper in the tegradrm submit ioctl
> handler and queue the job to nvhost using it's native handles.
> 
> This way you would also be able to construct different handles (like
> GEM obj or V4L2 buffers) from the same backing nvhost object. Note
> that I'm not sure how useful this would be, but it seems like a
> reasonable design to me being able to do so.

With this structure, we are already prepared for non-DRM APIs. Tt's a
matter of familiarity of code versus future expansion. Code paths for
both are as simple/complex, so neither has a direct technical
superiority in performance.

I know other DRM drivers have opted to hard code GEM dependency
throughout the code. Then again, host1x hardware is managing much more
than graphics, so we need to think outside the DRM box, too.

Terje
Thierry Reding March 11, 2013, 7:18 a.m. UTC | #7
On Mon, Mar 11, 2013 at 08:29:59AM +0200, Terje Bergström wrote:
> On 08.03.2013 22:43, Thierry Reding wrote:
> > A bo is just a buffer object, so I don't see why the name shouldn't
> > be used. The name is in no way specific to DRM or GEM. But the point
> > that I was trying to make was that there is nothing to suggest that
> > we couldn't use drm_gem_object as the underlying scaffold to base all
> > host1x buffer objects on.
> > 
> > Furthermore I don't understand why you've chosen this approach. It
> > is completely different from what other drivers do and therefore
> > makes it more difficult to comprehend. That alone I could live with
> > if there were any advantages to that approach, but as far as I can
> > tell there are none.
> 
> I was following the plan we agreed on earlier in email discussion with
> you and Lucas:
> 
> On 29.11.2012 11:09, Lucas Stach wrote:
> > We should aim for a clean split here. GEM handles are something which
> > is really specific to how DRM works and as such should be constructed
> > by tegradrm. nvhost should really just manage allocations/virtual
> > address space and provide something that is able to back all the GEM
> > handle operations.
> > 
> > nvhost has really no reason at all to even know about GEM handles.
> > If you back a GEM object by a nvhost object you can just peel out
> > the nvhost handles from the GEM wrapper in the tegradrm submit ioctl
> > handler and queue the job to nvhost using it's native handles.
> > 
> > This way you would also be able to construct different handles (like
> > GEM obj or V4L2 buffers) from the same backing nvhost object. Note
> > that I'm not sure how useful this would be, but it seems like a
> > reasonable design to me being able to do so.
> 
> With this structure, we are already prepared for non-DRM APIs. Tt's a
> matter of familiarity of code versus future expansion. Code paths for
> both are as simple/complex, so neither has a direct technical
> superiority in performance.
> 
> I know other DRM drivers have opted to hard code GEM dependency
> throughout the code. Then again, host1x hardware is managing much more
> than graphics, so we need to think outside the DRM box, too.

This sound a bit over-engineered at this point in time. DRM is currently
the only user. Is anybody working on any non-DRM drivers that would use
this?

Even that aside, I don't think host1x_mem_handle is a good choice of
name here. The objects are much more than handles. They are in fact
buffer objects, which can optionally be attached to a handle. I also
think that using a void * to store the handle specific data isn't such a
good idea.

So how about the following proposal, which I think might satisfy both of
us:

	struct host1x_bo;

	struct host1x_bo_ops {
		struct host1x_bo *(*get)(struct host1x_bo *bo);
		void (*put)(struct host1x_bo *bo);
		dma_addr_t (*pin)(struct host1x_bo *bo, struct sg_table **sgt);
		...
	};

	struct host1x_bo *host1x_bo_get(struct host1x_bo *bo);
	void host1x_bo_put(struct host1x_bo *bo);
	dma_addr_t host1x_bo_pin(struct host1x_bo *bo, struct sg_table **sgt);
	...

	struct host1x_bo {
		const struct host1x_bo_ops *ops;
		...
	};

	struct tegra_drm_bo {
		struct host1x_bo base;
		...
	};

That way you can get rid of the host1x_memmgr_create_handle() helper and
instead embed host1x_bo into driver-/framework-specific structures with
the necessary initialization.

It also allows you to interact directly with the objects instead of
having to go through the memmgr API. The memory manager doesn't really
exist anymore so keeping the name in the API is only confusing. Your
current proposal deals with memory handles directly already so it's
really just making the naming more consistent.

Thierry
Terje Bergstrom March 11, 2013, 9:21 a.m. UTC | #8
On 11.03.2013 09:18, Thierry Reding wrote:
> This sound a bit over-engineered at this point in time. DRM is currently
> the only user. Is anybody working on any non-DRM drivers that would use
> this?

Well, this contains beginning of that:

http://nv-tegra.nvidia.com/gitweb/?p=linux-2.6.git;a=blob;f=drivers/media/video/tegra_v4l2_camera.c;h=644d0be5380367aca4c826c49724c03aad08387c;hb=l4t/l4t-r16-r2

I don't want to give these guys any excuse not to port it over to host1x
code base. :-)

> Even that aside, I don't think host1x_mem_handle is a good choice of
> name here. The objects are much more than handles. They are in fact
> buffer objects, which can optionally be attached to a handle. I also
> think that using a void * to store the handle specific data isn't such a
> good idea.

Naming if not an issue for me - we can easily agree on using _bo.

> So how about the following proposal, which I think might satisfy both of
> us:
> 
> 	struct host1x_bo;
> 
> 	struct host1x_bo_ops {
> 		struct host1x_bo *(*get)(struct host1x_bo *bo);
> 		void (*put)(struct host1x_bo *bo);
> 		dma_addr_t (*pin)(struct host1x_bo *bo, struct sg_table **sgt);
> 		...
> 	};
> 
> 	struct host1x_bo *host1x_bo_get(struct host1x_bo *bo);
> 	void host1x_bo_put(struct host1x_bo *bo);
> 	dma_addr_t host1x_bo_pin(struct host1x_bo *bo, struct sg_table **sgt);
> 	...
> 
> 	struct host1x_bo {
> 		const struct host1x_bo_ops *ops;
> 		...
> 	};
> 
> 	struct tegra_drm_bo {
> 		struct host1x_bo base;
> 		...
> 	};
> 
> That way you can get rid of the host1x_memmgr_create_handle() helper and
> instead embed host1x_bo into driver-/framework-specific structures with
> the necessary initialization.

This would make sense. We'll get back when we have enough of
implementation done to understand it all. One consequence is that we
cannot use drm_gem_cma_create() anymore. We'll have to introduce a
function that does the same as drm_gem_cma_create(), but it takes a
pre-allocated drm_gem_cma_object pointer. That way we can allocate the
struct, and use DRM CMA just to initialize the drm_gem_cma_object.

Other way would be just taking a copy of DRM CMA helper, but I'd like to
defer that to the next step when we implement IOMMU aware allocator.

> It also allows you to interact directly with the objects instead of
> having to go through the memmgr API. The memory manager doesn't really
> exist anymore so keeping the name in the API is only confusing. Your
> current proposal deals with memory handles directly already so it's
> really just making the naming more consistent.

The memmgr APIs are currently just a shortcut wrapper to the ops, so in
that sense the memmgr does not really exist. I think it might still make
sense to keep static inline wrappers for calling the ops within, but we
could rename them to host1x_bo_somethingandother. Then it'd follow the
pattern we are using for the hw ops in the latest set.

Terje
Thierry Reding March 11, 2013, 9:41 a.m. UTC | #9
On Mon, Mar 11, 2013 at 11:21:05AM +0200, Terje Bergström wrote:
> On 11.03.2013 09:18, Thierry Reding wrote:
> > This sound a bit over-engineered at this point in time. DRM is currently
> > the only user. Is anybody working on any non-DRM drivers that would use
> > this?
> 
> Well, this contains beginning of that:
> 
> http://nv-tegra.nvidia.com/gitweb/?p=linux-2.6.git;a=blob;f=drivers/media/video/tegra_v4l2_camera.c;h=644d0be5380367aca4c826c49724c03aad08387c;hb=l4t/l4t-r16-r2
> 
> I don't want to give these guys any excuse not to port it over to host1x
> code base. :-)

I was aware of that driver but I didn't realize it had been available
publicly. It's great to see this, though, and one more argument in
favour of not binding the host1x_bo too tightly to DRM/GEM.

> > So how about the following proposal, which I think might satisfy both of
> > us:
> > 
> > 	struct host1x_bo;
> > 
> > 	struct host1x_bo_ops {
> > 		struct host1x_bo *(*get)(struct host1x_bo *bo);
> > 		void (*put)(struct host1x_bo *bo);
> > 		dma_addr_t (*pin)(struct host1x_bo *bo, struct sg_table **sgt);
> > 		...
> > 	};
> > 
> > 	struct host1x_bo *host1x_bo_get(struct host1x_bo *bo);
> > 	void host1x_bo_put(struct host1x_bo *bo);
> > 	dma_addr_t host1x_bo_pin(struct host1x_bo *bo, struct sg_table **sgt);
> > 	...
> > 
> > 	struct host1x_bo {
> > 		const struct host1x_bo_ops *ops;
> > 		...
> > 	};
> > 
> > 	struct tegra_drm_bo {
> > 		struct host1x_bo base;
> > 		...
> > 	};
> > 
> > That way you can get rid of the host1x_memmgr_create_handle() helper and
> > instead embed host1x_bo into driver-/framework-specific structures with
> > the necessary initialization.
> 
> This would make sense. We'll get back when we have enough of
> implementation done to understand it all. One consequence is that we
> cannot use drm_gem_cma_create() anymore. We'll have to introduce a
> function that does the same as drm_gem_cma_create(), but it takes a
> pre-allocated drm_gem_cma_object pointer. That way we can allocate the
> struct, and use DRM CMA just to initialize the drm_gem_cma_object.

I certainly think that introducing a drm_gem_cma_object_init() function
shouldn't pose a problem. If you do, make sure to update the existing
drm_gem_cma_create() to use it. Having both lets users have the choice
to use drm_gem_cma_create() if they don't need to embed it, or
drm_gem_cma_object_init() otherwise.

> Other way would be just taking a copy of DRM CMA helper, but I'd like to
> defer that to the next step when we implement IOMMU aware allocator.

I'm not sure I understand what you're saying, but if you add a function
as discussed above this shouldn't be necessary.

> > It also allows you to interact directly with the objects instead of
> > having to go through the memmgr API. The memory manager doesn't really
> > exist anymore so keeping the name in the API is only confusing. Your
> > current proposal deals with memory handles directly already so it's
> > really just making the naming more consistent.
> 
> The memmgr APIs are currently just a shortcut wrapper to the ops, so in
> that sense the memmgr does not really exist. I think it might still make
> sense to keep static inline wrappers for calling the ops within, but we
> could rename them to host1x_bo_somethingandother. Then it'd follow the
> pattern we are using for the hw ops in the latest set.

Yes, that's exactly what I had in mind in the above proposal. They could
be inline, but it's probably also okay if they're not. They aren't meant
to be used very frequently so the extra function call shouldn't matter
much. It might be easier to do add some additional checks if they aren't
inlined. I'm fine either way.

Thierry
diff mbox

Patch

diff --git a/drivers/gpu/host1x/Kconfig b/drivers/gpu/host1x/Kconfig
index e89fb2b..57680a6 100644
--- a/drivers/gpu/host1x/Kconfig
+++ b/drivers/gpu/host1x/Kconfig
@@ -3,4 +3,27 @@  config TEGRA_HOST1X
 	help
 	  Driver for the Tegra host1x hardware.
 
-	  Required for enabling tegradrm.
+	  Required for enabling tegradrm and 2D acceleration.
+
+if TEGRA_HOST1X
+
+config TEGRA_HOST1X_CMA
+	bool "Support DRM CMA buffers"
+	depends on DRM
+	default y
+	select DRM_GEM_CMA_HELPER
+	select DRM_KMS_CMA_HELPER
+	help
+	  Say yes if you wish to use DRM CMA buffers.
+
+	  If unsure, choose Y.
+
+config TEGRA_HOST1X_FIREWALL
+	bool "Enable HOST1X security firewall"
+	default y
+	help
+	  Say yes if kernel should protect command streams from tampering.
+
+	  If unsure, choose Y.
+
+endif
diff --git a/drivers/gpu/host1x/Makefile b/drivers/gpu/host1x/Makefile
index 5ef47ff..cdd87c8 100644
--- a/drivers/gpu/host1x/Makefile
+++ b/drivers/gpu/host1x/Makefile
@@ -4,6 +4,11 @@  host1x-y = \
 	syncpt.o \
 	dev.o \
 	intr.o \
+	cdma.o \
+	channel.o \
+	job.o \
+	memmgr.o \
 	hw/host1x01.o
 
+host1x-$(CONFIG_TEGRA_HOST1X_CMA) += cma.o
 obj-$(CONFIG_TEGRA_HOST1X) += host1x.o
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
new file mode 100644
index 0000000..d6a38d2
--- /dev/null
+++ b/drivers/gpu/host1x/cdma.c
@@ -0,0 +1,439 @@ 
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cdma.h"
+#include "channel.h"
+#include "dev.h"
+#include "memmgr.h"
+#include "job.h"
+#include <asm/cacheflush.h>
+
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/interrupt.h>
+#include <trace/events/host1x.h>
+
+#define TRACE_MAX_LENGTH 128U
+
+/*
+ * Add an entry to the sync queue.
+ */
+static void add_to_sync_queue(struct host1x_cdma *cdma,
+			      struct host1x_job *job,
+			      u32 nr_slots,
+			      u32 first_get)
+{
+	if (job->syncpt_id == NVSYNCPT_INVALID) {
+		dev_warn(&job->ch->dev->dev, "%s: Invalid syncpt\n",
+				__func__);
+		return;
+	}
+
+	job->first_get = first_get;
+	job->num_slots = nr_slots;
+	host1x_job_get(job);
+	list_add_tail(&job->list, &cdma->sync_queue);
+}
+
+/*
+ * Return the status of the cdma's sync queue or push buffer for the given event
+ *  - sq empty: returns 1 for empty, 0 for not empty (as in "1 empty queue" :-)
+ *  - pb space: returns the number of free slots in the channel's push buffer
+ * Must be called with the cdma lock held.
+ */
+static unsigned int cdma_status_locked(struct host1x_cdma *cdma,
+		enum cdma_event event)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	switch (event) {
+	case CDMA_EVENT_SYNC_QUEUE_EMPTY:
+		return list_empty(&cdma->sync_queue) ? 1 : 0;
+	case CDMA_EVENT_PUSH_BUFFER_SPACE: {
+		struct push_buffer *pb = &cdma->push_buffer;
+		return host1x->cdma_pb_op.space(pb);
+	}
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Sleep (if necessary) until the requested event happens
+ *   - CDMA_EVENT_SYNC_QUEUE_EMPTY : sync queue is completely empty.
+ *     - Returns 1
+ *   - CDMA_EVENT_PUSH_BUFFER_SPACE : there is space in the push buffer
+ *     - Return the amount of space (> 0)
+ * Must be called with the cdma lock held.
+ */
+unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
+		enum cdma_event event)
+{
+	for (;;) {
+		unsigned int space = cdma_status_locked(cdma, event);
+		if (space)
+			return space;
+
+		trace_host1x_wait_cdma(cdma_to_channel(cdma)->dev->name,
+				event);
+
+		/* If somebody has managed to already start waiting, yield */
+		if (cdma->event != CDMA_EVENT_NONE) {
+			mutex_unlock(&cdma->lock);
+			schedule();
+			mutex_lock(&cdma->lock);
+			continue;
+		}
+		cdma->event = event;
+
+		mutex_unlock(&cdma->lock);
+		down(&cdma->sem);
+		mutex_lock(&cdma->lock);
+	}
+	return 0;
+}
+
+/*
+ * Start timer for a buffer submition that has completed yet.
+ * Must be called with the cdma lock held.
+ */
+static void cdma_start_timer_locked(struct host1x_cdma *cdma,
+		struct host1x_job *job)
+{
+	struct host1x *host = cdma_to_host1x(cdma);
+
+	if (cdma->timeout.clientid) {
+		/* timer already started */
+		return;
+	}
+
+	cdma->timeout.clientid = job->clientid;
+	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
+	cdma->timeout.syncpt_val = job->syncpt_end;
+	cdma->timeout.start_ktime = ktime_get();
+
+	schedule_delayed_work(&cdma->timeout.wq,
+			msecs_to_jiffies(job->timeout));
+}
+
+/*
+ * Stop timer when a buffer submition completes.
+ * Must be called with the cdma lock held.
+ */
+static void stop_cdma_timer_locked(struct host1x_cdma *cdma)
+{
+	cancel_delayed_work(&cdma->timeout.wq);
+	cdma->timeout.clientid = 0;
+}
+
+/*
+ * For all sync queue entries that have already finished according to the
+ * current sync point registers:
+ *  - unpin & unref their mems
+ *  - pop their push buffer slots
+ *  - remove them from the sync queue
+ * This is normally called from the host code's worker thread, but can be
+ * called manually if necessary.
+ * Must be called with the cdma lock held.
+ */
+static void update_cdma_locked(struct host1x_cdma *cdma)
+{
+	bool signal = false;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_job *job, *n;
+
+	/* If CDMA is stopped, queue is cleared and we can return */
+	if (!cdma->running)
+		return;
+
+	/*
+	 * Walk the sync queue, reading the sync point registers as necessary,
+	 * to consume as many sync queue entries as possible without blocking
+	 */
+	list_for_each_entry_safe(job, n, &cdma->sync_queue, list) {
+		struct host1x_syncpt *sp = host1x->syncpt + job->syncpt_id;
+
+		/* Check whether this syncpt has completed, and bail if not */
+		if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
+			/* Start timer on next pending syncpt */
+			if (job->timeout)
+				cdma_start_timer_locked(cdma, job);
+			break;
+		}
+
+		/* Cancel timeout, when a buffer completes */
+		if (cdma->timeout.clientid)
+			stop_cdma_timer_locked(cdma);
+
+		/* Unpin the memory */
+		host1x_job_unpin(job);
+
+		/* Pop push buffer slots */
+		if (job->num_slots) {
+			struct push_buffer *pb = &cdma->push_buffer;
+			host1x->cdma_pb_op.pop_from(pb, job->num_slots);
+			if (cdma->event == CDMA_EVENT_PUSH_BUFFER_SPACE)
+				signal = true;
+		}
+
+		list_del(&job->list);
+		host1x_job_put(job);
+	}
+
+	if (list_empty(&cdma->sync_queue) &&
+				cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
+			signal = true;
+
+	/* Wake up CdmaWait() if the requested event happened */
+	if (signal) {
+		cdma->event = CDMA_EVENT_NONE;
+		up(&cdma->sem);
+	}
+}
+
+void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
+		struct platform_device *dev)
+{
+	u32 get_restart;
+	u32 syncpt_incrs;
+	struct host1x_job *job = NULL;
+	u32 syncpt_val;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	syncpt_val = host1x_syncpt_load_min(cdma->timeout.syncpt);
+
+	dev_dbg(&dev->dev,
+		"%s: starting cleanup (thresh %d)\n",
+		__func__, syncpt_val);
+
+	/*
+	 * Move the sync_queue read pointer to the first entry that hasn't
+	 * completed based on the current HW syncpt value. It's likely there
+	 * won't be any (i.e. we're still at the head), but covers the case
+	 * where a syncpt incr happens just prior/during the teardown.
+	 */
+
+	dev_dbg(&dev->dev,
+		"%s: skip completed buffers still in sync_queue\n",
+		__func__);
+
+	list_for_each_entry(job, &cdma->sync_queue, list) {
+		if (syncpt_val < job->syncpt_end)
+			break;
+
+		host1x_job_dump(&dev->dev, job);
+	}
+
+	/*
+	 * Walk the sync_queue, first incrementing with the CPU syncpts that
+	 * are partially executed (the first buffer) or fully skipped while
+	 * still in the current context (slots are also NOP-ed).
+	 *
+	 * At the point contexts are interleaved, syncpt increments must be
+	 * done inline with the pushbuffer from a GATHER buffer to maintain
+	 * the order (slots are modified to be a GATHER of syncpt incrs).
+	 *
+	 * Note: save in get_restart the location where the timed out buffer
+	 * started in the PB, so we can start the refetch from there (with the
+	 * modified NOP-ed PB slots). This lets things appear to have completed
+	 * properly for this buffer and resources are freed.
+	 */
+
+	dev_dbg(&dev->dev,
+		"%s: perform CPU incr on pending same ctx buffers\n",
+		__func__);
+
+	get_restart = cdma->last_put;
+	if (!list_empty(&cdma->sync_queue))
+		get_restart = job->first_get;
+
+	/* do CPU increments as long as this context continues */
+	list_for_each_entry_from(job, &cdma->sync_queue, list) {
+		/* different context, gets us out of this loop */
+		if (job->clientid != cdma->timeout.clientid)
+			break;
+
+		/* won't need a timeout when replayed */
+		job->timeout = 0;
+
+		syncpt_incrs = job->syncpt_end - syncpt_val;
+		dev_dbg(&dev->dev,
+			"%s: CPU incr (%d)\n", __func__, syncpt_incrs);
+
+		host1x_job_dump(&dev->dev, job);
+
+		/* safe to use CPU to incr syncpts */
+		host1x->cdma_op.timeout_cpu_incr(cdma,
+				job->first_get,
+				syncpt_incrs,
+				job->syncpt_end,
+				job->num_slots);
+
+		syncpt_val += syncpt_incrs;
+	}
+
+	list_for_each_entry_from(job, &cdma->sync_queue, list)
+		if (job->clientid == cdma->timeout.clientid)
+			job->timeout = 500;
+
+	dev_dbg(&dev->dev,
+		"%s: finished sync_queue modification\n", __func__);
+
+	/* roll back DMAGET and start up channel again */
+	host1x->cdma_op.timeout_teardown_end(cdma, get_restart);
+}
+
+/*
+ * Create a cdma
+ */
+int host1x_cdma_init(struct host1x_cdma *cdma)
+{
+	int err;
+	struct push_buffer *pb = &cdma->push_buffer;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	mutex_init(&cdma->lock);
+	sema_init(&cdma->sem, 0);
+
+	INIT_LIST_HEAD(&cdma->sync_queue);
+
+	cdma->event = CDMA_EVENT_NONE;
+	cdma->running = false;
+	cdma->torndown = false;
+
+	err = host1x->cdma_pb_op.init(pb);
+	if (err)
+		return err;
+	return 0;
+}
+
+/*
+ * Destroy a cdma
+ */
+void host1x_cdma_deinit(struct host1x_cdma *cdma)
+{
+	struct push_buffer *pb = &cdma->push_buffer;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (cdma->running) {
+		pr_warn("%s: CDMA still running\n",
+				__func__);
+	} else {
+		host1x->cdma_pb_op.destroy(pb);
+		host1x->cdma_op.timeout_destroy(cdma);
+	}
+}
+
+/*
+ * Begin a cdma submit
+ */
+int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	mutex_lock(&cdma->lock);
+
+	if (job->timeout) {
+		/* init state on first submit with timeout value */
+		if (!cdma->timeout.initialized) {
+			int err;
+			err = host1x->cdma_op.timeout_init(cdma,
+					job->syncpt_id);
+			if (err) {
+				mutex_unlock(&cdma->lock);
+				return err;
+			}
+		}
+	}
+	if (!cdma->running)
+		host1x->cdma_op.start(cdma);
+
+	cdma->slots_free = 0;
+	cdma->slots_used = 0;
+	cdma->first_get = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	trace_host1x_cdma_begin(job->ch->dev->name);
+	return 0;
+}
+
+/*
+ * Push two words into a push buffer slot
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2)
+{
+	host1x_cdma_push_gather(cdma, NULL, 0, op1, op2);
+}
+
+/*
+ * Push two words into a push buffer slot
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push_gather(struct host1x_cdma *cdma,
+		struct mem_handle *handle,
+		u32 offset, u32 op1, u32 op2)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	u32 slots_free = cdma->slots_free;
+	struct push_buffer *pb = &cdma->push_buffer;
+
+	if (slots_free == 0) {
+		host1x->cdma_op.kick(cdma);
+		slots_free = host1x_cdma_wait_locked(cdma,
+				CDMA_EVENT_PUSH_BUFFER_SPACE);
+	}
+	cdma->slots_free = slots_free - 1;
+	cdma->slots_used++;
+	host1x->cdma_pb_op.push_to(pb, handle, op1, op2);
+}
+
+/*
+ * End a cdma submit
+ * Kick off DMA, add job to the sync queue, and a number of slots to be freed
+ * from the pushbuffer. The handles for a submit must all be pinned at the same
+ * time, but they can be unpinned in smaller chunks.
+ */
+void host1x_cdma_end(struct host1x_cdma *cdma,
+		struct host1x_job *job)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	bool was_idle = list_empty(&cdma->sync_queue);
+
+	host1x->cdma_op.kick(cdma);
+
+	add_to_sync_queue(cdma,
+			job,
+			cdma->slots_used,
+			cdma->first_get);
+
+	/* start timer on idle -> active transitions */
+	if (job->timeout && was_idle)
+		cdma_start_timer_locked(cdma, job);
+
+	trace_host1x_cdma_end(job->ch->dev->name);
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Update cdma state according to current sync point values
+ */
+void host1x_cdma_update(struct host1x_cdma *cdma)
+{
+	mutex_lock(&cdma->lock);
+	update_cdma_locked(cdma);
+	mutex_unlock(&cdma->lock);
+}
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
new file mode 100644
index 0000000..d9cabef
--- /dev/null
+++ b/drivers/gpu/host1x/cdma.h
@@ -0,0 +1,107 @@ 
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CDMA_H
+#define __HOST1X_CDMA_H
+
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+
+#include <linux/list.h>
+
+struct host1x_syncpt;
+struct host1x_userctx_timeout;
+struct host1x_job;
+struct mem_handle;
+struct platform_device;
+
+/*
+ * cdma
+ *
+ * This is in charge of a host command DMA channel.
+ * Sends ops to a push buffer, and takes responsibility for unpinning
+ * (& possibly freeing) of memory after those ops have completed.
+ * Producer:
+ *	begin
+ *		push - send ops to the push buffer
+ *	end - start command DMA and enqueue handles to be unpinned
+ * Consumer:
+ *	update - call to update sync queue and push buffer, unpin memory
+ */
+
+struct push_buffer {
+	u32 *mapped;			/* mapped pushbuffer memory */
+	dma_addr_t phys;		/* physical address of pushbuffer */
+	u32 fence;			/* index we've written */
+	u32 cur;			/* index to write to */
+	struct mem_handle **handle;	/* handle for each opcode pair */
+};
+
+struct buffer_timeout {
+	struct delayed_work wq;		/* work queue */
+	bool initialized;		/* timer one-time setup flag */
+	struct host1x_syncpt *syncpt;	/* buffer completion syncpt */
+	u32 syncpt_val;			/* syncpt value when completed */
+	ktime_t start_ktime;		/* starting time */
+	/* context timeout information */
+	int clientid;
+};
+
+enum cdma_event {
+	CDMA_EVENT_NONE,		/* not waiting for any event */
+	CDMA_EVENT_SYNC_QUEUE_EMPTY,	/* wait for empty sync queue */
+	CDMA_EVENT_PUSH_BUFFER_SPACE	/* wait for space in push buffer */
+};
+
+struct host1x_cdma {
+	struct mutex lock;		/* controls access to shared state */
+	struct semaphore sem;		/* signalled when event occurs */
+	enum cdma_event event;		/* event that sem is waiting for */
+	unsigned int slots_used;	/* pb slots used in current submit */
+	unsigned int slots_free;	/* pb slots free in current submit */
+	unsigned int first_get;		/* DMAGET value, where submit begins */
+	unsigned int last_put;		/* last value written to DMAPUT */
+	struct push_buffer push_buffer;	/* channel's push buffer */
+	struct list_head sync_queue;	/* job queue */
+	struct buffer_timeout timeout;	/* channel's timeout state/wq */
+	bool running;
+	bool torndown;
+};
+
+#define cdma_to_channel(cdma) container_of(cdma, struct host1x_channel, cdma)
+#define cdma_to_host1x(cdma) host1x_get_host(cdma_to_channel(cdma)->dev)
+#define cdma_to_memmgr(cdma) ((cdma_to_host1x(cdma))->memmgr)
+#define pb_to_cdma(pb) container_of(pb, struct host1x_cdma, push_buffer)
+
+int	host1x_cdma_init(struct host1x_cdma *cdma);
+void	host1x_cdma_deinit(struct host1x_cdma *cdma);
+void	host1x_cdma_stop(struct host1x_cdma *cdma);
+int	host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job);
+void	host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2);
+void	host1x_cdma_push_gather(struct host1x_cdma *cdma,
+		struct mem_handle *handle, u32 offset, u32 op1, u32 op2);
+void	host1x_cdma_end(struct host1x_cdma *cdma,
+		struct host1x_job *job);
+void	host1x_cdma_update(struct host1x_cdma *cdma);
+void	host1x_cdma_peek(struct host1x_cdma *cdma,
+		u32 dmaget, int slot, u32 *out);
+unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
+		enum cdma_event event);
+void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
+		struct platform_device *dev);
+#endif
diff --git a/drivers/gpu/host1x/channel.c b/drivers/gpu/host1x/channel.c
new file mode 100644
index 0000000..ff647ac
--- /dev/null
+++ b/drivers/gpu/host1x/channel.c
@@ -0,0 +1,140 @@ 
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "channel.h"
+#include "dev.h"
+#include "job.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+/* Constructor for the host1x device list */
+void host1x_channel_list_init(struct host1x *host1x)
+{
+	INIT_LIST_HEAD(&host1x->chlist.list);
+	mutex_init(&host1x->chlist_mutex);
+}
+
+/*
+ * Iterator function for host1x device list
+ * It takes a fptr as an argument and calls that function for each
+ * device in the list
+ */
+void host1x_channel_for_all(struct host1x *host1x, void *data,
+	int (*fptr)(struct host1x_channel *ch, void *fdata))
+{
+	struct host1x_channel *ch;
+	int ret;
+
+	list_for_each_entry(ch, &host1x->chlist.list, list) {
+		if (ch && fptr) {
+			ret = fptr(ch, data);
+			if (ret) {
+				pr_info("%s: iterator error\n", __func__);
+				break;
+			}
+		}
+	}
+}
+
+
+int host1x_channel_submit(struct host1x_job *job)
+{
+	return host1x_get_host(job->ch->dev)->channel_op.submit(job);
+}
+
+struct host1x_channel *host1x_channel_get(struct host1x_channel *ch)
+{
+	int err = 0;
+
+	mutex_lock(&ch->reflock);
+	if (ch->refcount == 0)
+		err = host1x_cdma_init(&ch->cdma);
+	if (!err)
+		ch->refcount++;
+
+	mutex_unlock(&ch->reflock);
+
+	return err ? NULL : ch;
+}
+
+void host1x_channel_put(struct host1x_channel *ch)
+{
+	mutex_lock(&ch->reflock);
+	if (ch->refcount == 1) {
+		host1x_get_host(ch->dev)->cdma_op.stop(&ch->cdma);
+		host1x_cdma_deinit(&ch->cdma);
+	}
+	ch->refcount--;
+	mutex_unlock(&ch->reflock);
+}
+
+struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
+{
+	struct host1x_channel *ch = NULL;
+	struct host1x *host1x = host1x_get_host(pdev);
+	int chindex;
+	int max_channels = host1x->info.nb_channels;
+	int err;
+
+	mutex_lock(&host1x->chlist_mutex);
+
+	chindex = host1x->allocated_channels;
+	if (chindex > max_channels)
+		goto fail;
+
+	ch = kzalloc(sizeof(*ch), GFP_KERNEL);
+	if (ch == NULL)
+		goto fail;
+
+	/* Link platform_device to host1x_channel */
+	err = host1x->channel_op.init(ch, host1x, chindex);
+	if (err < 0)
+		goto fail;
+
+	ch->dev = pdev;
+
+	/* Add to channel list */
+	list_add_tail(&ch->list, &host1x->chlist.list);
+
+	host1x->allocated_channels++;
+
+	mutex_unlock(&host1x->chlist_mutex);
+	return ch;
+
+fail:
+	dev_err(&pdev->dev, "failed to init channel\n");
+	kfree(ch);
+	mutex_unlock(&host1x->chlist_mutex);
+	return NULL;
+}
+
+void host1x_channel_free(struct host1x_channel *ch)
+{
+	struct host1x *host1x = host1x_get_host(ch->dev);
+	struct host1x_channel *chiter, *tmp;
+	list_for_each_entry_safe(chiter, tmp, &host1x->chlist.list, list) {
+		if (chiter == ch) {
+			list_del(&chiter->list);
+			kfree(ch);
+			host1x->allocated_channels--;
+
+			return;
+		}
+	}
+}
diff --git a/drivers/gpu/host1x/channel.h b/drivers/gpu/host1x/channel.h
new file mode 100644
index 0000000..41eb01e
--- /dev/null
+++ b/drivers/gpu/host1x/channel.h
@@ -0,0 +1,58 @@ 
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CHANNEL_H
+#define __HOST1X_CHANNEL_H
+
+#include <linux/cdev.h>
+#include <linux/io.h>
+#include "cdma.h"
+
+struct host1x;
+struct platform_device;
+
+/*
+ * host1x device list in debug-fs dump of host1x and client device
+ * as well as channel state
+ */
+struct host1x_channel {
+	struct list_head list;
+
+	int refcount;
+	int chid;
+	struct mutex reflock;
+	struct mutex submitlock;
+	void __iomem *regs;
+	struct device *node;
+	struct platform_device *dev;
+	struct cdev cdev;
+	struct host1x_cdma cdma;
+};
+
+/* channel list operations */
+void host1x_channel_list_init(struct host1x *);
+void host1x_channel_for_all(struct host1x *, void *data,
+	int (*fptr)(struct host1x_channel *ch, void *fdata));
+
+struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev);
+void host1x_channel_free(struct host1x_channel *ch);
+struct host1x_channel *host1x_channel_get(struct host1x_channel *ch);
+void host1x_channel_put(struct host1x_channel *ch);
+int host1x_channel_submit(struct host1x_job *job);
+
+#endif
diff --git a/drivers/gpu/host1x/cma.c b/drivers/gpu/host1x/cma.c
new file mode 100644
index 0000000..06b7959
--- /dev/null
+++ b/drivers/gpu/host1x/cma.c
@@ -0,0 +1,116 @@ 
+/*
+ * Tegra host1x CMA support
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <drm/drmP.h>
+#include <drm/drm.h>
+#include <drm/drm_gem_cma_helper.h>
+#include <linux/mutex.h>
+
+#include "cma.h"
+#include "memmgr.h"
+
+static inline struct drm_gem_cma_object *to_cma_obj(struct mem_handle *h)
+{
+	return (struct drm_gem_cma_object *)(((u32)h) & MEMMGR_ID_MASK);
+}
+
+struct mem_handle *host1x_cma_alloc(size_t size, size_t align, int flags)
+{
+	return NULL;
+}
+
+void host1x_cma_put(struct mem_handle *handle)
+{
+	struct drm_gem_cma_object *obj = to_cma_obj(handle);
+	struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
+
+	mutex_lock(struct_mutex);
+	drm_gem_object_unreference(&obj->base);
+	mutex_unlock(struct_mutex);
+}
+
+struct sg_table *host1x_cma_pin(struct mem_handle *handle)
+{
+	return NULL;
+}
+
+void host1x_cma_unpin(struct mem_handle *handle, struct sg_table *sgt)
+{
+
+}
+
+
+void *host1x_cma_mmap(struct mem_handle *handle)
+{
+	return (to_cma_obj(handle))->vaddr;
+}
+
+void host1x_cma_munmap(struct mem_handle *handle, void *addr)
+{
+
+}
+
+void *host1x_cma_kmap(struct mem_handle *handle, unsigned int pagenum)
+{
+	return (to_cma_obj(handle))->vaddr + pagenum * PAGE_SIZE;
+}
+
+void host1x_cma_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr)
+{
+
+}
+
+struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev)
+{
+	struct drm_gem_cma_object *obj = to_cma_obj((void *)id);
+	struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
+
+	mutex_lock(struct_mutex);
+	drm_gem_object_reference(&obj->base);
+	mutex_unlock(struct_mutex);
+
+	return (struct mem_handle *) ((u32)id | mem_mgr_type_cma);
+}
+
+int host1x_cma_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		long unsigned id_type_mask,
+		long unsigned id_type,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data,
+		dma_addr_t *phys_addr)
+{
+	int i;
+	int pin_count = 0;
+
+	for (i = 0; i < count; i++) {
+		struct mem_handle *handle;
+
+		if ((ids[i] & id_type_mask) != id_type)
+			continue;
+
+		handle = host1x_cma_get(ids[i], dev);
+
+		phys_addr[i] = (to_cma_obj(handle)->paddr);
+		unpin_data[pin_count].h = handle;
+
+		pin_count++;
+	}
+	return pin_count;
+}
diff --git a/drivers/gpu/host1x/cma.h b/drivers/gpu/host1x/cma.h
new file mode 100644
index 0000000..82ad710
--- /dev/null
+++ b/drivers/gpu/host1x/cma.h
@@ -0,0 +1,43 @@ 
+/*
+ * Tegra host1x cma memory manager
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CMA_H
+#define __HOST1X_CMA_H
+
+#include "memmgr.h"
+
+struct platform_device;
+
+struct mem_handle *host1x_cma_alloc(size_t size, size_t align, int flags);
+void host1x_cma_put(struct mem_handle *handle);
+struct sg_table *host1x_cma_pin(struct mem_handle *handle);
+void host1x_cma_unpin(struct mem_handle *handle, struct sg_table *sgt);
+void *host1x_cma_mmap(struct mem_handle *handle);
+void host1x_cma_munmap(struct mem_handle *handle, void *addr);
+void *host1x_cma_kmap(struct mem_handle *handle, unsigned int pagenum);
+void host1x_cma_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr);
+struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev);
+int host1x_cma_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		long unsigned id_type_mask,
+		long unsigned id_type,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data,
+		dma_addr_t *phys_addr);
+#endif
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 7f9f389..80311ca 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -25,6 +25,7 @@ 
 #include <linux/io.h>
 #include "dev.h"
 #include "intr.h"
+#include "channel.h"
 #include "hw/host1x01.h"
 
 #define CREATE_TRACE_POINTS
@@ -46,6 +47,16 @@  u32 host1x_sync_readl(struct host1x *host1x, u32 r)
 	return readl(sync_regs + r);
 }
 
+void host1x_ch_writel(struct host1x_channel *ch, u32 v, u32 r)
+{
+	writel(v, ch->regs + r);
+}
+
+u32 host1x_ch_readl(struct host1x_channel *ch, u32 r)
+{
+	return readl(ch->regs + r);
+}
+
 static struct host1x_device_info host1x_info = {
 	.nb_channels	= 8,
 	.nb_pts		= 32,
@@ -135,6 +146,8 @@  static int host1x_probe(struct platform_device *dev)
 
 	host1x_syncpt_reset(host);
 
+	host1x_channel_list_init(host);
+
 	host1x_intr_start(&host->intr, clk_get_rate(host->clk));
 
 	dev_info(&dev->dev, "initialized\n");
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 8376092..2fefa78 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -18,11 +18,58 @@ 
 #define HOST1X_DEV_H
 
 #include <linux/platform_device.h>
+
+#include "channel.h"
 #include "syncpt.h"
 #include "intr.h"
 
 struct host1x;
+struct host1x_intr;
 struct host1x_syncpt;
+struct host1x_channel;
+struct host1x_cdma;
+struct host1x_job;
+struct push_buffer;
+struct dentry;
+struct mem_handle;
+struct platform_device;
+
+struct host1x_channel_ops {
+	int (*init)(struct host1x_channel *,
+		    struct host1x *,
+		    int chid);
+	int (*submit)(struct host1x_job *job);
+};
+
+struct host1x_cdma_ops {
+	void (*start)(struct host1x_cdma *);
+	void (*stop)(struct host1x_cdma *);
+	void (*kick)(struct  host1x_cdma *);
+	int (*timeout_init)(struct host1x_cdma *,
+			    u32 syncpt_id);
+	void (*timeout_destroy)(struct host1x_cdma *);
+	void (*timeout_teardown_begin)(struct host1x_cdma *);
+	void (*timeout_teardown_end)(struct host1x_cdma *,
+				     u32 getptr);
+	void (*timeout_cpu_incr)(struct host1x_cdma *,
+				 u32 getptr,
+				 u32 syncpt_incrs,
+				 u32 syncval,
+				 u32 nr_slots);
+};
+
+struct host1x_pushbuffer_ops {
+	void (*reset)(struct push_buffer *);
+	int (*init)(struct push_buffer *);
+	void (*destroy)(struct push_buffer *);
+	void (*push_to)(struct push_buffer *,
+			struct mem_handle *,
+			u32 op1, u32 op2);
+	void (*pop_from)(struct push_buffer *,
+			 unsigned int slots);
+	u32 (*space)(struct push_buffer *);
+	u32 (*putptr)(struct push_buffer *);
+};
 
 struct host1x_syncpt_ops {
 	void (*reset)(struct host1x_syncpt *);
@@ -64,9 +111,19 @@  struct host1x {
 	struct host1x_device_info info;
 	struct clk *clk;
 
+	/* Sync point dedicated to replacing waits for expired fences */
+	struct host1x_syncpt *nop_sp;
+
+	struct host1x_channel_ops channel_op;
+	struct host1x_cdma_ops cdma_op;
+	struct host1x_pushbuffer_ops cdma_pb_op;
 	struct host1x_syncpt_ops syncpt_op;
 	struct host1x_intr_ops intr_op;
 
+	struct mutex chlist_mutex;
+	struct host1x_channel chlist;
+	int allocated_channels;
+
 	struct dentry *debugfs;
 };
 
@@ -84,5 +141,7 @@  struct host1x *host1x_get_host(struct platform_device *_dev)
 
 void host1x_sync_writel(struct host1x *host1x, u32 r, u32 v);
 u32 host1x_sync_readl(struct host1x *host1x, u32 r);
+void host1x_ch_writel(struct host1x_channel *ch, u32 r, u32 v);
+u32 host1x_ch_readl(struct host1x_channel *ch, u32 r);
 
 #endif
diff --git a/drivers/gpu/host1x/host1x.h b/drivers/gpu/host1x/host1x.h
new file mode 100644
index 0000000..ded0660
--- /dev/null
+++ b/drivers/gpu/host1x/host1x.h
@@ -0,0 +1,29 @@ 
+/*
+ * Tegra host1x driver
+ *
+ * Copyright (c) 2009-2013, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __LINUX_HOST1X_H
+#define __LINUX_HOST1X_H
+
+enum host1x_class {
+	NV_HOST1X_CLASS_ID		= 0x1,
+	NV_GRAPHICS_2D_CLASS_ID		= 0x51,
+};
+
+#endif
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
new file mode 100644
index 0000000..7a44418
--- /dev/null
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -0,0 +1,475 @@ 
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include "cdma.h"
+#include "channel.h"
+#include "dev.h"
+#include "memmgr.h"
+
+#include "cdma_hw.h"
+
+static inline u32 host1x_channel_dmactrl(int stop, int get_rst, int init_get)
+{
+	return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop)
+		| HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst)
+		| HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
+}
+
+static void cdma_timeout_handler(struct work_struct *work);
+
+/*
+ * push_buffer
+ *
+ * The push buffer is a circular array of words to be fetched by command DMA.
+ * Note that it works slightly differently to the sync queue; fence == cur
+ * means that the push buffer is full, not empty.
+ */
+
+
+/**
+ * Reset to empty push buffer
+ */
+static void push_buffer_reset(struct push_buffer *pb)
+{
+	pb->fence = PUSH_BUFFER_SIZE - 8;
+	pb->cur = 0;
+}
+
+/**
+ * Init push buffer resources
+ */
+static void push_buffer_destroy(struct push_buffer *pb);
+static int push_buffer_init(struct push_buffer *pb)
+{
+	struct host1x_cdma *cdma = pb_to_cdma(pb);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	pb->mapped = NULL;
+	pb->phys = 0;
+	pb->handle = NULL;
+
+	host1x->cdma_pb_op.reset(pb);
+
+	/* allocate and map pushbuffer memory */
+	pb->mapped = dma_alloc_writecombine(&host1x->dev->dev,
+			PUSH_BUFFER_SIZE + 4, &pb->phys, GFP_KERNEL);
+	if (!pb->mapped)
+		goto fail;
+
+	/* memory for storing mem client and handles for each opcode pair */
+	pb->handle = kzalloc(HOST1X_GATHER_QUEUE_SIZE *
+				sizeof(struct mem_handle *),
+			GFP_KERNEL);
+	if (!pb->handle)
+		goto fail;
+
+	/* put the restart at the end of pushbuffer memory */
+	*(pb->mapped + (PUSH_BUFFER_SIZE >> 2)) =
+		host1x_opcode_restart(pb->phys);
+
+	return 0;
+
+fail:
+	push_buffer_destroy(pb);
+	return -ENOMEM;
+}
+
+/*
+ * Clean up push buffer resources
+ */
+static void push_buffer_destroy(struct push_buffer *pb)
+{
+	struct host1x_cdma *cdma = pb_to_cdma(pb);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (pb->phys != 0)
+		dma_free_writecombine(&host1x->dev->dev,
+				PUSH_BUFFER_SIZE + 4,
+				pb->mapped, pb->phys);
+
+	kfree(pb->handle);
+
+	pb->mapped = NULL;
+	pb->phys = 0;
+	pb->handle = NULL;
+}
+
+/*
+ * Push two words to the push buffer
+ * Caller must ensure push buffer is not full
+ */
+static void push_buffer_push_to(struct push_buffer *pb,
+		struct mem_handle *handle,
+		u32 op1, u32 op2)
+{
+	u32 cur = pb->cur;
+	u32 *p = (u32 *)((u32)pb->mapped + cur);
+	u32 cur_mem = (cur/8) & (HOST1X_GATHER_QUEUE_SIZE - 1);
+	WARN_ON(cur == pb->fence);
+	*(p++) = op1;
+	*(p++) = op2;
+	pb->handle[cur_mem] = handle;
+	pb->cur = (cur + 8) & (PUSH_BUFFER_SIZE - 1);
+}
+
+/*
+ * Pop a number of two word slots from the push buffer
+ * Caller must ensure push buffer is not empty
+ */
+static void push_buffer_pop_from(struct push_buffer *pb,
+		unsigned int slots)
+{
+	/* Clear the mem references for old items from pb */
+	unsigned int i;
+	u32 fence_mem = pb->fence/8;
+	for (i = 0; i < slots; i++) {
+		int cur_fence_mem = (fence_mem+i)
+				& (HOST1X_GATHER_QUEUE_SIZE - 1);
+		pb->handle[cur_fence_mem] = NULL;
+	}
+	/* Advance the next write position */
+	pb->fence = (pb->fence + slots * 8) & (PUSH_BUFFER_SIZE - 1);
+}
+
+/*
+ * Return the number of two word slots free in the push buffer
+ */
+static u32 push_buffer_space(struct push_buffer *pb)
+{
+	return ((pb->fence - pb->cur) & (PUSH_BUFFER_SIZE - 1)) / 8;
+}
+
+static u32 push_buffer_putptr(struct push_buffer *pb)
+{
+	return pb->phys + pb->cur;
+}
+
+/*
+ * The syncpt incr buffer is filled with methods to increment syncpts, which
+ * is later GATHER-ed into the mainline PB. It's used when a timed out context
+ * is interleaved with other work, so needs to inline the syncpt increments
+ * to maintain the count (but otherwise does no work).
+ */
+
+/*
+ * Init timeout resources
+ */
+static int cdma_timeout_init(struct host1x_cdma *cdma,
+				 u32 syncpt_id)
+{
+	if (syncpt_id == NVSYNCPT_INVALID)
+		return -EINVAL;
+
+	INIT_DELAYED_WORK(&cdma->timeout.wq, cdma_timeout_handler);
+	cdma->timeout.initialized = true;
+
+	return 0;
+}
+
+/*
+ * Clean up timeout resources
+ */
+static void cdma_timeout_destroy(struct host1x_cdma *cdma)
+{
+	if (cdma->timeout.initialized)
+		cancel_delayed_work(&cdma->timeout.wq);
+	cdma->timeout.initialized = false;
+}
+
+/*
+ * Increment timedout buffer's syncpt via CPU.
+ */
+static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
+				u32 syncpt_incrs, u32 syncval, u32 nr_slots)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct push_buffer *pb = &cdma->push_buffer;
+	u32 i, getidx;
+
+	for (i = 0; i < syncpt_incrs; i++)
+		host1x_syncpt_cpu_incr(cdma->timeout.syncpt);
+
+	/* after CPU incr, ensure shadow is up to date */
+	host1x_syncpt_load_min(cdma->timeout.syncpt);
+
+	/* NOP all the PB slots */
+	getidx = getptr - pb->phys;
+	while (nr_slots--) {
+		u32 *p = (u32 *)((u32)pb->mapped + getidx);
+		*(p++) = HOST1X_OPCODE_NOOP;
+		*(p++) = HOST1X_OPCODE_NOOP;
+		dev_dbg(&host1x->dev->dev, "%s: NOP at 0x%x\n",
+			__func__, pb->phys + getidx);
+		getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+	}
+	wmb();
+}
+
+/*
+ * Start channel DMA
+ */
+static void cdma_start(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, put, end pointer (all of memory) */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->last_put, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
+
+	/* reset GET */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, true, true),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(false, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Similar to cdma_start(), but rather than starting from an idle
+ * state (where DMA GET is set to DMA PUT), on a timeout we restore
+ * DMA GET from an explicit value (so DMA may again be pending).
+ */
+static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, end pointer (all of memory) */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
+
+	/* set GET, by loading the value in PUT (then reset GET) */
+	host1x_ch_writel(ch, getptr, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, true, true),
+		HOST1X_CHANNEL_DMACTRL);
+
+	dev_dbg(&host1x->dev->dev,
+		"%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+		__func__,
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_put);
+
+	/* deassert GET reset and set PUT */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+	host1x_ch_writel(ch, cdma->last_put, HOST1X_CHANNEL_DMAPUT);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(false, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Kick channel DMA into action by writing its PUT offset (if it has changed)
+ */
+static void cdma_kick(struct host1x_cdma *cdma)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 put;
+
+	put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	if (put != cdma->last_put) {
+		host1x_ch_writel(ch, put, HOST1X_CHANNEL_DMAPUT);
+		cdma->last_put = put;
+	}
+}
+
+static void cdma_stop(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+	if (cdma->running) {
+		host1x_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_EMPTY);
+		host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+			HOST1X_CHANNEL_DMACTRL);
+		cdma->running = false;
+	}
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Stops both channel's command processor and CDMA immediately.
+ * Also, tears down the channel and resets corresponding module.
+ */
+static void cdma_timeout_teardown_begin(struct host1x_cdma *cdma)
+{
+	struct host1x *dev = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	if (cdma->torndown && !cdma->running) {
+		dev_warn(&dev->dev->dev, "Already torn down\n");
+		return;
+	}
+
+	dev_dbg(&dev->dev->dev,
+		"begin channel teardown (channel id %d)\n", ch->chid);
+
+	cmdproc_stop = host1x_sync_readl(dev, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop |= BIT(ch->chid);
+	host1x_sync_writel(dev, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(&dev->dev->dev,
+		"%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+		__func__,
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_put);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	host1x_sync_writel(dev, BIT(ch->chid), HOST1X_SYNC_CH_TEARDOWN);
+
+	cdma->running = false;
+	cdma->torndown = true;
+}
+
+static void cdma_timeout_teardown_end(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	dev_dbg(&host1x->dev->dev,
+		"end channel teardown (id %d, DMAGET restart = 0x%x)\n",
+		ch->chid, getptr);
+
+	cmdproc_stop = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop &= ~(BIT(ch->chid));
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	cdma->torndown = false;
+	cdma_timeout_restart(cdma, getptr);
+}
+
+/*
+ * If this timeout fires, it indicates the current sync_queue entry has
+ * exceeded its TTL and the userctx should be timed out and remaining
+ * submits already issued cleaned up (future submits return an error).
+ */
+static void cdma_timeout_handler(struct work_struct *work)
+{
+	struct host1x_cdma *cdma;
+	struct host1x *host1x;
+	struct host1x_channel *ch;
+
+	u32 syncpt_val;
+
+	u32 prev_cmdproc, cmdproc_stop;
+
+	cdma = container_of(to_delayed_work(work), struct host1x_cdma,
+			    timeout.wq);
+	host1x = cdma_to_host1x(cdma);
+	ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+
+	if (!cdma->timeout.clientid) {
+		dev_dbg(&host1x->dev->dev,
+			 "cdma_timeout: expired, but has no clientid\n");
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	/* stop processing to get a clean snapshot */
+	prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop = prev_cmdproc | BIT(ch->chid);
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(&host1x->dev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
+		prev_cmdproc, cmdproc_stop);
+
+	syncpt_val = host1x_syncpt_load_min(host1x->syncpt);
+
+	/* has buffer actually completed? */
+	if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
+		dev_dbg(&host1x->dev->dev,
+			 "cdma_timeout: expired, but buffer had completed\n");
+		/* restore */
+		cmdproc_stop = prev_cmdproc & ~(BIT(ch->chid));
+		host1x_sync_writel(host1x, cmdproc_stop,
+			HOST1X_SYNC_CMDPROC_STOP);
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	dev_warn(&host1x->dev->dev,
+		"%s: timeout: %d (%s), HW thresh %d, done %d\n",
+		__func__,
+		cdma->timeout.syncpt->id, cdma->timeout.syncpt->name,
+		syncpt_val, cdma->timeout.syncpt_val);
+
+	/* stop HW, resetting channel/module */
+	host1x->cdma_op.timeout_teardown_begin(cdma);
+
+	host1x_cdma_update_sync_queue(cdma, ch->dev);
+	mutex_unlock(&cdma->lock);
+}
+
+static const struct host1x_cdma_ops host1x_cdma_ops = {
+	.start = cdma_start,
+	.stop = cdma_stop,
+	.kick = cdma_kick,
+
+	.timeout_init = cdma_timeout_init,
+	.timeout_destroy = cdma_timeout_destroy,
+	.timeout_teardown_begin = cdma_timeout_teardown_begin,
+	.timeout_teardown_end = cdma_timeout_teardown_end,
+	.timeout_cpu_incr = cdma_timeout_cpu_incr,
+};
+
+static const struct host1x_pushbuffer_ops host1x_pushbuffer_ops = {
+	.reset = push_buffer_reset,
+	.init = push_buffer_init,
+	.destroy = push_buffer_destroy,
+	.push_to = push_buffer_push_to,
+	.pop_from = push_buffer_pop_from,
+	.space = push_buffer_space,
+	.putptr = push_buffer_putptr,
+};
+
diff --git a/drivers/gpu/host1x/hw/cdma_hw.h b/drivers/gpu/host1x/hw/cdma_hw.h
new file mode 100644
index 0000000..80a085a
--- /dev/null
+++ b/drivers/gpu/host1x/hw/cdma_hw.h
@@ -0,0 +1,37 @@ 
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2011-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CDMA_HW_H
+#define __HOST1X_CDMA_HW_H
+
+/*
+ * Size of the sync queue. If it is too small, we won't be able to queue up
+ * many command buffers. If it is too large, we waste memory.
+ */
+#define HOST1X_SYNC_QUEUE_SIZE 512
+
+/*
+ * Number of gathers we allow to be queued up per channel. Must be a
+ * power of two. Currently sized such that pushbuffer is 4KB (512*8B).
+ */
+#define HOST1X_GATHER_QUEUE_SIZE 512
+
+/* 8 bytes per slot. (This number does not include the final RESTART.) */
+#define PUSH_BUFFER_SIZE (HOST1X_GATHER_QUEUE_SIZE * 8)
+
+#endif
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
new file mode 100644
index 0000000..905cfd2
--- /dev/null
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -0,0 +1,148 @@ 
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "host1x.h"
+#include "channel.h"
+#include "dev.h"
+#include <linux/slab.h>
+#include "intr.h"
+#include "job.h"
+#include <trace/events/host1x.h>
+
+static void submit_gathers(struct host1x_job *job)
+{
+	/* push user gathers */
+	int i;
+	for (i = 0 ; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		u32 op1 = host1x_opcode_gather(g->words);
+		u32 op2 = g->mem_base + g->offset;
+		host1x_cdma_push_gather(&job->ch->cdma,
+				job->gathers[i].ref,
+				job->gathers[i].offset,
+				op1, op2);
+	}
+}
+
+static int channel_submit(struct host1x_job *job)
+{
+	struct host1x_channel *ch = job->ch;
+	struct host1x_syncpt *sp;
+	u32 user_syncpt_incrs = job->syncpt_incrs;
+	u32 prev_max = 0;
+	u32 syncval;
+	int err;
+	void *completed_waiter = NULL;
+
+	sp = host1x_get_host(job->ch->dev)->syncpt + job->syncpt_id;
+	trace_host1x_channel_submit(ch->dev->name,
+			job->num_gathers, job->num_relocs, job->num_waitchk,
+			job->syncpt_id, job->syncpt_incrs);
+
+	/* before error checks, return current max */
+	prev_max = job->syncpt_end = host1x_syncpt_read_max(sp);
+
+	/* get submit lock */
+	err = mutex_lock_interruptible(&ch->submitlock);
+	if (err)
+		goto error;
+
+	completed_waiter = host1x_intr_alloc_waiter();
+	if (!completed_waiter) {
+		mutex_unlock(&ch->submitlock);
+		err = -ENOMEM;
+		goto error;
+	}
+
+	/* begin a CDMA submit */
+	err = host1x_cdma_begin(&ch->cdma, job);
+	if (err) {
+		mutex_unlock(&ch->submitlock);
+		goto error;
+	}
+
+	if (job->serialize) {
+		/*
+		 * Force serialization by inserting a host wait for the
+		 * previous job to finish before this one can commence.
+		 */
+		host1x_cdma_push(&ch->cdma,
+				host1x_opcode_setclass(NV_HOST1X_CLASS_ID,
+					host1x_uclass_wait_syncpt_r(),
+					1),
+				host1x_class_host_wait_syncpt(job->syncpt_id,
+					host1x_syncpt_read_max(sp)));
+	}
+
+	syncval = host1x_syncpt_incr_max(sp, user_syncpt_incrs);
+
+	job->syncpt_end = syncval;
+
+	/* add a setclass for modules that require it */
+	if (job->class)
+		host1x_cdma_push(&ch->cdma,
+			host1x_opcode_setclass(job->class, 0, 0),
+			HOST1X_OPCODE_NOOP);
+
+	submit_gathers(job);
+
+	/* end CDMA submit & stash pinned hMems into sync queue */
+	host1x_cdma_end(&ch->cdma, job);
+
+	trace_host1x_channel_submitted(ch->dev->name,
+			prev_max, syncval);
+
+	/* schedule a submit complete interrupt */
+	err = host1x_intr_add_action(&host1x_get_host(ch->dev)->intr,
+			job->syncpt_id, syncval,
+			HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch,
+			completed_waiter,
+			NULL);
+	completed_waiter = NULL;
+	WARN(err, "Failed to set submit complete interrupt");
+
+	mutex_unlock(&ch->submitlock);
+
+	return 0;
+
+error:
+	kfree(completed_waiter);
+	return err;
+}
+
+static inline void __iomem *host1x_channel_regs(void __iomem *p, int ndx)
+{
+	p += ndx * NV_HOST1X_CHANNEL_MAP_SIZE_BYTES;
+	return p;
+}
+
+static int host1x_channel_init(struct host1x_channel *ch,
+	struct host1x *dev, int index)
+{
+	ch->chid = index;
+	mutex_init(&ch->reflock);
+	mutex_init(&ch->submitlock);
+
+	ch->regs = host1x_channel_regs(dev->regs, index);
+	return 0;
+}
+
+static const struct host1x_channel_ops host1x_channel_ops = {
+	.init = host1x_channel_init,
+	.submit = channel_submit,
+};
diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
index 3d633a3..7569a1e 100644
--- a/drivers/gpu/host1x/hw/host1x01.c
+++ b/drivers/gpu/host1x/hw/host1x01.c
@@ -23,13 +23,19 @@ 
 
 #include "hw/host1x01.h"
 #include "dev.h"
+#include "channel.h"
 #include "hw/host1x01_hardware.h"
 
+#include "hw/channel_hw.c"
+#include "hw/cdma_hw.c"
 #include "hw/syncpt_hw.c"
 #include "hw/intr_hw.c"
 
 int host1x01_init(struct host1x *host)
 {
+	host->channel_op = host1x_channel_ops;
+	host->cdma_op = host1x_cdma_ops;
+	host->cdma_pb_op = host1x_pushbuffer_ops;
 	host->syncpt_op = host1x_syncpt_ops;
 	host->intr_op = host1x_intr_ops;
 
diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
index c1d5324..03873c0 100644
--- a/drivers/gpu/host1x/hw/host1x01_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
@@ -21,6 +21,130 @@ 
 
 #include <linux/types.h>
 #include <linux/bitops.h>
+#include "hw_host1x01_channel.h"
 #include "hw_host1x01_sync.h"
+#include "hw_host1x01_uclass.h"
+
+/* channel registers */
+#define NV_HOST1X_CHANNEL_MAP_SIZE_BYTES 16384
+
+static inline u32 host1x_class_host_wait_syncpt(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_wait_syncpt_indx_f(indx)
+		| host1x_uclass_wait_syncpt_thresh_f(threshold);
+}
+
+static inline u32 host1x_class_host_load_syncpt_base(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_load_syncpt_base_base_indx_f(indx)
+		| host1x_uclass_load_syncpt_base_value_f(threshold);
+}
+
+static inline u32 host1x_class_host_wait_syncpt_base(
+	unsigned indx, unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_wait_syncpt_base_indx_f(indx)
+		| host1x_uclass_wait_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_wait_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt_base(
+	unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_incr_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_incr_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt(
+	unsigned cond, unsigned indx)
+{
+	return host1x_uclass_incr_syncpt_cond_f(cond)
+		| host1x_uclass_incr_syncpt_indx_f(indx);
+}
+
+static inline u32 host1x_class_host_indoff_reg_write(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indbe_f(0xf)
+		| host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset);
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+static inline u32 host1x_class_host_indoff_reg_read(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset)
+		| host1x_uclass_indoff_rwn_read_v();
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+
+/* cdma opcodes */
+static inline u32 host1x_opcode_setclass(
+	unsigned class_id, unsigned offset, unsigned mask)
+{
+	return (0 << 28) | (offset << 16) | (class_id << 6) | mask;
+}
+
+static inline u32 host1x_opcode_incr(unsigned offset, unsigned count)
+{
+	return (1 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_nonincr(unsigned offset, unsigned count)
+{
+	return (2 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_mask(unsigned offset, unsigned mask)
+{
+	return (3 << 28) | (offset << 16) | mask;
+}
+
+static inline u32 host1x_opcode_imm(unsigned offset, unsigned value)
+{
+	return (4 << 28) | (offset << 16) | value;
+}
+
+static inline u32 host1x_opcode_imm_incr_syncpt(unsigned cond, unsigned indx)
+{
+	return host1x_opcode_imm(host1x_uclass_incr_syncpt_r(),
+		host1x_class_host_incr_syncpt(cond, indx));
+}
+
+static inline u32 host1x_opcode_restart(unsigned address)
+{
+	return (5 << 28) | (address >> 4);
+}
+
+static inline u32 host1x_opcode_gather(unsigned count)
+{
+	return (6 << 28) | count;
+}
+
+static inline u32 host1x_opcode_gather_nonincr(unsigned offset,	unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | count;
+}
+
+static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
+}
+
+#define HOST1X_OPCODE_NOOP host1x_opcode_nonincr(0, 0)
+
+static inline u32 host1x_mask2(unsigned x, unsigned y)
+{
+	return 1 | (1 << (y - x));
+}
 
 #endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
new file mode 100644
index 0000000..dad4fee
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
@@ -0,0 +1,102 @@ 
+/*
+ * Copyright (c) 2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_channel_host1x_h__
+#define __hw_host1x_channel_host1x_h__
+
+static inline u32 host1x_channel_dmastart_r(void)
+{
+	return 0x14;
+}
+#define HOST1X_CHANNEL_DMASTART \
+	host1x_channel_dmastart_r()
+static inline u32 host1x_channel_dmaput_r(void)
+{
+	return 0x18;
+}
+#define HOST1X_CHANNEL_DMAPUT \
+	host1x_channel_dmaput_r()
+static inline u32 host1x_channel_dmaget_r(void)
+{
+	return 0x1c;
+}
+#define HOST1X_CHANNEL_DMAGET \
+	host1x_channel_dmaget_r()
+static inline u32 host1x_channel_dmaend_r(void)
+{
+	return 0x20;
+}
+#define HOST1X_CHANNEL_DMAEND \
+	host1x_channel_dmaend_r()
+static inline u32 host1x_channel_dmactrl_r(void)
+{
+	return 0x24;
+}
+#define HOST1X_CHANNEL_DMACTRL \
+	host1x_channel_dmactrl_r()
+static inline u32 host1x_channel_dmactrl_dmastop_f(u32 v)
+{
+	return (v & 0x1) << 0;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMASTOP_F(v) \
+	host1x_channel_dmactrl_dmastop_f(v)
+static inline u32 host1x_channel_dmactrl_dmagetrst_f(u32 v)
+{
+	return (v & 0x1) << 1;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(v) \
+	host1x_channel_dmactrl_dmagetrst_f(v)
+static inline u32 host1x_channel_dmactrl_dmainitget_f(u32 v)
+{
+	return (v & 0x1) << 2;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(v) \
+	host1x_channel_dmactrl_dmainitget_f(v)
+#endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
index 5da9afb..3073d37 100644
--- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
@@ -69,6 +69,18 @@  static inline u32 host1x_sync_syncpt_thresh_int_enable_cpu0_r(void)
 }
 #define HOST1X_SYNC_SYNCPT_THRESH_INT_ENABLE_CPU0 \
 	host1x_sync_syncpt_thresh_int_enable_cpu0_r()
+static inline u32 host1x_sync_cmdproc_stop_r(void)
+{
+	return 0xac;
+}
+#define HOST1X_SYNC_CMDPROC_STOP \
+	host1x_sync_cmdproc_stop_r()
+static inline u32 host1x_sync_ch_teardown_r(void)
+{
+	return 0xb0;
+}
+#define HOST1X_SYNC_CH_TEARDOWN \
+	host1x_sync_ch_teardown_r()
 static inline u32 host1x_sync_usec_clk_r(void)
 {
 	return 0x1a4;
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
new file mode 100644
index 0000000..7af6609
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
@@ -0,0 +1,168 @@ 
+/*
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_uclass_host1x_h__
+#define __hw_host1x_uclass_host1x_h__
+
+static inline u32 host1x_uclass_incr_syncpt_r(void)
+{
+	return 0x0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT \
+	host1x_uclass_incr_syncpt_r()
+static inline u32 host1x_uclass_incr_syncpt_cond_f(u32 v)
+{
+	return (v & 0xff) << 8;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_COND_F(v) \
+	host1x_uclass_incr_syncpt_cond_f(v)
+static inline u32 host1x_uclass_incr_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_INDX_F(v) \
+	host1x_uclass_incr_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_r(void)
+{
+	return 0x8;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT \
+	host1x_uclass_wait_syncpt_r()
+static inline u32 host1x_uclass_wait_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_INDX_F(v) \
+	host1x_uclass_wait_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_thresh_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_THRESH_F(v) \
+	host1x_uclass_wait_syncpt_thresh_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 16;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_wait_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_load_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_value_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(v) \
+	host1x_uclass_load_syncpt_base_value_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_incr_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_incr_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_indoff_r(void)
+{
+	return 0x2d;
+}
+#define HOST1X_UCLASS_INDOFF \
+	host1x_uclass_indoff_r()
+static inline u32 host1x_uclass_indoff_indbe_f(u32 v)
+{
+	return (v & 0xf) << 28;
+}
+#define HOST1X_UCLASS_INDOFF_INDBE_F(v) \
+	host1x_uclass_indoff_indbe_f(v)
+static inline u32 host1x_uclass_indoff_autoinc_f(u32 v)
+{
+	return (v & 0x1) << 27;
+}
+#define HOST1X_UCLASS_INDOFF_AUTOINC_F(v) \
+	host1x_uclass_indoff_autoinc_f(v)
+static inline u32 host1x_uclass_indoff_indmodid_f(u32 v)
+{
+	return (v & 0xff) << 18;
+}
+#define HOST1X_UCLASS_INDOFF_INDMODID_F(v) \
+	host1x_uclass_indoff_indmodid_f(v)
+static inline u32 host1x_uclass_indoff_indroffset_f(u32 v)
+{
+	return (v & 0xffff) << 2;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_indoff_rwn_read_v(void)
+{
+	return 1;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+#endif
diff --git a/drivers/gpu/host1x/hw/syncpt_hw.c b/drivers/gpu/host1x/hw/syncpt_hw.c
index 16e3ada..ba48cee 100644
--- a/drivers/gpu/host1x/hw/syncpt_hw.c
+++ b/drivers/gpu/host1x/hw/syncpt_hw.c
@@ -97,6 +97,15 @@  static void syncpt_cpu_incr(struct host1x_syncpt *sp)
 	wmb();
 }
 
+/* remove a wait pointed to by patch_addr */
+static int syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
+{
+	u32 override = host1x_class_host_wait_syncpt(
+			NVSYNCPT_GRAPHICS_HOST, 0);
+	__raw_writel(override, patch_addr);
+	return 0;
+}
+
 static const char *syncpt_name(struct host1x_syncpt *sp)
 {
 	struct host1x_device_info *info = &sp->dev->info;
@@ -141,6 +150,7 @@  static const struct host1x_syncpt_ops host1x_syncpt_ops = {
 	.read_wait_base = syncpt_read_wait_base,
 	.load_min = syncpt_load_min,
 	.cpu_incr = syncpt_cpu_incr,
+	.patch_wait = syncpt_patch_wait,
 	.debug = syncpt_debug,
 	.name = syncpt_name,
 };
diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c
index 26099b8..9d0b5f1 100644
--- a/drivers/gpu/host1x/intr.c
+++ b/drivers/gpu/host1x/intr.c
@@ -20,6 +20,8 @@ 
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
+#include <trace/events/host1x.h>
+#include "channel.h"
 #include "dev.h"
 
 /* Wait list management */
@@ -74,7 +76,7 @@  static void remove_completed_waiters(struct list_head *head, u32 sync,
 			struct list_head completed[HOST1X_INTR_ACTION_COUNT])
 {
 	struct list_head *dest;
-	struct host1x_waitlist *waiter, *next;
+	struct host1x_waitlist *waiter, *next, *prev;
 
 	list_for_each_entry_safe(waiter, next, head, list) {
 		if ((s32)(waiter->thresh - sync) > 0)
@@ -82,6 +84,17 @@  static void remove_completed_waiters(struct list_head *head, u32 sync,
 
 		dest = completed + waiter->action;
 
+		/* consolidate submit cleanups */
+		if (waiter->action == HOST1X_INTR_ACTION_SUBMIT_COMPLETE
+			&& !list_empty(dest)) {
+			prev = list_entry(dest->prev,
+					struct host1x_waitlist, list);
+			if (prev->data == waiter->data) {
+				prev->count++;
+				dest = NULL;
+			}
+		}
+
 		/* PENDING->REMOVED or CANCELLED->HANDLED */
 		if (atomic_inc_return(&waiter->state) == WLS_HANDLED || !dest) {
 			list_del(&waiter->list);
@@ -104,6 +117,19 @@  static void reset_threshold_interrupt(struct host1x_intr *intr,
 	host1x->intr_op.enable_syncpt_intr(intr, id);
 }
 
+static void action_submit_complete(struct host1x_waitlist *waiter)
+{
+	struct host1x_channel *channel = waiter->data;
+	int nr_completed = waiter->count;
+
+	host1x_cdma_update(&channel->cdma);
+
+	/*  Add nr_completed to trace */
+	trace_host1x_channel_submit_complete(channel->dev->name,
+			nr_completed, waiter->thresh);
+
+}
+
 static void action_wakeup(struct host1x_waitlist *waiter)
 {
 	wait_queue_head_t *wq = waiter->data;
@@ -121,6 +147,7 @@  static void action_wakeup_interruptible(struct host1x_waitlist *waiter)
 typedef void (*action_handler)(struct host1x_waitlist *waiter);
 
 static action_handler action_handlers[HOST1X_INTR_ACTION_COUNT] = {
+	action_submit_complete,
 	action_wakeup,
 	action_wakeup_interruptible,
 };
diff --git a/drivers/gpu/host1x/intr.h b/drivers/gpu/host1x/intr.h
index 679a7b4..979b929 100644
--- a/drivers/gpu/host1x/intr.h
+++ b/drivers/gpu/host1x/intr.h
@@ -24,6 +24,12 @@ 
 
 enum host1x_intr_action {
 	/*
+	 * Perform cleanup after a submit has completed.
+	 * 'data' points to a channel
+	 */
+	HOST1X_INTR_ACTION_SUBMIT_COMPLETE = 0,
+
+	/*
 	 * Wake up a  task.
 	 * 'data' points to a wait_queue_head_t
 	 */
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
new file mode 100644
index 0000000..cc9c84a
--- /dev/null
+++ b/drivers/gpu/host1x/job.c
@@ -0,0 +1,612 @@ 
+/*
+ * Tegra host1x Job
+ *
+ * Copyright (c) 2010-2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/scatterlist.h>
+#include <trace/events/host1x.h>
+#include <linux/dma-mapping.h>
+#include "job.h"
+#include "channel.h"
+#include "syncpt.h"
+#include "dev.h"
+#include "memmgr.h"
+
+#ifdef CONFIG_TEGRA_HOST1X_FIREWALL
+static int host1x_firewall = 1;
+#else
+static int host1x_firewall;
+#endif
+
+struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
+		u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks)
+{
+	struct host1x_job *job = NULL;
+	int num_unpins = num_cmdbufs + num_relocs;
+	s64 total;
+	void *mem;
+
+	/* Check that we're not going to overflow */
+	total = sizeof(struct host1x_job)
+			+ num_relocs * sizeof(struct host1x_reloc)
+			+ num_unpins * sizeof(struct host1x_job_unpin_data)
+			+ num_waitchks * sizeof(struct host1x_waitchk)
+			+ num_cmdbufs * sizeof(struct host1x_job_gather)
+			+ num_unpins * sizeof(dma_addr_t)
+			+ num_unpins * sizeof(u32 *);
+	if (total > ULONG_MAX)
+		return NULL;
+
+	mem = job = kzalloc(total, GFP_KERNEL);
+	if (!job)
+		return NULL;
+
+	kref_init(&job->ref);
+	job->ch = ch;
+
+	/* First init state to zero */
+
+	/*
+	 * Redistribute memory to the structs.
+	 * Overflows and negative conditions have
+	 * already been checked in job_alloc().
+	 */
+	mem += sizeof(struct host1x_job);
+	job->relocarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct host1x_reloc);
+	job->unpins = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(struct host1x_job_unpin_data);
+	job->waitchk = num_waitchks ? mem : NULL;
+	mem += num_waitchks * sizeof(struct host1x_waitchk);
+	job->gathers = num_cmdbufs ? mem : NULL;
+	mem += num_cmdbufs * sizeof(struct host1x_job_gather);
+	job->addr_phys = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(dma_addr_t);
+	job->pin_ids = num_unpins ? mem : NULL;
+
+	job->reloc_addr_phys = job->addr_phys;
+	job->gather_addr_phys = &job->addr_phys[num_relocs];
+
+	return job;
+}
+
+void host1x_job_get(struct host1x_job *job)
+{
+	kref_get(&job->ref);
+}
+
+static void job_free(struct kref *ref)
+{
+	struct host1x_job *job = container_of(ref, struct host1x_job, ref);
+
+	kfree(job);
+}
+
+void host1x_job_put(struct host1x_job *job)
+{
+	kref_put(&job->ref, job_free);
+}
+
+void host1x_job_add_gather(struct host1x_job *job,
+		u32 mem_id, u32 words, u32 offset)
+{
+	struct host1x_job_gather *cur_gather =
+			&job->gathers[job->num_gathers];
+
+	cur_gather->words = words;
+	cur_gather->mem_id = mem_id;
+	cur_gather->offset = offset;
+	job->num_gathers++;
+}
+
+/*
+ * Check driver supplied waitchk structs for syncpt thresholds
+ * that have already been satisfied and NULL the comparison (to
+ * avoid a wrap condition in the HW).
+ */
+static int do_waitchks(struct host1x_job *job, struct host1x *host,
+		u32 patch_mem, struct mem_handle *h)
+{
+	int i;
+
+	/* compare syncpt vs wait threshold */
+	for (i = 0; i < job->num_waitchk; i++) {
+		struct host1x_waitchk *wait = &job->waitchk[i];
+		struct host1x_syncpt *sp =
+			host1x_syncpt_get(host, wait->syncpt_id);
+
+		/* validate syncpt id */
+		if (wait->syncpt_id > host1x_syncpt_nb_pts(host))
+			continue;
+
+		/* skip all other gathers */
+		if (patch_mem != wait->mem)
+			continue;
+
+		trace_host1x_syncpt_wait_check(wait->mem, wait->offset,
+				wait->syncpt_id, wait->thresh,
+				host1x_syncpt_read_min(sp));
+		if (host1x_syncpt_is_expired(
+			host1x_syncpt_get(host, wait->syncpt_id),
+			wait->thresh)) {
+			struct host1x_syncpt *sp =
+				host1x_syncpt_get(host, wait->syncpt_id);
+
+			void *patch_addr = NULL;
+
+			/*
+			 * NULL an already satisfied WAIT_SYNCPT host method,
+			 * by patching its args in the command stream. The
+			 * method data is changed to reference a reserved
+			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
+			 * syncpt with a matching threshold value of 0, so
+			 * is guaranteed to be popped by the host HW.
+			 */
+			dev_dbg(&host->dev->dev,
+			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
+			    wait->syncpt_id, sp->name, wait->thresh,
+			    host1x_syncpt_read_min(sp));
+
+			/* patch the wait */
+			patch_addr = host1x_memmgr_kmap(h,
+					wait->offset >> PAGE_SHIFT);
+			if (patch_addr) {
+				host1x_syncpt_patch_wait(sp,
+					(patch_addr +
+						(wait->offset & ~PAGE_MASK)));
+				host1x_memmgr_kunmap(h,
+						wait->offset >> PAGE_SHIFT,
+						patch_addr);
+			} else {
+				pr_err("Couldn't map cmdbuf for wait check\n");
+			}
+		}
+
+		wait->mem = 0;
+	}
+	return 0;
+}
+
+
+static int pin_job_mem(struct host1x_job *job)
+{
+	int i;
+	int count = 0;
+	int result;
+
+	for (i = 0; i < job->num_relocs; i++) {
+		struct host1x_reloc *reloc = &job->relocarray[i];
+		job->pin_ids[count] = reloc->target;
+		count++;
+	}
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		job->pin_ids[count] = g->mem_id;
+		count++;
+	}
+
+	/* validate array and pin unique ids, get refs for unpinning */
+	result = host1x_memmgr_pin_array_ids(job->ch->dev,
+		job->pin_ids, job->addr_phys,
+		count,
+		job->unpins);
+
+	if (result > 0)
+		job->num_unpins = result;
+
+	return result;
+}
+
+static int do_relocs(struct host1x_job *job,
+		u32 cmdbuf_mem, struct mem_handle *h)
+{
+	int i = 0;
+	int last_page = -1;
+	void *cmdbuf_page_addr = NULL;
+
+	/* pin & patch the relocs for one gather */
+	while (i < job->num_relocs) {
+		struct host1x_reloc *reloc = &job->relocarray[i];
+
+		/* skip all other gathers */
+		if (cmdbuf_mem != reloc->cmdbuf_mem) {
+			i++;
+			continue;
+		}
+
+		if (last_page != reloc->cmdbuf_offset >> PAGE_SHIFT) {
+			if (cmdbuf_page_addr)
+				host1x_memmgr_kunmap(h,
+						last_page, cmdbuf_page_addr);
+
+			cmdbuf_page_addr = host1x_memmgr_kmap(h,
+					reloc->cmdbuf_offset >> PAGE_SHIFT);
+			last_page = reloc->cmdbuf_offset >> PAGE_SHIFT;
+
+			if (unlikely(!cmdbuf_page_addr)) {
+				pr_err("Couldn't map cmdbuf for relocation\n");
+				return -ENOMEM;
+			}
+		}
+
+		__raw_writel(
+			(job->reloc_addr_phys[i] +
+				reloc->target_offset) >> reloc->shift,
+			(cmdbuf_page_addr +
+				(reloc->cmdbuf_offset & ~PAGE_MASK)));
+
+		/* remove completed reloc from the job */
+		if (i != job->num_relocs - 1) {
+			struct host1x_reloc *reloc_last =
+				&job->relocarray[job->num_relocs - 1];
+			reloc->cmdbuf_mem	= reloc_last->cmdbuf_mem;
+			reloc->cmdbuf_offset	= reloc_last->cmdbuf_offset;
+			reloc->target		= reloc_last->target;
+			reloc->target_offset	= reloc_last->target_offset;
+			reloc->shift		= reloc_last->shift;
+			job->reloc_addr_phys[i] =
+				job->reloc_addr_phys[job->num_relocs - 1];
+			job->num_relocs--;
+		} else {
+			break;
+		}
+	}
+
+	if (cmdbuf_page_addr)
+		host1x_memmgr_kunmap(h, last_page, cmdbuf_page_addr);
+
+	return 0;
+}
+
+static int check_reloc(struct host1x_reloc *reloc,
+		u32 cmdbuf_id, int offset)
+{
+	int err = 0;
+	if (reloc->cmdbuf_mem != cmdbuf_id
+			|| reloc->cmdbuf_offset != offset * sizeof(u32))
+		err = -EINVAL;
+
+	return err;
+}
+
+static int check_mask(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 mask)
+{
+	while (mask) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (mask & 1) {
+			if (job->is_addr_reg(pdev, class, reg)) {
+				if (!*num_relocs ||
+					check_reloc(*reloc, cmdbuf_id, *offset))
+					return -EINVAL;
+				(*reloc)++;
+				(*num_relocs)--;
+			}
+			(*words)--;
+			(*offset)++;
+		}
+		mask >>= 1;
+		reg += 1;
+	}
+
+	return 0;
+}
+
+static int check_incr(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 count)
+{
+	while (count) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (job->is_addr_reg(pdev, class, reg)) {
+			if (!*num_relocs ||
+				check_reloc(*reloc, cmdbuf_id, *offset))
+				return -EINVAL;
+			(*reloc)++;
+			(*num_relocs)--;
+		}
+		reg += 1;
+		(*words)--;
+		(*offset)++;
+		count--;
+	}
+
+	return 0;
+}
+
+static int check_nonincr(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 count)
+{
+	int is_addr_reg = job->is_addr_reg(pdev, class, reg);
+
+	while (count) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (is_addr_reg) {
+			if (!*num_relocs ||
+				check_reloc(*reloc, cmdbuf_id, *offset))
+				return -EINVAL;
+			(*reloc)++;
+			(*num_relocs)--;
+		}
+		(*words)--;
+		(*offset)++;
+		count--;
+	}
+
+	return 0;
+}
+
+static int validate(struct host1x_job *job, struct platform_device *pdev,
+		struct host1x_job_gather *g)
+{
+	struct host1x_reloc *reloc = job->relocarray;
+	int num_relocs = job->num_relocs;
+	u32 *cmdbuf_base;
+	int offset = 0;
+	unsigned int words;
+	int err = 0;
+	int class = 0;
+
+	if (!job->is_addr_reg)
+		return 0;
+
+	cmdbuf_base = host1x_memmgr_mmap(g->ref);
+	if (!cmdbuf_base)
+		return -ENOMEM;
+
+	words = g->words;
+	while (words && !err) {
+		u32 word = cmdbuf_base[offset];
+		u32 opcode = (word & 0xf0000000) >> 28;
+		u32 mask = 0;
+		u32 reg = 0;
+		u32 count = 0;
+
+		words--;
+		offset++;
+
+		switch (opcode) {
+		case 0:
+			class = word >> 6 & 0x3ff;
+			mask = word & 0x3f;
+			reg = word >> 16 & 0xfff;
+			err = check_mask(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, mask);
+			if (err)
+				goto out;
+			break;
+		case 1:
+			reg = word >> 16 & 0xfff;
+			count = word & 0xffff;
+			err = check_incr(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, count);
+			if (err)
+				goto out;
+			break;
+
+		case 2:
+			reg = word >> 16 & 0xfff;
+			count = word & 0xffff;
+			err = check_nonincr(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, count);
+			if (err)
+				goto out;
+			break;
+
+		case 3:
+			mask = word & 0xffff;
+			reg = word >> 16 & 0xfff;
+			err = check_mask(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, mask);
+			if (err)
+				goto out;
+			break;
+		case 4:
+		case 5:
+		case 14:
+			break;
+		default:
+			err = -EINVAL;
+			break;
+		}
+	}
+
+	/* No relocs should remain at this point */
+	if (num_relocs)
+		err = -EINVAL;
+
+out:
+	host1x_memmgr_munmap(g->ref, cmdbuf_base);
+
+	return err;
+}
+
+static inline int copy_gathers(struct host1x_job *job,
+		struct platform_device *pdev)
+{
+	size_t size = 0;
+	size_t offset = 0;
+	int i;
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		size += g->words * sizeof(u32);
+	}
+
+	job->gather_copy_mapped = dma_alloc_writecombine(&pdev->dev,
+			size, &job->gather_copy, GFP_KERNEL);
+	if (IS_ERR(job->gather_copy_mapped)) {
+		int err = PTR_ERR(job->gather_copy_mapped);
+		job->gather_copy_mapped = NULL;
+		return err;
+	}
+
+	job->gather_copy_size = size;
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		void *gather = host1x_memmgr_mmap(g->ref);
+		memcpy(job->gather_copy_mapped + offset,
+				gather + g->offset,
+				g->words * sizeof(u32));
+
+		g->mem_base = job->gather_copy;
+		g->offset = offset;
+		g->mem_id = 0;
+		g->ref = 0;
+
+		host1x_memmgr_munmap(g->ref, gather);
+		offset += g->words * sizeof(u32);
+	}
+
+	return 0;
+}
+
+int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev)
+{
+	int err = 0, i = 0, j = 0;
+	struct host1x *host = host1x_get_host(pdev);
+	DECLARE_BITMAP(waitchk_mask, host1x_syncpt_nb_pts(host));
+
+	bitmap_zero(waitchk_mask, host1x_syncpt_nb_pts(host));
+	for (i = 0; i < job->num_waitchk; i++) {
+		u32 syncpt_id = job->waitchk[i].syncpt_id;
+		if (syncpt_id < host1x_syncpt_nb_pts(host))
+			set_bit(syncpt_id, waitchk_mask);
+	}
+
+	/* get current syncpt values for waitchk */
+	for_each_set_bit(i, &waitchk_mask[0], sizeof(waitchk_mask))
+		host1x_syncpt_load_min(host->syncpt + i);
+
+	/* pin memory */
+	err = pin_job_mem(job);
+	if (err <= 0)
+		goto out;
+
+	/* patch gathers */
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+
+		/* process each gather mem only once */
+		if (!g->ref) {
+			g->ref = host1x_memmgr_get(g->mem_id, job->ch->dev);
+			if (IS_ERR(g->ref)) {
+				err = PTR_ERR(g->ref);
+				g->ref = NULL;
+				break;
+			}
+
+			g->mem_base = job->gather_addr_phys[i];
+
+			for (j = 0; j < job->num_gathers; j++) {
+				struct host1x_job_gather *tmp =
+					&job->gathers[j];
+				if (!tmp->ref && tmp->mem_id == g->mem_id) {
+					tmp->ref = g->ref;
+					tmp->mem_base = g->mem_base;
+				}
+			}
+			err = 0;
+			if (host1x_firewall)
+				err = validate(job, pdev, g);
+			if (err)
+				dev_err(&pdev->dev,
+					"Job validate returned %d\n", err);
+			if (!err)
+				err = do_relocs(job, g->mem_id,  g->ref);
+			if (!err)
+				err = do_waitchks(job, host,
+						g->mem_id, g->ref);
+			host1x_memmgr_put(g->ref);
+			if (err)
+				break;
+		}
+	}
+
+	if (host1x_firewall && !err) {
+		err = copy_gathers(job, pdev);
+		if (err) {
+			host1x_job_unpin(job);
+			return err;
+		}
+	}
+
+out:
+	wmb();
+
+	return err;
+}
+
+void host1x_job_unpin(struct host1x_job *job)
+{
+	int i;
+
+	for (i = 0; i < job->num_unpins; i++) {
+		struct host1x_job_unpin_data *unpin = &job->unpins[i];
+		host1x_memmgr_unpin(unpin->h, unpin->mem);
+		host1x_memmgr_put(unpin->h);
+	}
+	job->num_unpins = 0;
+
+	if (job->gather_copy_size)
+		dma_free_writecombine(&job->ch->dev->dev,
+			job->gather_copy_size,
+			job->gather_copy_mapped, job->gather_copy);
+}
+
+/*
+ * Debug routine used to dump job entries
+ */
+void host1x_job_dump(struct device *dev, struct host1x_job *job)
+{
+	dev_dbg(dev, "    SYNCPT_ID   %d\n",
+		job->syncpt_id);
+	dev_dbg(dev, "    SYNCPT_VAL  %d\n",
+		job->syncpt_end);
+	dev_dbg(dev, "    FIRST_GET   0x%x\n",
+		job->first_get);
+	dev_dbg(dev, "    TIMEOUT     %d\n",
+		job->timeout);
+	dev_dbg(dev, "    NUM_SLOTS   %d\n",
+		job->num_slots);
+	dev_dbg(dev, "    NUM_HANDLES %d\n",
+		job->num_unpins);
+}
diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h
new file mode 100644
index 0000000..428c670
--- /dev/null
+++ b/drivers/gpu/host1x/job.h
@@ -0,0 +1,164 @@ 
+/*
+ * Tegra host1x Job
+ *
+ * Copyright (c) 2011-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_JOB_H
+#define __HOST1X_JOB_H
+
+struct platform_device;
+
+struct host1x_job_gather {
+	u32 words;
+	dma_addr_t mem_base;
+	u32 mem_id;
+	int offset;
+	struct mem_handle *ref;
+};
+
+struct host1x_cmdbuf {
+	__u32 mem;
+	__u32 offset;
+	__u32 words;
+	__u32 pad;
+};
+
+struct host1x_reloc {
+	__u32 cmdbuf_mem;
+	__u32 cmdbuf_offset;
+	__u32 target;
+	__u32 target_offset;
+	__u32 shift;
+	__u32 pad;
+};
+
+struct host1x_waitchk {
+	__u32 mem;
+	__u32 offset;
+	__u32 syncpt_id;
+	__u32 thresh;
+};
+
+/*
+ * Each submit is tracked as a host1x_job.
+ */
+struct host1x_job {
+	/* When refcount goes to zero, job can be freed */
+	struct kref ref;
+
+	/* List entry */
+	struct list_head list;
+
+	/* Channel where job is submitted to */
+	struct host1x_channel *ch;
+
+	int clientid;
+
+	/* Gathers and their memory */
+	struct host1x_job_gather *gathers;
+	int num_gathers;
+
+	/* Wait checks to be processed at submit time */
+	struct host1x_waitchk *waitchk;
+	int num_waitchk;
+	u32 waitchk_mask;
+
+	/* Array of handles to be pinned & unpinned */
+	struct host1x_reloc *relocarray;
+	int num_relocs;
+	struct host1x_job_unpin_data *unpins;
+	int num_unpins;
+
+	dma_addr_t *addr_phys;
+	dma_addr_t *gather_addr_phys;
+	dma_addr_t *reloc_addr_phys;
+
+	/* Sync point id, number of increments and end related to the submit */
+	u32 syncpt_id;
+	u32 syncpt_incrs;
+	u32 syncpt_end;
+
+	/* Maximum time to wait for this job */
+	int timeout;
+
+	/* Null kickoff prevents submit from being sent to hardware */
+	bool null_kickoff;
+
+	/* Index and number of slots used in the push buffer */
+	int first_get;
+	int num_slots;
+
+	/* Copy of gathers */
+	size_t gather_copy_size;
+	dma_addr_t gather_copy;
+	u8 *gather_copy_mapped;
+
+	/* Temporary space for unpin ids */
+	long unsigned int *pin_ids;
+
+	/* Check if register is marked as an address reg */
+	int (*is_addr_reg)(struct platform_device *dev, u32 reg, u32 class);
+
+	/* Request a SETCLASS to this class */
+	u32 class;
+
+	/* Add a channel wait for previous ops to complete */
+	u32 serialize;
+};
+/*
+ * Allocate memory for a job. Just enough memory will be allocated to
+ * accomodate the submit.
+ */
+struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
+		u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks);
+
+/*
+ * Add a gather to a job.
+ */
+void host1x_job_add_gather(struct host1x_job *job,
+		u32 mem_id, u32 words, u32 offset);
+
+/*
+ * Increment reference going to host1x_job.
+ */
+void host1x_job_get(struct host1x_job *job);
+
+/*
+ * Decrement reference job, free if goes to zero.
+ */
+void host1x_job_put(struct host1x_job *job);
+
+/*
+ * Pin memory related to job. This handles relocation of addresses to the
+ * host1x address space. Handles both the gather memory and any other memory
+ * referred to from the gather buffers.
+ *
+ * Handles also patching out host waits that would wait for an expired sync
+ * point value.
+ */
+int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev);
+
+/*
+ * Unpin memory related to job.
+ */
+void host1x_job_unpin(struct host1x_job *job);
+
+/*
+ * Dump contents of job to debug output.
+ */
+void host1x_job_dump(struct device *dev, struct host1x_job *job);
+
+#endif
diff --git a/drivers/gpu/host1x/memmgr.c b/drivers/gpu/host1x/memmgr.c
new file mode 100644
index 0000000..eceb782
--- /dev/null
+++ b/drivers/gpu/host1x/memmgr.c
@@ -0,0 +1,173 @@ 
+/*
+ * Tegra host1x Memory Management Abstraction
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+
+#include "memmgr.h"
+#include "cma.h"
+
+struct mem_handle *host1x_memmgr_alloc(size_t size, size_t align, int flags)
+{
+	return NULL;
+}
+
+struct mem_handle *host1x_memmgr_get(u32 id, struct platform_device *dev)
+{
+	struct mem_handle *h = NULL;
+
+	switch (host1x_memmgr_type(id)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		h = (struct mem_handle *) host1x_cma_get(id, dev);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return h;
+}
+
+void host1x_memmgr_put(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_put(handle);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+struct sg_table *host1x_memmgr_pin(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_pin(handle);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_unpin(struct mem_handle *handle, struct sg_table *sgt)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_unpin(handle, sgt);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+void *host1x_memmgr_mmap(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_mmap(handle);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_munmap(struct mem_handle *handle, void *addr)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_munmap(handle, addr);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+void *host1x_memmgr_kmap(struct mem_handle *handle, unsigned int pagenum)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_kmap(handle, pagenum);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_kunmap(handle, pagenum, addr);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+int host1x_memmgr_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		dma_addr_t *phys_addr,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data)
+{
+	int pin_count = 0;
+
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	{
+		int cma_count = host1x_cma_pin_array_ids(dev,
+			ids, MEMMGR_TYPE_MASK,
+			mem_mgr_type_cma,
+			count, &unpin_data[pin_count],
+			phys_addr);
+
+		if (cma_count < 0) {
+			/* clean up previous handles */
+			while (pin_count) {
+				pin_count--;
+				/* unpin, put */
+				host1x_memmgr_unpin(unpin_data[pin_count].h,
+						unpin_data[pin_count].mem);
+				host1x_memmgr_put(unpin_data[pin_count].h);
+			}
+			return cma_count;
+		}
+		pin_count += cma_count;
+	}
+#endif
+	return pin_count;
+}
diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
new file mode 100644
index 0000000..a265fe8
--- /dev/null
+++ b/drivers/gpu/host1x/memmgr.h
@@ -0,0 +1,72 @@ 
+/*
+ * Tegra host1x Memory Management Abstraction header
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HOST1X_MEM_MGR_H
+#define _HOST1X_MEM_MGR_H
+
+struct mem_handle;
+struct platform_device;
+
+struct host1x_job_unpin_data {
+	struct mem_handle *h;
+	struct sg_table *mem;
+};
+
+enum mem_mgr_flag {
+	mem_mgr_flag_uncacheable = 0,
+	mem_mgr_flag_write_combine = 1,
+};
+
+/* Buffer encapsulation */
+enum mem_mgr_type {
+	mem_mgr_type_cma = 2,
+};
+
+#define MEMMGR_TYPE_MASK	0x3
+#define MEMMGR_ID_MASK		~0x3
+
+static inline int host1x_memmgr_type(u32 id) { return id & MEMMGR_TYPE_MASK; }
+static inline int host1x_memmgr_id(u32 id) { return id & MEMMGR_ID_MASK; }
+static inline unsigned int host1x_memmgr_host1x_id(u32 type, u32 handle)
+{
+	if (host1x_memmgr_type(type) != type ||
+		host1x_memmgr_id(handle) != handle)
+		return 0;
+
+	return handle | type;
+}
+
+struct mem_handle *host1x_memmgr_alloc(size_t size, size_t align,
+		int flags);
+struct mem_handle *host1x_memmgr_get(u32 id, struct platform_device *dev);
+void host1x_memmgr_put(struct mem_handle *handle);
+struct sg_table *host1x_memmgr_pin(struct mem_handle *handle);
+void host1x_memmgr_unpin(struct mem_handle *handle, struct sg_table *sgt);
+void *host1x_memmgr_mmap(struct mem_handle *handle);
+void host1x_memmgr_munmap(struct mem_handle *handle, void *addr);
+void *host1x_memmgr_kmap(struct mem_handle *handle, unsigned int pagenum);
+void host1x_memmgr_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr);
+
+int host1x_memmgr_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		dma_addr_t *phys_addr,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data);
+
+#endif
diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index 32e2b42..f21c688 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -287,6 +287,12 @@  void host1x_syncpt_debug(struct host1x_syncpt *sp)
 	sp->dev->syncpt_op.debug(sp);
 }
 
+/* remove a wait pointed to by patch_addr */
+int host1x_syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
+{
+	return sp->dev->syncpt_op.patch_wait(sp, patch_addr);
+}
+
 int host1x_syncpt_init(struct host1x *host)
 {
 	struct host1x_syncpt *syncpt, *sp;
@@ -305,6 +311,11 @@  int host1x_syncpt_init(struct host1x *host)
 
 	host->syncpt = syncpt;
 
+	/* Allocate sync point to use for clearing waits for expired fences */
+	host->nop_sp = _host1x_syncpt_alloc(host, NULL, 0);
+	if (!host->nop_sp)
+		return -ENOMEM;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
index b46d044..255a3a3 100644
--- a/drivers/gpu/host1x/syncpt.h
+++ b/drivers/gpu/host1x/syncpt.h
@@ -26,6 +26,7 @@ 
 struct host1x;
 
 #define NVSYNCPT_INVALID			(-1)
+#define NVSYNCPT_GRAPHICS_HOST			0
 
 struct host1x_syncpt {
 	int id;
@@ -145,6 +146,9 @@  static inline int host1x_syncpt_is_valid(struct host1x_syncpt *sp)
 		sp->id < host1x_syncpt_nb_pts(sp->dev);
 }
 
+/* Patch a wait by replacing it with a wait for syncpt 0 value 0 */
+int host1x_syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr);
+
 /* Return id of the sync point */
 u32 host1x_syncpt_id(struct host1x_syncpt *sp);
 
diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
index 3c14cac..c63d75c 100644
--- a/include/trace/events/host1x.h
+++ b/include/trace/events/host1x.h
@@ -37,6 +37,190 @@  DECLARE_EVENT_CLASS(host1x,
 	TP_printk("name=%s", __entry->name)
 );
 
+DEFINE_EVENT(host1x, host1x_channel_open,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_channel_release,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_cdma_begin,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_cdma_end,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+TRACE_EVENT(host1x,
+	TP_PROTO(const char *name, int timeout),
+
+	TP_ARGS(name, timeout),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, timeout)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->timeout = timeout;
+	),
+
+	TP_printk("name=%s, timeout=%d",
+		__entry->name, __entry->timeout)
+);
+
+TRACE_EVENT(host1x_cdma_push,
+	TP_PROTO(const char *name, u32 op1, u32 op2),
+
+	TP_ARGS(name, op1, op2),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, op1)
+		__field(u32, op2)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->op1 = op1;
+		__entry->op2 = op2;
+	),
+
+	TP_printk("name=%s, op1=%08x, op2=%08x",
+		__entry->name, __entry->op1, __entry->op2)
+);
+
+TRACE_EVENT(host1x_cdma_push_gather,
+	TP_PROTO(const char *name, u32 mem_id,
+			u32 words, u32 offset, void *cmdbuf),
+
+	TP_ARGS(name, mem_id, words, offset, cmdbuf),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, mem_id)
+		__field(u32, words)
+		__field(u32, offset)
+		__field(bool, cmdbuf)
+		__dynamic_array(u32, cmdbuf, words)
+	),
+
+	TP_fast_assign(
+		if (cmdbuf) {
+			memcpy(__get_dynamic_array(cmdbuf), cmdbuf+offset,
+					words * sizeof(u32));
+		}
+		__entry->cmdbuf = cmdbuf;
+		__entry->name = name;
+		__entry->mem_id = mem_id;
+		__entry->words = words;
+		__entry->offset = offset;
+	),
+
+	TP_printk("name=%s, mem_id=%08x, words=%u, offset=%d, contents=[%s]",
+	  __entry->name, __entry->mem_id,
+	  __entry->words, __entry->offset,
+	  __print_hex(__get_dynamic_array(cmdbuf),
+		  __entry->cmdbuf ? __entry->words * 4 : 0))
+);
+
+TRACE_EVENT(host1x_channel_submit,
+	TP_PROTO(const char *name, u32 cmdbufs, u32 relocs, u32 waitchks,
+			u32 syncpt_id, u32 syncpt_incrs),
+
+	TP_ARGS(name, cmdbufs, relocs, waitchks, syncpt_id, syncpt_incrs),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, cmdbufs)
+		__field(u32, relocs)
+		__field(u32, waitchks)
+		__field(u32, syncpt_id)
+		__field(u32, syncpt_incrs)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->cmdbufs = cmdbufs;
+		__entry->relocs = relocs;
+		__entry->waitchks = waitchks;
+		__entry->syncpt_id = syncpt_id;
+		__entry->syncpt_incrs = syncpt_incrs;
+	),
+
+	TP_printk("name=%s, cmdbufs=%u, relocs=%u, waitchks=%d,"
+		"syncpt_id=%u, syncpt_incrs=%u",
+	  __entry->name, __entry->cmdbufs, __entry->relocs, __entry->waitchks,
+	  __entry->syncpt_id, __entry->syncpt_incrs)
+);
+
+TRACE_EVENT(host1x_channel_submitted,
+	TP_PROTO(const char *name, u32 syncpt_base, u32 syncpt_max),
+
+	TP_ARGS(name, syncpt_base, syncpt_max),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, syncpt_base)
+		__field(u32, syncpt_max)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->syncpt_base = syncpt_base;
+		__entry->syncpt_max = syncpt_max;
+	),
+
+	TP_printk("name=%s, syncpt_base=%d, syncpt_max=%d",
+		__entry->name, __entry->syncpt_base, __entry->syncpt_max)
+);
+
+TRACE_EVENT(host1x_channel_submit_complete,
+	TP_PROTO(const char *name, int count, u32 thresh),
+
+	TP_ARGS(name, count, thresh),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, count)
+		__field(u32, thresh)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->count = count;
+		__entry->thresh = thresh;
+	),
+
+	TP_printk("name=%s, count=%d, thresh=%d",
+		__entry->name, __entry->count, __entry->thresh)
+);
+
+TRACE_EVENT(host1x_wait_cdma,
+	TP_PROTO(const char *name, u32 eventid),
+
+	TP_ARGS(name, eventid),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, eventid)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->eventid = eventid;
+	),
+
+	TP_printk("name=%s, event=%d", __entry->name, __entry->eventid)
+);
+
 TRACE_EVENT(host1x_syncpt_load_min,
 	TP_PROTO(u32 id, u32 val),
 
@@ -55,6 +239,33 @@  TRACE_EVENT(host1x_syncpt_load_min,
 	TP_printk("id=%d, val=%d", __entry->id, __entry->val)
 );
 
+TRACE_EVENT(host1x_syncpt_wait_check,
+	TP_PROTO(u32 mem_id, u32 offset, u32 syncpt_id, u32 thresh, u32 min),
+
+	TP_ARGS(mem_id, offset, syncpt_id, thresh, min),
+
+	TP_STRUCT__entry(
+		__field(u32, mem_id)
+		__field(u32, offset)
+		__field(u32, syncpt_id)
+		__field(u32, thresh)
+		__field(u32, min)
+	),
+
+	TP_fast_assign(
+		__entry->mem_id = mem_id;
+		__entry->offset = offset;
+		__entry->syncpt_id = syncpt_id;
+		__entry->thresh = thresh;
+		__entry->min = min;
+	),
+
+	TP_printk("mem_id=%08x, offset=%05x, id=%d, thresh=%d, current=%d",
+		__entry->mem_id, __entry->offset,
+		__entry->syncpt_id, __entry->thresh,
+		__entry->min)
+);
+
 #endif /*  _TRACE_HOST1X_H */
 
 /* This part must be outside protection */