[v2] drm/i915: Pre-allocation of shmem pages of a GEM object

Message ID 1399263929-11535-1-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived

Commit Message

akash.goel@intel.com May 5, 2014, 4:25 a.m. UTC
From: Akash Goel <akash.goel@intel.com>

This patch helps to reduce the time for which 'struct_mutex' is kept
locked in the execbuffer and page-fault handling paths, as the backing
pages are now requested from the shmem layer without holding
'struct_mutex'.

v2: Fixed a merge issue due to which the 'exec_lock' mutex was not released.

Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  9 +++-
 drivers/gpu/drm/i915/i915_dma.c            |  1 +
 drivers/gpu/drm/i915/i915_drv.h            |  5 ++
 drivers/gpu/drm/i915/i915_gem.c            | 75 ++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_trace.h          | 35 ++++++++++++
 6 files changed, 200 insertions(+), 12 deletions(-)
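
Below is a condensed sketch of the locking order this patch introduces in
i915_gem_do_execbuffer(); error handling is trimmed and the wrapper function
name is illustrative only, but 'exec_lock', i915_mutex_lock_interruptible()
and i915_gem_execbuffer_preallocate_objs() are the symbols used in the patch
itself:

static int execbuffer_locking_sketch(struct drm_device *dev,
				     struct drm_i915_private *dev_priv,
				     struct eb_vmas *eb)
{
	int ret;

	/* Serialize against other execbuffers first. */
	ret = mutex_lock_interruptible(&dev_priv->exec_lock);
	if (ret)
		return ret;

	ret = i915_mutex_lock_interruptible(dev);
	if (ret) {
		mutex_unlock(&dev_priv->exec_lock);
		return ret;
	}

	/* ... look up the object handles and VMAs under struct_mutex ... */

	/* Drop struct_mutex for the slow shmem allocation phase;
	 * exec_lock keeps the obj_exec_link lists stable meanwhile.
	 */
	mutex_unlock(&dev->struct_mutex);
	i915_gem_execbuffer_preallocate_objs(&eb->objects);

	/* Reacquire struct_mutex, then let other execbuffers in. */
	ret = i915_mutex_lock_interruptible(dev);
	mutex_unlock(&dev_priv->exec_lock);

	return ret;
}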

Comments

Chris Wilson May 5, 2014, 8:17 a.m. UTC | #1
On Mon, May 05, 2014 at 09:55:29AM +0530, akash.goel@intel.com wrote:
> From: Akash Goel <akash.goel@intel.com>
> 
> This patch helps to reduce the time for which 'struct_mutex' is kept
> locked in the execbuffer and page-fault handling paths, as the backing
> pages are now requested from the shmem layer without holding
> 'struct_mutex'.
> 
> v2: Fixed a merge issue due to which the 'exec_lock' mutex was not released.

This would be a good excuse to work on per-object locks and augmenting
i915_gem_madvise_ioctl() to grab pages. iow, add obj->mutex and use that
for guarding all obj->pages related members/operations, then add
I915_MADV_POPULATE which can run without the struct mutex.

That should provide you with the lockless get_pages and keep execbuffer
reasonably clean and fast.

Again, please think about why you are *clflushing* so many pages so
often. That is a sign of userspace bo cache failure.
-Chris
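
A rough sketch of the direction suggested above; nothing here is part of
this patch or of the current driver. 'obj->mutex' and the populate helper
are hypothetical, while i915_gem_object_get_pages() is the existing helper
that would be reused:

static int i915_gem_object_populate(struct drm_i915_gem_object *obj)
{
	int ret;

	/* Hypothetical per-object lock guarding obj->pages and related
	 * members, taken instead of dev->struct_mutex.
	 */
	ret = mutex_lock_interruptible(&obj->mutex);
	if (ret)
		return ret;

	if (obj->madv != I915_MADV_WILLNEED) {
		ret = -EINVAL;
		goto unlock;
	}

	/* Grab the backing pages without holding the struct mutex. */
	ret = i915_gem_object_get_pages(obj);

unlock:
	mutex_unlock(&obj->mutex);
	return ret;
}
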
akash.goel@intel.com May 5, 2014, 1:05 p.m. UTC | #2
On Mon, 2014-05-05 at 09:17 +0100, Chris Wilson wrote:
> On Mon, May 05, 2014 at 09:55:29AM +0530, akash.goel@intel.com wrote:
> > From: Akash Goel <akash.goel@intel.com>
> > 
> > This patch helps to reduce the time for which 'struct_mutex' is kept
> > locked in the execbuffer and page-fault handling paths, as the backing
> > pages are now requested from the shmem layer without holding
> > 'struct_mutex'.
> > 
> > v2: Fixed a merge issue due to which the 'exec_lock' mutex was not released.
> 
> This would be a good excuse to work on per-object locks and augmenting
> i915_gem_madvise_ioctl() to grab pages. iow, add obj->mutex and use that
> for guarding all obj->pages related members/operations, then add
> I915_MADV_POPULATE which can run without the struct mutex.
> 
> That should provide you with the lockless get_pages and keep execbuffer
> reasonably clean and fast.

Yes, the per-object lock would be a cleaner approach here.
But it could take some time to implement, so for the time being can we
consider this as a stopgap solution?

> Again, please think about why you are *clflushing* so many pages so
> often. That is a sign of userspace bo cache failure.
> -Chris
> 

Sorry, I am not sure whether I understood your point here, but we are not
doing any extra clflush, only what is needed.
Any buffer newly allocated from shmem is by default marked as being in
the CPU domain, so when it is submitted for rendering on the GPU, all
the pages of this buffer are clflushed.
This is presumably to ensure that any stale data for this buffer in the
CPU cache is flushed out before the GPU actually starts writing to the
buffer; otherwise the data written by the GPU could subsequently be
overwritten by stale data written back from the CPU cache.

Sometimes there is an occasional need to process buffers of huge size
(~34 MB), and that is where the userspace bo cache might also fail.

Best regards
Akash
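
To put the discussion above in code terms: a freshly created shmem-backed
object starts out in the CPU domain, and a ~34 MB buffer amounts to roughly
8700 4 KiB pages to flush. The sketch below is a simplified illustration of
that domain transition, not the driver's actual code path; drm_clflush_sg()
and the domain flags are real, the function itself is hypothetical:

static void move_to_gpu_domain_sketch(struct drm_i915_gem_object *obj)
{
	/* A newly allocated shmem object is created in the CPU domain,
	 * so its first submission to the GPU implies a flush of every
	 * backing page on non-coherent platforms.
	 */
	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
		drm_clflush_sg(obj->pages);	/* flush stale CPU cache lines */
		obj->base.write_domain = 0;
	}

	obj->base.read_domains |= I915_GEM_DOMAIN_RENDER;
}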

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 18b3565..70c752b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -247,10 +247,16 @@  static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 	LIST_HEAD(stolen);
 	int count, ret;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	ret = mutex_lock_interruptible(&dev_priv->exec_lock);
 	if (ret)
 		return ret;
 
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret) {
+		mutex_unlock(&dev_priv->exec_lock);
+		return ret;
+	}
+
 	total_obj_size = total_gtt_size = count = 0;
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
 		if (obj->stolen == NULL)
@@ -281,6 +287,7 @@  static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 		list_del_init(&obj->obj_exec_link);
 	}
 	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&dev_priv->exec_lock);
 
 	seq_printf(m, "Total %d objects, %zu bytes, %zu GTT size\n",
 		   count, total_obj_size, total_gtt_size);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index e393a14..e4d1cb0 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1572,6 +1572,7 @@  int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	dev_priv->ring_index = 0;
 	mutex_init(&dev_priv->dpio_lock);
 	mutex_init(&dev_priv->modeset_restore_lock);
+	mutex_init(&dev_priv->exec_lock);
 
 	intel_pm_setup(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5f4f631..6dc579a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1475,6 +1475,8 @@  struct drm_i915_private {
 	struct i915_ums_state ums;
 	/* the indicator for dispatch video commands on two BSD rings */
 	int ring_index;
+	/* for concurrent execbuffer protection */
+	struct mutex exec_lock;
 };
 
 static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
@@ -2116,6 +2118,9 @@  int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_mode_create_dumb *args);
 int i915_gem_mmap_gtt(struct drm_file *file_priv, struct drm_device *dev,
 		      uint32_t handle, uint64_t *offset);
+void
+i915_gem_object_shmem_preallocate(struct drm_i915_gem_object *obj);
+
 /**
  * Returns true if seq1 is later than seq2.
  */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index dae51c3..b19ccb8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1408,6 +1408,8 @@  int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
 		PAGE_SHIFT;
 
+	i915_gem_object_shmem_preallocate(obj);
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		goto out;
@@ -1873,6 +1875,79 @@  i915_gem_shrink_all(struct drm_i915_private *dev_priv)
 	return __i915_gem_shrink(dev_priv, LONG_MAX, false);
 }
 
+void
+i915_gem_object_shmem_preallocate(struct drm_i915_gem_object *obj)
+{
+	int page_count, i;
+	struct address_space *mapping;
+	struct page *page;
+	gfp_t gfp;
+
+	if (obj->pages)
+		return;
+
+	if (obj->madv != I915_MADV_WILLNEED) {
+		DRM_ERROR("Attempting to preallocate a purgeable object\n");
+		return;
+	}
+
+	if (obj->base.filp) {
+		int ret;
+		struct inode *inode = file_inode(obj->base.filp);
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		if (!inode)
+			return;
+		/* The 'alloced' field stores how many data pages are
+		 * allocated to the file. If shmem space has already been
+		 * allocated for the object, then we can simply return. */
+		spin_lock(&info->lock);
+		ret = info->alloced;
+		spin_unlock(&info->lock);
+		if (ret > 0) {
+			DRM_DEBUG_DRIVER("Already shmem space alloced for obj %p, %d pages\n",
+					obj, ret);
+			return;
+		}
+	} else
+		return;
+
+	BUG_ON(obj->pages_pin_count);
+
+	/* Assert that the object is not currently in any GPU domain. As it
+	 * wasn't in the GTT, there shouldn't be any way it could have been in
+	 * a GPU cache
+	 */
+	BUG_ON(obj->base.read_domains & I915_GEM_GPU_DOMAINS);
+	BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
+
+	trace_i915_gem_obj_prealloc_start(obj, obj->base.size);
+
+	page_count = obj->base.size / PAGE_SIZE;
+
+	/* Get the list of pages out of our struct file
+	 * Fail silently without starting the shrinker
+	 */
+	mapping = file_inode(obj->base.filp)->i_mapping;
+	gfp = mapping_gfp_mask(mapping);
+	gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
+	gfp &= ~(__GFP_IO | __GFP_WAIT);
+	for (i = 0; i < page_count; i++) {
+		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
+		if (IS_ERR(page)) {
+			DRM_DEBUG_DRIVER("Failure for obj(%p), size(%zx) at page(%d)\n",
+						obj, obj->base.size, i);
+			break;
+		}
+		/* Decrement the extra ref count on the returned page;
+		 * otherwise, when 'get_pages_gtt' is called later in the
+		 * regular path, it will also increment the ref count and
+		 * disturb the ref count management. */
+		page_cache_release(page);
+	}
+
+	trace_i915_gem_obj_prealloc_end(obj);
+}
+
 static int
 i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6cc004f..da3cbdc 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -38,6 +38,7 @@ 
 
 struct eb_vmas {
 	struct list_head vmas;
+	struct list_head objects;
 	int and;
 	union {
 		struct i915_vma *lut[0];
@@ -93,10 +94,9 @@  eb_lookup_vmas(struct eb_vmas *eb,
 {
 	struct drm_i915_private *dev_priv = vm->dev->dev_private;
 	struct drm_i915_gem_object *obj;
-	struct list_head objects;
 	int i, ret;
 
-	INIT_LIST_HEAD(&objects);
+	INIT_LIST_HEAD(&eb->objects);
 	spin_lock(&file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
@@ -119,12 +119,12 @@  eb_lookup_vmas(struct eb_vmas *eb,
 		}
 
 		drm_gem_object_reference(&obj->base);
-		list_add_tail(&obj->obj_exec_link, &objects);
+		list_add_tail(&obj->obj_exec_link, &eb->objects);
 	}
 	spin_unlock(&file->table_lock);
 
 	i = 0;
-	while (!list_empty(&objects)) {
+	while (!list_empty(&eb->objects)) {
 		struct i915_vma *vma;
 		struct i915_address_space *bind_vm = vm;
 
@@ -141,7 +141,7 @@  eb_lookup_vmas(struct eb_vmas *eb,
 		    (i == (args->buffer_count - 1))))
 			bind_vm = &dev_priv->gtt.base;
 
-		obj = list_first_entry(&objects,
+		obj = list_first_entry(&eb->objects,
 				       struct drm_i915_gem_object,
 				       obj_exec_link);
 
@@ -162,7 +162,6 @@  eb_lookup_vmas(struct eb_vmas *eb,
 
 		/* Transfer ownership from the objects list to the vmas list. */
 		list_add_tail(&vma->exec_list, &eb->vmas);
-		list_del_init(&obj->obj_exec_link);
 
 		vma->exec_entry = &exec[i];
 		if (eb->and < 0) {
@@ -180,8 +179,8 @@  eb_lookup_vmas(struct eb_vmas *eb,
 
 
 err:
-	while (!list_empty(&objects)) {
-		obj = list_first_entry(&objects,
+	while (!list_empty(&eb->objects)) {
+		obj = list_first_entry(&eb->objects,
 				       struct drm_i915_gem_object,
 				       obj_exec_link);
 		list_del_init(&obj->obj_exec_link);
@@ -249,6 +248,15 @@  static void eb_destroy(struct eb_vmas *eb)
 		i915_gem_execbuffer_unreserve_vma(vma);
 		drm_gem_object_unreference(&vma->obj->base);
 	}
+
+	while (!list_empty(&eb->objects)) {
+		struct drm_i915_gem_object *obj;
+		obj = list_first_entry(&eb->objects,
+				struct drm_i915_gem_object,
+				obj_exec_link);
+		list_del_init(&obj->obj_exec_link);
+	}
+
 	kfree(eb);
 }
 
@@ -712,6 +720,7 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 				  struct eb_vmas *eb,
 				  struct drm_i915_gem_exec_object2 *exec)
 {
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct i915_address_space *vm;
 	struct i915_vma *vma;
@@ -786,12 +795,24 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		total += exec[i].relocation_count;
 	}
 
+	/* First acquire the 'exec_lock' mutex to prevent concurrent access
+	 * to the 'obj_exec_link' field of the objects by the preallocation
+	 * routine running in the context of a new execbuffer ioctl. */
+	ret = mutex_lock_interruptible(&dev_priv->exec_lock);
+	if (ret) {
+		mutex_lock(&dev->struct_mutex);
+		goto err;
+	}
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret) {
 		mutex_lock(&dev->struct_mutex);
 		goto err;
 	}
 
+	/* Now release the 'exec_lock' after acquiring the 'struct mutex' */
+	mutex_unlock(&dev_priv->exec_lock);
+
 	/* reacquire the objects */
 	eb_reset(eb);
 	ret = eb_lookup_vmas(eb, exec, args, vm, file);
@@ -856,6 +877,21 @@  i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 	return intel_ring_invalidate_all_caches(ring);
 }
 
+static void
+i915_gem_execbuffer_preallocate_objs(struct list_head *objects)
+{
+	struct drm_i915_gem_object *obj;
+
+	/* Try to get the obj pages atomically */
+	while (!list_empty(objects)) {
+		obj = list_first_entry(objects,
+				struct drm_i915_gem_object,
+				obj_exec_link);
+		i915_gem_object_shmem_preallocate(obj);
+		list_del_init(&obj->obj_exec_link);
+	}
+}
+
 static bool
 i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 {
@@ -1173,12 +1209,19 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	intel_runtime_pm_get(dev_priv);
 
-	ret = i915_mutex_lock_interruptible(dev);
+	ret = mutex_lock_interruptible(&dev_priv->exec_lock);
 	if (ret)
 		goto pre_mutex_err;
 
+	ret = i915_mutex_lock_interruptible(dev);
+	if (ret) {
+		mutex_unlock(&dev_priv->exec_lock);
+		goto pre_mutex_err;
+	}
+
 	if (dev_priv->ums.mm_suspended) {
 		mutex_unlock(&dev->struct_mutex);
+		mutex_unlock(&dev_priv->exec_lock);
 		ret = -EBUSY;
 		goto pre_mutex_err;
 	}
@@ -1200,14 +1243,36 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (eb == NULL) {
 		i915_gem_context_unreference(ctx);
 		mutex_unlock(&dev->struct_mutex);
+		mutex_unlock(&dev_priv->exec_lock);
 		ret = -ENOMEM;
 		goto pre_mutex_err;
 	}
 
 	/* Look up object handles */
 	ret = eb_lookup_vmas(eb, exec, args, vm, file);
-	if (ret)
-		goto err;
+	if (ret) {
+		eb_destroy(eb);
+		i915_gem_context_unreference(ctx);
+		mutex_unlock(&dev->struct_mutex);
+		mutex_unlock(&dev_priv->exec_lock);
+		goto pre_mutex_err;
+	}
+
+	/*
+	 * Release the 'struct_mutex' before extracting the backing
+	 * pages of the objects, so as to allow other ioctls to be serviced
+	 * while the backing pages are being allocated (generally the most
+	 * time-consuming phase). The 'exec_lock' mutex provides the
+	 * necessary protection meanwhile.
+	 */
+	mutex_unlock(&dev->struct_mutex);
+
+	i915_gem_execbuffer_preallocate_objs(&eb->objects);
+
+	/* Reacquire the 'struct_mutex' post preallocation */
+	ret = i915_mutex_lock_interruptible(dev);
+
+	mutex_unlock(&dev_priv->exec_lock);
 
 	/* take note of the batch buffer before we might reorder the lists */
 	batch_obj = list_entry(eb->vmas.prev, struct i915_vma, exec_list)->obj;
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index b29d7b1..21bf10d 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -245,6 +245,41 @@  TRACE_EVENT(i915_gem_object_fault,
 		      __entry->write ? ", writable" : "")
 );
 
+TRACE_EVENT(i915_gem_obj_prealloc_start,
+	    TP_PROTO(struct drm_i915_gem_object *obj, u32 size),
+	    TP_ARGS(obj, size),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_i915_gem_object *, obj)
+			     __field(u32, size)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   __entry->size = size;
+			   ),
+
+	    TP_printk("obj=%p, size=%x",
+		      __entry->obj,
+		      __entry->size)
+);
+
+TRACE_EVENT(i915_gem_obj_prealloc_end,
+	    TP_PROTO(struct drm_i915_gem_object *obj),
+	    TP_ARGS(obj),
+
+	    TP_STRUCT__entry(
+			     __field(struct drm_i915_gem_object *, obj)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->obj = obj;
+			   ),
+
+	    TP_printk("obj=%p",
+		      __entry->obj)
+);
+
 DECLARE_EVENT_CLASS(i915_gem_object,
 	    TP_PROTO(struct drm_i915_gem_object *obj),
 	    TP_ARGS(obj),