diff mbox

[v3] drm/i915/gtt: Avoid calling kcalloc in a loop when allocating temp bitmaps

Message ID 1441098415-12026-1-git-send-email-michal.winiarski@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

MichaƂ Winiarski Sept. 1, 2015, 9:06 a.m. UTC
On each call to gen8_alloc_va_range_3lvl we're allocating temporary
bitmaps needed for error handling. Unfortunately, when we increase
address space size (48b ppgtt) we do additional (512 - 4) calls to
kcalloc, increasing latency between exec and actual start of execution
on the GPU. Let's just do a single kcalloc, we can also drop the size
from free_gen8_temp_bitmaps since it's no longer used.

v2: Use GFP_TEMPORARY to make the allocations reclaimable.
v3: Drop the 2D array, just allocate a single block.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 45 +++++++++++++------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

Comments

Michel Thierry Sept. 2, 2015, 1:40 p.m. UTC | #1
On 9/1/2015 10:06 AM, Micha? Winiarski wrote:
> On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> bitmaps needed for error handling. Unfortunately, when we increase
> address space size (48b ppgtt) we do additional (512 - 4) calls to
> kcalloc, increasing latency between exec and actual start of execution
> on the GPU. Let's just do a single kcalloc, we can also drop the size
> from free_gen8_temp_bitmaps since it's no longer used.
>
> v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> v3: Drop the 2D array, just allocate a single block.
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>

Unless Chris thinks otherwise, I see Micha? already addressed his comments.

Reviewed-by: Michel Thierry <michel.thierry@intel.com>

> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c | 45 +++++++++++++------------------------
>   1 file changed, 16 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 4a76807..f810762 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1126,13 +1126,8 @@ unwind_out:
>   }
>
>   static void
> -free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
> -		       uint32_t pdpes)
> +free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long *new_pts)
>   {
> -	int i;
> -
> -	for (i = 0; i < pdpes; i++)
> -		kfree(new_pts[i]);
>   	kfree(new_pts);
>   	kfree(new_pds);
>   }
> @@ -1142,29 +1137,20 @@ free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
>    */
>   static
>   int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
> -					 unsigned long ***new_pts,
> +					 unsigned long **new_pts,
>   					 uint32_t pdpes)
>   {
> -	int i;
>   	unsigned long *pds;
> -	unsigned long **pts;
> +	unsigned long *pts;
>
> -	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL);
> +	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_TEMPORARY);
>   	if (!pds)
>   		return -ENOMEM;
>
> -	pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL);
> -	if (!pts) {
> -		kfree(pds);
> -		return -ENOMEM;
> -	}
> -
> -	for (i = 0; i < pdpes; i++) {
> -		pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES),
> -				 sizeof(unsigned long), GFP_KERNEL);
> -		if (!pts[i])
> -			goto err_out;
> -	}
> +	pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES),
> +			sizeof(unsigned long), GFP_TEMPORARY);
> +	if (!pts)
> +		goto err_out;
>
>   	*new_pds = pds;
>   	*new_pts = pts;
> @@ -1172,7 +1158,7 @@ int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
>   	return 0;
>
>   err_out:
> -	free_gen8_temp_bitmaps(pds, pts, pdpes);
> +	free_gen8_temp_bitmaps(pds, pts);
>   	return -ENOMEM;
>   }
>
> @@ -1193,7 +1179,7 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
>   {
>   	struct i915_hw_ppgtt *ppgtt =
>   		container_of(vm, struct i915_hw_ppgtt, base);
> -	unsigned long *new_page_dirs, **new_page_tables;
> +	unsigned long *new_page_dirs, *new_page_tables;
>   	struct drm_device *dev = vm->dev;
>   	struct i915_page_directory *pd;
>   	const uint64_t orig_start = start;
> @@ -1220,14 +1206,14 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
>   	ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length,
>   						new_page_dirs);
>   	if (ret) {
> -		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
> +		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
>   		return ret;
>   	}
>
>   	/* For every page directory referenced, allocate page tables */
>   	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
>   		ret = gen8_ppgtt_alloc_pagetabs(vm, pd, start, length,
> -						new_page_tables[pdpe]);
> +						new_page_tables + pdpe * BITS_TO_LONGS(I915_PDES));
>   		if (ret)
>   			goto err_out;
>   	}
> @@ -1278,20 +1264,21 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
>   		gen8_setup_page_directory(ppgtt, pdp, pd, pdpe);
>   	}
>
> -	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
> +	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
>   	mark_tlbs_dirty(ppgtt);
>   	return 0;
>
>   err_out:
>   	while (pdpe--) {
> -		for_each_set_bit(temp, new_page_tables[pdpe], I915_PDES)
> +		for_each_set_bit(temp, new_page_tables + pdpe *
> +				BITS_TO_LONGS(I915_PDES), I915_PDES)
>   			free_pt(dev, pdp->page_directory[pdpe]->page_table[temp]);
>   	}
>
>   	for_each_set_bit(pdpe, new_page_dirs, pdpes)
>   		free_pd(dev, pdp->page_directory[pdpe]);
>
> -	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
> +	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
>   	mark_tlbs_dirty(ppgtt);
>   	return ret;
>   }
>
Chris Wilson Sept. 2, 2015, 1:46 p.m. UTC | #2
On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote:
> On 9/1/2015 10:06 AM, Micha? Winiarski wrote:
> >On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> >bitmaps needed for error handling. Unfortunately, when we increase
> >address space size (48b ppgtt) we do additional (512 - 4) calls to
> >kcalloc, increasing latency between exec and actual start of execution
> >on the GPU. Let's just do a single kcalloc, we can also drop the size
> >from free_gen8_temp_bitmaps since it's no longer used.
> >
> >v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> >v3: Drop the 2D array, just allocate a single block.
> >
> >Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >Cc: Michel Thierry <michel.thierry@intel.com>
> >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
> 
> Unless Chris thinks otherwise, I see Micha? already addressed his comments.

I think I spotted a misaligned bracket... ;)
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chri
Daniel Vetter Sept. 2, 2015, 3:13 p.m. UTC | #3
On Wed, Sep 02, 2015 at 02:46:41PM +0100, Chris Wilson wrote:
> On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote:
> > On 9/1/2015 10:06 AM, Micha? Winiarski wrote:
> > >On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> > >bitmaps needed for error handling. Unfortunately, when we increase
> > >address space size (48b ppgtt) we do additional (512 - 4) calls to
> > >kcalloc, increasing latency between exec and actual start of execution
> > >on the GPU. Let's just do a single kcalloc, we can also drop the size
> > >from free_gen8_temp_bitmaps since it's no longer used.
> > >
> > >v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> > >v3: Drop the 2D array, just allocate a single block.
> > >
> > >Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > >Cc: Michel Thierry <michel.thierry@intel.com>
> > >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
> > 
> > Unless Chris thinks otherwise, I see Micha? already addressed his comments.
> 
> I think I spotted a misaligned bracket... ;)

checkpatch didn't spot it and neither me ...

> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Queued for -next, thanks for the patch.
-Daniel
Daniel Vetter Sept. 2, 2015, 3:26 p.m. UTC | #4
On Wed, Sep 02, 2015 at 05:13:29PM +0200, Daniel Vetter wrote:
> On Wed, Sep 02, 2015 at 02:46:41PM +0100, Chris Wilson wrote:
> > On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote:
> > > On 9/1/2015 10:06 AM, Micha? Winiarski wrote:
> > > >On each call to gen8_alloc_va_range_3lvl we're allocating temporary
> > > >bitmaps needed for error handling. Unfortunately, when we increase
> > > >address space size (48b ppgtt) we do additional (512 - 4) calls to
> > > >kcalloc, increasing latency between exec and actual start of execution
> > > >on the GPU. Let's just do a single kcalloc, we can also drop the size
> > > >from free_gen8_temp_bitmaps since it's no longer used.
> > > >
> > > >v2: Use GFP_TEMPORARY to make the allocations reclaimable.
> > > >v3: Drop the 2D array, just allocate a single block.
> > > >
> > > >Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > > >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > > >Cc: Michel Thierry <michel.thierry@intel.com>
> > > >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com>
> > > 
> > > Unless Chris thinks otherwise, I see Micha? already addressed his comments.
> > 
> > I think I spotted a misaligned bracket... ;)
> 
> checkpatch didn't spot it and neither me ...
> 
> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Queued for -next, thanks for the patch.

Strike that, patch doesn't compile too well on plain dinq. What's going on
here? And there doesn't seem to be anything in -nightly which isn't in
dinq ...
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 4a76807..f810762 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1126,13 +1126,8 @@  unwind_out:
 }
 
 static void
-free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
-		       uint32_t pdpes)
+free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long *new_pts)
 {
-	int i;
-
-	for (i = 0; i < pdpes; i++)
-		kfree(new_pts[i]);
 	kfree(new_pts);
 	kfree(new_pds);
 }
@@ -1142,29 +1137,20 @@  free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
  */
 static
 int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
-					 unsigned long ***new_pts,
+					 unsigned long **new_pts,
 					 uint32_t pdpes)
 {
-	int i;
 	unsigned long *pds;
-	unsigned long **pts;
+	unsigned long *pts;
 
-	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL);
+	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_TEMPORARY);
 	if (!pds)
 		return -ENOMEM;
 
-	pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL);
-	if (!pts) {
-		kfree(pds);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < pdpes; i++) {
-		pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES),
-				 sizeof(unsigned long), GFP_KERNEL);
-		if (!pts[i])
-			goto err_out;
-	}
+	pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES),
+			sizeof(unsigned long), GFP_TEMPORARY);
+	if (!pts)
+		goto err_out;
 
 	*new_pds = pds;
 	*new_pts = pts;
@@ -1172,7 +1158,7 @@  int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
 	return 0;
 
 err_out:
-	free_gen8_temp_bitmaps(pds, pts, pdpes);
+	free_gen8_temp_bitmaps(pds, pts);
 	return -ENOMEM;
 }
 
@@ -1193,7 +1179,7 @@  static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	unsigned long *new_page_dirs, **new_page_tables;
+	unsigned long *new_page_dirs, *new_page_tables;
 	struct drm_device *dev = vm->dev;
 	struct i915_page_directory *pd;
 	const uint64_t orig_start = start;
@@ -1220,14 +1206,14 @@  static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
 	ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length,
 						new_page_dirs);
 	if (ret) {
-		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 		return ret;
 	}
 
 	/* For every page directory referenced, allocate page tables */
 	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
 		ret = gen8_ppgtt_alloc_pagetabs(vm, pd, start, length,
-						new_page_tables[pdpe]);
+						new_page_tables + pdpe * BITS_TO_LONGS(I915_PDES));
 		if (ret)
 			goto err_out;
 	}
@@ -1278,20 +1264,21 @@  static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
 		gen8_setup_page_directory(ppgtt, pdp, pd, pdpe);
 	}
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 	mark_tlbs_dirty(ppgtt);
 	return 0;
 
 err_out:
 	while (pdpe--) {
-		for_each_set_bit(temp, new_page_tables[pdpe], I915_PDES)
+		for_each_set_bit(temp, new_page_tables + pdpe *
+				BITS_TO_LONGS(I915_PDES), I915_PDES)
 			free_pt(dev, pdp->page_directory[pdpe]->page_table[temp]);
 	}
 
 	for_each_set_bit(pdpe, new_page_dirs, pdpes)
 		free_pd(dev, pdp->page_directory[pdpe]);
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
 	mark_tlbs_dirty(ppgtt);
 	return ret;
 }