Message ID | 1441098415-12026-1-git-send-email-michal.winiarski@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 9/1/2015 10:06 AM, Micha? Winiarski wrote: > On each call to gen8_alloc_va_range_3lvl we're allocating temporary > bitmaps needed for error handling. Unfortunately, when we increase > address space size (48b ppgtt) we do additional (512 - 4) calls to > kcalloc, increasing latency between exec and actual start of execution > on the GPU. Let's just do a single kcalloc, we can also drop the size > from free_gen8_temp_bitmaps since it's no longer used. > > v2: Use GFP_TEMPORARY to make the allocations reclaimable. > v3: Drop the 2D array, just allocate a single block. > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > Cc: Michel Thierry <michel.thierry@intel.com> > Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com> Unless Chris thinks otherwise, I see Micha? already addressed his comments. Reviewed-by: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_gem_gtt.c | 45 +++++++++++++------------------------ > 1 file changed, 16 insertions(+), 29 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c > index 4a76807..f810762 100644 > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c > @@ -1126,13 +1126,8 @@ unwind_out: > } > > static void > -free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts, > - uint32_t pdpes) > +free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long *new_pts) > { > - int i; > - > - for (i = 0; i < pdpes; i++) > - kfree(new_pts[i]); > kfree(new_pts); > kfree(new_pds); > } > @@ -1142,29 +1137,20 @@ free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts, > */ > static > int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds, > - unsigned long ***new_pts, > + unsigned long **new_pts, > uint32_t pdpes) > { > - int i; > unsigned long *pds; > - unsigned long **pts; > + unsigned long *pts; > > - pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL); > + pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_TEMPORARY); > if (!pds) > return -ENOMEM; > > - pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL); > - if (!pts) { > - kfree(pds); > - return -ENOMEM; > - } > - > - for (i = 0; i < pdpes; i++) { > - pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES), > - sizeof(unsigned long), GFP_KERNEL); > - if (!pts[i]) > - goto err_out; > - } > + pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES), > + sizeof(unsigned long), GFP_TEMPORARY); > + if (!pts) > + goto err_out; > > *new_pds = pds; > *new_pts = pts; > @@ -1172,7 +1158,7 @@ int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds, > return 0; > > err_out: > - free_gen8_temp_bitmaps(pds, pts, pdpes); > + free_gen8_temp_bitmaps(pds, pts); > return -ENOMEM; > } > > @@ -1193,7 +1179,7 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, > { > struct i915_hw_ppgtt *ppgtt = > container_of(vm, struct i915_hw_ppgtt, base); > - unsigned long *new_page_dirs, **new_page_tables; > + unsigned long *new_page_dirs, *new_page_tables; > struct drm_device *dev = vm->dev; > struct i915_page_directory *pd; > const uint64_t orig_start = start; > @@ -1220,14 +1206,14 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, > ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length, > new_page_dirs); > if (ret) { > - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); > + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); > return ret; > } > > /* For every page directory referenced, allocate page tables */ > gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) { > ret = gen8_ppgtt_alloc_pagetabs(vm, pd, start, length, > - new_page_tables[pdpe]); > + new_page_tables + pdpe * BITS_TO_LONGS(I915_PDES)); > if (ret) > goto err_out; > } > @@ -1278,20 +1264,21 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, > gen8_setup_page_directory(ppgtt, pdp, pd, pdpe); > } > > - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); > + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); > mark_tlbs_dirty(ppgtt); > return 0; > > err_out: > while (pdpe--) { > - for_each_set_bit(temp, new_page_tables[pdpe], I915_PDES) > + for_each_set_bit(temp, new_page_tables + pdpe * > + BITS_TO_LONGS(I915_PDES), I915_PDES) > free_pt(dev, pdp->page_directory[pdpe]->page_table[temp]); > } > > for_each_set_bit(pdpe, new_page_dirs, pdpes) > free_pd(dev, pdp->page_directory[pdpe]); > > - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); > + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); > mark_tlbs_dirty(ppgtt); > return ret; > } >
On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote: > On 9/1/2015 10:06 AM, Micha? Winiarski wrote: > >On each call to gen8_alloc_va_range_3lvl we're allocating temporary > >bitmaps needed for error handling. Unfortunately, when we increase > >address space size (48b ppgtt) we do additional (512 - 4) calls to > >kcalloc, increasing latency between exec and actual start of execution > >on the GPU. Let's just do a single kcalloc, we can also drop the size > >from free_gen8_temp_bitmaps since it's no longer used. > > > >v2: Use GFP_TEMPORARY to make the allocations reclaimable. > >v3: Drop the 2D array, just allocate a single block. > > > >Cc: Chris Wilson <chris@chris-wilson.co.uk> > >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > >Cc: Michel Thierry <michel.thierry@intel.com> > >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com> > > Unless Chris thinks otherwise, I see Micha? already addressed his comments. I think I spotted a misaligned bracket... ;) Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chri
On Wed, Sep 02, 2015 at 02:46:41PM +0100, Chris Wilson wrote: > On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote: > > On 9/1/2015 10:06 AM, Micha? Winiarski wrote: > > >On each call to gen8_alloc_va_range_3lvl we're allocating temporary > > >bitmaps needed for error handling. Unfortunately, when we increase > > >address space size (48b ppgtt) we do additional (512 - 4) calls to > > >kcalloc, increasing latency between exec and actual start of execution > > >on the GPU. Let's just do a single kcalloc, we can also drop the size > > >from free_gen8_temp_bitmaps since it's no longer used. > > > > > >v2: Use GFP_TEMPORARY to make the allocations reclaimable. > > >v3: Drop the 2D array, just allocate a single block. > > > > > >Cc: Chris Wilson <chris@chris-wilson.co.uk> > > >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > > >Cc: Michel Thierry <michel.thierry@intel.com> > > >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com> > > > > Unless Chris thinks otherwise, I see Micha? already addressed his comments. > > I think I spotted a misaligned bracket... ;) checkpatch didn't spot it and neither me ... > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Queued for -next, thanks for the patch. -Daniel
On Wed, Sep 02, 2015 at 05:13:29PM +0200, Daniel Vetter wrote: > On Wed, Sep 02, 2015 at 02:46:41PM +0100, Chris Wilson wrote: > > On Wed, Sep 02, 2015 at 02:40:03PM +0100, Michel Thierry wrote: > > > On 9/1/2015 10:06 AM, Micha? Winiarski wrote: > > > >On each call to gen8_alloc_va_range_3lvl we're allocating temporary > > > >bitmaps needed for error handling. Unfortunately, when we increase > > > >address space size (48b ppgtt) we do additional (512 - 4) calls to > > > >kcalloc, increasing latency between exec and actual start of execution > > > >on the GPU. Let's just do a single kcalloc, we can also drop the size > > > >from free_gen8_temp_bitmaps since it's no longer used. > > > > > > > >v2: Use GFP_TEMPORARY to make the allocations reclaimable. > > > >v3: Drop the 2D array, just allocate a single block. > > > > > > > >Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > >Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > > > >Cc: Michel Thierry <michel.thierry@intel.com> > > > >Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com> > > > > > > Unless Chris thinks otherwise, I see Micha? already addressed his comments. > > > > I think I spotted a misaligned bracket... ;) > > checkpatch didn't spot it and neither me ... > > > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> > > Queued for -next, thanks for the patch. Strike that, patch doesn't compile too well on plain dinq. What's going on here? And there doesn't seem to be anything in -nightly which isn't in dinq ... -Daniel
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 4a76807..f810762 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1126,13 +1126,8 @@ unwind_out: } static void -free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts, - uint32_t pdpes) +free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long *new_pts) { - int i; - - for (i = 0; i < pdpes; i++) - kfree(new_pts[i]); kfree(new_pts); kfree(new_pds); } @@ -1142,29 +1137,20 @@ free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts, */ static int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds, - unsigned long ***new_pts, + unsigned long **new_pts, uint32_t pdpes) { - int i; unsigned long *pds; - unsigned long **pts; + unsigned long *pts; - pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL); + pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_TEMPORARY); if (!pds) return -ENOMEM; - pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL); - if (!pts) { - kfree(pds); - return -ENOMEM; - } - - for (i = 0; i < pdpes; i++) { - pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES), - sizeof(unsigned long), GFP_KERNEL); - if (!pts[i]) - goto err_out; - } + pts = kcalloc(pdpes * BITS_TO_LONGS(I915_PDES), + sizeof(unsigned long), GFP_TEMPORARY); + if (!pts) + goto err_out; *new_pds = pds; *new_pts = pts; @@ -1172,7 +1158,7 @@ int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds, return 0; err_out: - free_gen8_temp_bitmaps(pds, pts, pdpes); + free_gen8_temp_bitmaps(pds, pts); return -ENOMEM; } @@ -1193,7 +1179,7 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, { struct i915_hw_ppgtt *ppgtt = container_of(vm, struct i915_hw_ppgtt, base); - unsigned long *new_page_dirs, **new_page_tables; + unsigned long *new_page_dirs, *new_page_tables; struct drm_device *dev = vm->dev; struct i915_page_directory *pd; const uint64_t orig_start = start; @@ -1220,14 +1206,14 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length, new_page_dirs); if (ret) { - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); return ret; } /* For every page directory referenced, allocate page tables */ gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) { ret = gen8_ppgtt_alloc_pagetabs(vm, pd, start, length, - new_page_tables[pdpe]); + new_page_tables + pdpe * BITS_TO_LONGS(I915_PDES)); if (ret) goto err_out; } @@ -1278,20 +1264,21 @@ static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm, gen8_setup_page_directory(ppgtt, pdp, pd, pdpe); } - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); mark_tlbs_dirty(ppgtt); return 0; err_out: while (pdpe--) { - for_each_set_bit(temp, new_page_tables[pdpe], I915_PDES) + for_each_set_bit(temp, new_page_tables + pdpe * + BITS_TO_LONGS(I915_PDES), I915_PDES) free_pt(dev, pdp->page_directory[pdpe]->page_table[temp]); } for_each_set_bit(pdpe, new_page_dirs, pdpes) free_pd(dev, pdp->page_directory[pdpe]); - free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes); + free_gen8_temp_bitmaps(new_page_dirs, new_page_tables); mark_tlbs_dirty(ppgtt); return ret; }
On each call to gen8_alloc_va_range_3lvl we're allocating temporary bitmaps needed for error handling. Unfortunately, when we increase address space size (48b ppgtt) we do additional (512 - 4) calls to kcalloc, increasing latency between exec and actual start of execution on the GPU. Let's just do a single kcalloc, we can also drop the size from free_gen8_temp_bitmaps since it's no longer used. v2: Use GFP_TEMPORARY to make the allocations reclaimable. v3: Drop the 2D array, just allocate a single block. Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Michel Thierry <michel.thierry@intel.com> Signed-off-by: Micha? Winiarski <michal.winiarski@intel.com> --- drivers/gpu/drm/i915/i915_gem_gtt.c | 45 +++++++++++++------------------------ 1 file changed, 16 insertions(+), 29 deletions(-)