
[3/3] drm/ttm: under memory pressure minimize the size of memory pool

Message ID 1407901926-24516-4-git-send-email-j.glisse@gmail.com (mailing list archive)
State New, archived

Commit Message

Jerome Glisse Aug. 13, 2014, 3:52 a.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

When experiencing memory pressure we want to minimize the pool size so
that memory we just shrank is not added back again right away.

This divides the maximum pool size for each device by 2 each time the
pool has to shrink. The limit is bumped back up if the next allocation
happens more than one second after the last shrink. The one-second delay
is obviously an arbitrary choice.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
Cc: Michel Dänzer <michel@daenzer.net>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/gpu/drm/ttm/ttm_page_alloc.c     | 35 +++++++++++++++++++++++++-------
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 27 ++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 9 deletions(-)
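
For readers skimming the thread, here is a small standalone sketch (editorially
added, not part of the patch; all names and the userspace timekeeping are
illustrative only) of the behavior the commit message describes: the cap is
halved on every shrinker pass and doubled back, between a 512-page floor and
the configured maximum, once allocations resume more than a second after the
last shrink.

	/* Illustrative model of the adaptive pool limit, in plain userspace C. */
	#include <stdio.h>
	#include <time.h>

	#define POOL_MAX_PAGES 4096u	/* stand-in for _manager->options.max_size */
	#define POOL_MIN_PAGES  512u	/* floor used when the limit reaches zero */

	struct demo_pool {
		unsigned cur_max_size;	 /* current cap on cached pages */
		time_t   shrink_timeout; /* earliest time the cap may grow again */
	};

	static void demo_shrink(struct demo_pool *p, time_t now)
	{
		p->cur_max_size >>= 1;		/* do not grow for the next second */
		p->shrink_timeout = now + 1;
	}

	static void demo_alloc(struct demo_pool *p, time_t now)
	{
		if (now < p->shrink_timeout)
			return;			/* still under memory pressure */
		p->cur_max_size <<= 1;		/* bump the cap back up */
		if (p->cur_max_size < POOL_MIN_PAGES)
			p->cur_max_size = POOL_MIN_PAGES;
		if (p->cur_max_size > POOL_MAX_PAGES)
			p->cur_max_size = POOL_MAX_PAGES;
	}

	int main(void)
	{
		struct demo_pool p = { .cur_max_size = POOL_MAX_PAGES };
		time_t now = time(NULL);

		demo_shrink(&p, now);		/* 4096 -> 2048 */
		demo_shrink(&p, now);		/* 2048 -> 1024 */
		demo_alloc(&p, now);		/* too soon, cap stays at 1024 */
		demo_alloc(&p, now + 2);	/* one idle second passed, 1024 -> 2048 */
		printf("cap after recovery: %u pages\n", p.cur_max_size);
		return 0;
	}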

Comments

Michel Dänzer Aug. 13, 2014, 6:32 a.m. UTC | #1
On 13.08.2014 12:52, Jérôme Glisse wrote:
> From: Jérôme Glisse <jglisse@redhat.com>
> 
> When experiencing memory pressure we want to minimize pool size so that
> memory we just shrinked is not added back again just as the next thing.
> 
> This will divide by 2 the maximum pool size for each device each time
> the pool have to shrink. The limit is bumped again is next allocation
> happen after one second since the last shrink. The one second delay is
> obviously an arbitrary choice.
> 
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>

[...]

> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> index 09874d6..ab41adf 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> @@ -68,6 +68,8 @@
>   * @list: Pool of free uc/wc pages for fast reuse.
>   * @gfp_flags: Flags to pass for alloc_page.
>   * @npages: Number of pages in pool.
> + * @cur_max_size: Current maximum size for the pool.
> + * @shrink_timeout: Timeout for pool maximum size restriction.
>   */
>  struct ttm_page_pool {
>  	spinlock_t		lock;
> @@ -76,6 +78,8 @@ struct ttm_page_pool {
>  	gfp_t			gfp_flags;
>  	unsigned		npages;
>  	char			*name;
> +	unsigned		cur_max_size;
> +	unsigned long		last_shrink;

s/last_shrink/shrink_timeout/

Looks like maybe you posted an untested stale set of patches?


> @@ -289,6 +293,16 @@ static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
>  	pool->nfrees += freed_pages;
>  }
>  
> +static inline void ttm_pool_update_max_size(struct ttm_page_pool *pool)
> +{
> +	if (time_before(jiffies, pool->shrink_timeout))
> +		return;
> +	/* In case we reached zero bounce back to 512 pages. */
> +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);

Another 'comparison of distinct pointer types lacks a cast' warning.


Both issues apply to ttm_page_alloc_dma.c as well.
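
For reference, the warning Michel mentions comes from the kernel's type-checked
min()/max() macros: the constant 512 is a signed int while pool->cur_max_size
is unsigned, and the macros compare pointers to temporaries of each operand's
type, which warns when the types differ. A hypothetical fix (not taken from
this thread) would either use an unsigned constant (512u) or the typed max_t()
variant:

	/*
	 * Sketch of a possible fix for the type-mismatch warning: max_t()
	 * evaluates both operands as the given type, so the macro's internal
	 * pointer comparison no longer mixes int and unsigned.
	 */
	pool->cur_max_size = max_t(unsigned, pool->cur_max_size << 1, 512);
	pool->cur_max_size = min(pool->cur_max_size,
				 _manager->options.max_size);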
Thomas Hellstrom Aug. 13, 2014, 9:06 a.m. UTC | #2
On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> From: Jérôme Glisse <jglisse@redhat.com>
>
> When experiencing memory pressure we want to minimize pool size so that
> memory we just shrinked is not added back again just as the next thing.
>
> This will divide by 2 the maximum pool size for each device each time
> the pool have to shrink. The limit is bumped again is next allocation
> happen after one second since the last shrink. The one second delay is
> obviously an arbitrary choice.

Jérôme,

I don't like this patch. It adds extra complexity and its usefulness is
highly questionable.
There are a number of caches in the system, and if all of them added
some sort of voluntary shrink heuristics like this, we'd end up with
impossible-to-debug unpredictable performance issues.

We should let the memory subsystem decide when to reclaim pages from
caches and what caches to reclaim them from.

/Thomas
>
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
> Cc: Michel Dänzer <michel@daenzer.net>
> Cc: Thomas Hellstrom <thellstrom@vmware.com>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
>  drivers/gpu/drm/ttm/ttm_page_alloc.c     | 35 +++++++++++++++++++++++++-------
>  drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 27 ++++++++++++++++++++++--
>  2 files changed, 53 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> index 09874d6..ab41adf 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> @@ -68,6 +68,8 @@
>   * @list: Pool of free uc/wc pages for fast reuse.
>   * @gfp_flags: Flags to pass for alloc_page.
>   * @npages: Number of pages in pool.
> + * @cur_max_size: Current maximum size for the pool.
> + * @shrink_timeout: Timeout for pool maximum size restriction.
>   */
>  struct ttm_page_pool {
>  	spinlock_t		lock;
> @@ -76,6 +78,8 @@ struct ttm_page_pool {
>  	gfp_t			gfp_flags;
>  	unsigned		npages;
>  	char			*name;
> +	unsigned		cur_max_size;
> +	unsigned long		last_shrink;
>  	unsigned long		nfrees;
>  	unsigned long		nrefills;
>  };
> @@ -289,6 +293,16 @@ static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
>  	pool->nfrees += freed_pages;
>  }
>  
> +static inline void ttm_pool_update_max_size(struct ttm_page_pool *pool)
> +{
> +	if (time_before(jiffies, pool->shrink_timeout))
> +		return;
> +	/* In case we reached zero bounce back to 512 pages. */
> +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
> +	pool->cur_max_size = min(pool->cur_max_size,
> +				 _manager->options.max_size);
> +}
> +
>  /**
>   * Free pages from pool.
>   *
> @@ -407,6 +421,9 @@ ttm_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  		if (shrink_pages == 0)
>  			break;
>  		pool = &_manager->pools[(i + pool_offset)%NUM_POOLS];
> +		/* No matter what make sure the pool do not grow in next second. */
> +		pool->cur_max_size = pool->cur_max_size >> 1;
> +		pool->shrink_timeout = jiffies + HZ;
>  		shrink_pages = ttm_page_pool_free(pool, nr_free,
>  						  sc->gfp_mask);
>  		freed += nr_free - shrink_pages;
> @@ -701,13 +718,12 @@ static void ttm_put_pages(struct page **pages, unsigned npages, int flags,
>  	}
>  	/* Check that we don't go over the pool limit */
>  	npages = 0;
> -	if (pool->npages > _manager->options.max_size) {
> -		npages = pool->npages - _manager->options.max_size;
> -		/* free at least NUM_PAGES_TO_ALLOC number of pages
> -		 * to reduce calls to set_memory_wb */
> -		if (npages < NUM_PAGES_TO_ALLOC)
> -			npages = NUM_PAGES_TO_ALLOC;
> -	}
> +	/*
> +	 * Free at least NUM_PAGES_TO_ALLOC number of pages to reduce calls to
> +	 * set_memory_wb.
> +	 */
> +	if (pool->npages > (pool->cur_max_size + NUM_PAGES_TO_ALLOC))
> +		npages = pool->npages - pool->cur_max_size;
>  	spin_unlock_irqrestore(&pool->lock, irq_flags);
>  	if (npages)
>  		ttm_page_pool_free(pool, npages, GFP_KERNEL);
> @@ -751,6 +767,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
>  		return 0;
>  	}
>  
> +	/* Update pool size in case shrinker limited it. */
> +	ttm_pool_update_max_size(pool);
> +
>  	/* combine zero flag to pool flags */
>  	gfp_flags |= pool->gfp_flags;
>  
> @@ -803,6 +822,8 @@ static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, gfp_t flags,
>  	pool->npages = pool->nfrees = 0;
>  	pool->gfp_flags = flags;
>  	pool->name = name;
> +	pool->cur_max_size = _manager->options.max_size;
> +	pool->shrink_timeout = jiffies;
>  }
>  
>  int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index a076ff3..80b10aa 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -93,6 +93,8 @@ enum pool_type {
>   * @size: Size used during DMA allocation.
>   * @npages_free: Count of available pages for re-use.
>   * @npages_in_use: Count of pages that are in use.
> + * @cur_max_size: Current maximum size for the pool.
> + * @shrink_timeout: Timeout for pool maximum size restriction.
>   * @nfrees: Stats when pool is shrinking.
>   * @nrefills: Stats when the pool is grown.
>   * @gfp_flags: Flags to pass for alloc_page.
> @@ -110,6 +112,8 @@ struct dma_pool {
>  	unsigned size;
>  	unsigned npages_free;
>  	unsigned npages_in_use;
> +	unsigned cur_max_size;
> +	unsigned long last_shrink;
>  	unsigned long nfrees; /* Stats when shrunk. */
>  	unsigned long nrefills; /* Stats when grown. */
>  	gfp_t gfp_flags;
> @@ -331,6 +335,17 @@ static void __ttm_dma_free_page(struct dma_pool *pool, struct dma_page *d_page)
>  	kfree(d_page);
>  	d_page = NULL;
>  }
> +
> +static inline void ttm_dma_pool_update_max_size(struct dma_pool *pool)
> +{
> +	if (time_before(jiffies, pool->shrink_timeout))
> +		return;
> +	/* In case we reached zero bounce back to 512 pages. */
> +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
> +	pool->cur_max_size = min(pool->cur_max_size,
> +				 _manager->options.max_size);
> +}
> +
>  static struct dma_page *__ttm_dma_alloc_page(struct dma_pool *pool)
>  {
>  	struct dma_page *d_page;
> @@ -606,6 +621,8 @@ static struct dma_pool *ttm_dma_pool_init(struct device *dev, gfp_t flags,
>  	pool->size = PAGE_SIZE;
>  	pool->type = type;
>  	pool->nrefills = 0;
> +	pool->cur_max_size = _manager->options.max_size;
> +	pool->shrink_timeout = jiffies;
>  	p = pool->name;
>  	for (i = 0; i < 5; i++) {
>  		if (type & t[i]) {
> @@ -892,6 +909,9 @@ int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev)
>  		}
>  	}
>  
> +	/* Update pool size in case shrinker limited it. */
> +	ttm_dma_pool_update_max_size(pool);
> +
>  	INIT_LIST_HEAD(&ttm_dma->pages_list);
>  	for (i = 0; i < ttm->num_pages; ++i) {
>  		ret = ttm_dma_pool_get_pages(pool, ttm_dma, i);
> @@ -953,9 +973,9 @@ void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev)
>  	} else {
>  		pool->npages_free += count;
>  		list_splice(&ttm_dma->pages_list, &pool->free_list);
> -		if (pool->npages_free >= (_manager->options.max_size +
> +		if (pool->npages_free >= (pool->cur_max_size +
>  					  NUM_PAGES_TO_ALLOC))
> -			npages = pool->npages_free - _manager->options.max_size;
> +			npages = pool->npages_free - pool->cur_max_size;
>  	}
>  	spin_unlock_irqrestore(&pool->lock, irq_flags);
>  
> @@ -1024,6 +1044,9 @@ ttm_dma_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  		/* Do it in round-robin fashion. */
>  		if (++idx < pool_offset)
>  			continue;
> +		/* No matter what make sure the pool do not grow in next second. */
> +		p->pool->cur_max_size = p->pool->cur_max_size >> 1;
> +		p->pool->shrink_timeout = jiffies + HZ;
>  		nr_free = shrink_pages;
>  		shrink_pages = ttm_dma_page_pool_free(p->pool, nr_free,
>  						      sc->gfp_mask);
Daniel Vetter Aug. 13, 2014, 10:42 a.m. UTC | #3
On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> 
> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> > From: Jérôme Glisse <jglisse@redhat.com>
> >
> > When experiencing memory pressure we want to minimize pool size so that
> > memory we just shrinked is not added back again just as the next thing.
> >
> > This will divide by 2 the maximum pool size for each device each time
> > the pool have to shrink. The limit is bumped again is next allocation
> > happen after one second since the last shrink. The one second delay is
> > obviously an arbitrary choice.
> 
> Jérôme,
> 
> I don't like this patch. It adds extra complexity and its usefulness is
> highly questionable.
> There are a number of caches in the system, and if all of them added
> some sort of voluntary shrink heuristics like this, we'd end up with
> impossible-to-debug unpredictable performance issues.
> 
> We should let the memory subsystem decide when to reclaim pages from
> caches and what caches to reclaim them from.

Yeah, artificially limiting your cache from growing when your shrinker
gets called will just break the equal-memory pressure the core mm uses to
rebalance between all caches when workload changes. In i915 we let
everything grow without artificial bounds and only rely upon the shrinker
callbacks to ensure we don't consume more than our fair share of available
memory overall.
-Daniel
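
For context, the "shrinker callbacks" referred to here are the
count_objects()/scan_objects() hooks a cache registers with the mm core, which
then decides when and how much to reclaim. A minimal sketch of that interface
(kernel ~3.12+; the demo_cache_*() helpers are hypothetical) looks roughly
like this:

	#include <linux/shrinker.h>

	/* Report how many reclaimable objects the cache currently holds. */
	static unsigned long demo_cache_count(struct shrinker *shrink,
					      struct shrink_control *sc)
	{
		return demo_cache_nr_cached_pages();	/* hypothetical helper */
	}

	/*
	 * Release up to sc->nr_to_scan objects; return how many were freed,
	 * or SHRINK_STOP if no progress can be made right now.
	 */
	static unsigned long demo_cache_scan(struct shrinker *shrink,
					     struct shrink_control *sc)
	{
		unsigned long freed = demo_cache_free_pages(sc->nr_to_scan);

		return freed ? freed : SHRINK_STOP;
	}

	static struct shrinker demo_cache_shrinker = {
		.count_objects = demo_cache_count,
		.scan_objects  = demo_cache_scan,
		.seeks         = DEFAULT_SEEKS,
	};

	/* Registered once at init time: register_shrinker(&demo_cache_shrinker); */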

> 
> /Thomas
> >
> > Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> > Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
> > Cc: Michel Dänzer <michel@daenzer.net>
> > Cc: Thomas Hellstrom <thellstrom@vmware.com>
> > Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> > ---
> >  drivers/gpu/drm/ttm/ttm_page_alloc.c     | 35 +++++++++++++++++++++++++-------
> >  drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 27 ++++++++++++++++++++++--
> >  2 files changed, 53 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> > index 09874d6..ab41adf 100644
> > --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
> > +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> > @@ -68,6 +68,8 @@
> >   * @list: Pool of free uc/wc pages for fast reuse.
> >   * @gfp_flags: Flags to pass for alloc_page.
> >   * @npages: Number of pages in pool.
> > + * @cur_max_size: Current maximum size for the pool.
> > + * @shrink_timeout: Timeout for pool maximum size restriction.
> >   */
> >  struct ttm_page_pool {
> >  	spinlock_t		lock;
> > @@ -76,6 +78,8 @@ struct ttm_page_pool {
> >  	gfp_t			gfp_flags;
> >  	unsigned		npages;
> >  	char			*name;
> > +	unsigned		cur_max_size;
> > +	unsigned long		last_shrink;
> >  	unsigned long		nfrees;
> >  	unsigned long		nrefills;
> >  };
> > @@ -289,6 +293,16 @@ static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
> >  	pool->nfrees += freed_pages;
> >  }
> >  
> > +static inline void ttm_pool_update_max_size(struct ttm_page_pool *pool)
> > +{
> > +	if (time_before(jiffies, pool->shrink_timeout))
> > +		return;
> > +	/* In case we reached zero bounce back to 512 pages. */
> > +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
> > +	pool->cur_max_size = min(pool->cur_max_size,
> > +				 _manager->options.max_size);
> > +}
> > +
> >  /**
> >   * Free pages from pool.
> >   *
> > @@ -407,6 +421,9 @@ ttm_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  		if (shrink_pages == 0)
> >  			break;
> >  		pool = &_manager->pools[(i + pool_offset)%NUM_POOLS];
> > +		/* No matter what make sure the pool do not grow in next second. */
> > +		pool->cur_max_size = pool->cur_max_size >> 1;
> > +		pool->shrink_timeout = jiffies + HZ;
> >  		shrink_pages = ttm_page_pool_free(pool, nr_free,
> >  						  sc->gfp_mask);
> >  		freed += nr_free - shrink_pages;
> > @@ -701,13 +718,12 @@ static void ttm_put_pages(struct page **pages, unsigned npages, int flags,
> >  	}
> >  	/* Check that we don't go over the pool limit */
> >  	npages = 0;
> > -	if (pool->npages > _manager->options.max_size) {
> > -		npages = pool->npages - _manager->options.max_size;
> > -		/* free at least NUM_PAGES_TO_ALLOC number of pages
> > -		 * to reduce calls to set_memory_wb */
> > -		if (npages < NUM_PAGES_TO_ALLOC)
> > -			npages = NUM_PAGES_TO_ALLOC;
> > -	}
> > +	/*
> > +	 * Free at least NUM_PAGES_TO_ALLOC number of pages to reduce calls to
> > +	 * set_memory_wb.
> > +	 */
> > +	if (pool->npages > (pool->cur_max_size + NUM_PAGES_TO_ALLOC))
> > +		npages = pool->npages - pool->cur_max_size;
> >  	spin_unlock_irqrestore(&pool->lock, irq_flags);
> >  	if (npages)
> >  		ttm_page_pool_free(pool, npages, GFP_KERNEL);
> > @@ -751,6 +767,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
> >  		return 0;
> >  	}
> >  
> > +	/* Update pool size in case shrinker limited it. */
> > +	ttm_pool_update_max_size(pool);
> > +
> >  	/* combine zero flag to pool flags */
> >  	gfp_flags |= pool->gfp_flags;
> >  
> > @@ -803,6 +822,8 @@ static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, gfp_t flags,
> >  	pool->npages = pool->nfrees = 0;
> >  	pool->gfp_flags = flags;
> >  	pool->name = name;
> > +	pool->cur_max_size = _manager->options.max_size;
> > +	pool->shrink_timeout = jiffies;
> >  }
> >  
> >  int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
> > diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> > index a076ff3..80b10aa 100644
> > --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> > +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> > @@ -93,6 +93,8 @@ enum pool_type {
> >   * @size: Size used during DMA allocation.
> >   * @npages_free: Count of available pages for re-use.
> >   * @npages_in_use: Count of pages that are in use.
> > + * @cur_max_size: Current maximum size for the pool.
> > + * @shrink_timeout: Timeout for pool maximum size restriction.
> >   * @nfrees: Stats when pool is shrinking.
> >   * @nrefills: Stats when the pool is grown.
> >   * @gfp_flags: Flags to pass for alloc_page.
> > @@ -110,6 +112,8 @@ struct dma_pool {
> >  	unsigned size;
> >  	unsigned npages_free;
> >  	unsigned npages_in_use;
> > +	unsigned cur_max_size;
> > +	unsigned long last_shrink;
> >  	unsigned long nfrees; /* Stats when shrunk. */
> >  	unsigned long nrefills; /* Stats when grown. */
> >  	gfp_t gfp_flags;
> > @@ -331,6 +335,17 @@ static void __ttm_dma_free_page(struct dma_pool *pool, struct dma_page *d_page)
> >  	kfree(d_page);
> >  	d_page = NULL;
> >  }
> > +
> > +static inline void ttm_dma_pool_update_max_size(struct dma_pool *pool)
> > +{
> > +	if (time_before(jiffies, pool->shrink_timeout))
> > +		return;
> > +	/* In case we reached zero bounce back to 512 pages. */
> > +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
> > +	pool->cur_max_size = min(pool->cur_max_size,
> > +				 _manager->options.max_size);
> > +}
> > +
> >  static struct dma_page *__ttm_dma_alloc_page(struct dma_pool *pool)
> >  {
> >  	struct dma_page *d_page;
> > @@ -606,6 +621,8 @@ static struct dma_pool *ttm_dma_pool_init(struct device *dev, gfp_t flags,
> >  	pool->size = PAGE_SIZE;
> >  	pool->type = type;
> >  	pool->nrefills = 0;
> > +	pool->cur_max_size = _manager->options.max_size;
> > +	pool->shrink_timeout = jiffies;
> >  	p = pool->name;
> >  	for (i = 0; i < 5; i++) {
> >  		if (type & t[i]) {
> > @@ -892,6 +909,9 @@ int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev)
> >  		}
> >  	}
> >  
> > +	/* Update pool size in case shrinker limited it. */
> > +	ttm_dma_pool_update_max_size(pool);
> > +
> >  	INIT_LIST_HEAD(&ttm_dma->pages_list);
> >  	for (i = 0; i < ttm->num_pages; ++i) {
> >  		ret = ttm_dma_pool_get_pages(pool, ttm_dma, i);
> > @@ -953,9 +973,9 @@ void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev)
> >  	} else {
> >  		pool->npages_free += count;
> >  		list_splice(&ttm_dma->pages_list, &pool->free_list);
> > -		if (pool->npages_free >= (_manager->options.max_size +
> > +		if (pool->npages_free >= (pool->cur_max_size +
> >  					  NUM_PAGES_TO_ALLOC))
> > -			npages = pool->npages_free - _manager->options.max_size;
> > +			npages = pool->npages_free - pool->cur_max_size;
> >  	}
> >  	spin_unlock_irqrestore(&pool->lock, irq_flags);
> >  
> > @@ -1024,6 +1044,9 @@ ttm_dma_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  		/* Do it in round-robin fashion. */
> >  		if (++idx < pool_offset)
> >  			continue;
> > +		/* No matter what make sure the pool do not grow in next second. */
> > +		p->pool->cur_max_size = p->pool->cur_max_size >> 1;
> > +		p->pool->shrink_timeout = jiffies + HZ;
> >  		nr_free = shrink_pages;
> >  		shrink_pages = ttm_dma_page_pool_free(p->pool, nr_free,
> >  						      sc->gfp_mask);
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
Thomas Hellstrom Aug. 13, 2014, 12:35 p.m. UTC | #4
On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>
>>> When experiencing memory pressure we want to minimize pool size so that
>>> memory we just shrinked is not added back again just as the next thing.
>>>
>>> This will divide by 2 the maximum pool size for each device each time
>>> the pool have to shrink. The limit is bumped again is next allocation
>>> happen after one second since the last shrink. The one second delay is
>>> obviously an arbitrary choice.
>> Jérôme,
>>
>> I don't like this patch. It adds extra complexity and its usefulness is
>> highly questionable.
>> There are a number of caches in the system, and if all of them added
>> some sort of voluntary shrink heuristics like this, we'd end up with
>> impossible-to-debug unpredictable performance issues.
>>
>> We should let the memory subsystem decide when to reclaim pages from
>> caches and what caches to reclaim them from.
> Yeah, artificially limiting your cache from growing when your shrinker
> gets called will just break the equal-memory pressure the core mm uses to
> rebalance between all caches when workload changes. In i915 we let
> everything grow without artificial bounds and only rely upon the shrinker
> callbacks to ensure we don't consume more than our fair share of available
> memory overall.
> -Daniel

Now when you bring i915 memory usage up, Daniel,
I can't refrain from bringing up the old user-space unreclaimable kernel
memory issue, for which gem open is a good example ;) Each time
user-space opens a gem handle, some un-reclaimable kernel memory is
allocated, for which there is no accounting, so theoretically I think a
user can bring a system to unusability this way.

Typically there are various limits on unreclaimable objects like this,
like open file descriptors, and IIRC the kernel even has an internal
limit on the number of struct files you initialize, based on the
available system memory, so dma-buf / prime should already have some
sort of protection.

/Thomas


>> /Thomas
>>> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
>>> Cc: Mario Kleiner <mario.kleiner.de@gmail.com>
>>> Cc: Michel Dänzer <michel@daenzer.net>
>>> Cc: Thomas Hellstrom <thellstrom@vmware.com>
>>> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>>> ---
>>>  drivers/gpu/drm/ttm/ttm_page_alloc.c     | 35 +++++++++++++++++++++++++-------
>>>  drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 27 ++++++++++++++++++++++--
>>>  2 files changed, 53 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
>>> index 09874d6..ab41adf 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
>>> @@ -68,6 +68,8 @@
>>>   * @list: Pool of free uc/wc pages for fast reuse.
>>>   * @gfp_flags: Flags to pass for alloc_page.
>>>   * @npages: Number of pages in pool.
>>> + * @cur_max_size: Current maximum size for the pool.
>>> + * @shrink_timeout: Timeout for pool maximum size restriction.
>>>   */
>>>  struct ttm_page_pool {
>>>  	spinlock_t		lock;
>>> @@ -76,6 +78,8 @@ struct ttm_page_pool {
>>>  	gfp_t			gfp_flags;
>>>  	unsigned		npages;
>>>  	char			*name;
>>> +	unsigned		cur_max_size;
>>> +	unsigned long		last_shrink;
>>>  	unsigned long		nfrees;
>>>  	unsigned long		nrefills;
>>>  };
>>> @@ -289,6 +293,16 @@ static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
>>>  	pool->nfrees += freed_pages;
>>>  }
>>>  
>>> +static inline void ttm_pool_update_max_size(struct ttm_page_pool *pool)
>>> +{
>>> +	if (time_before(jiffies, pool->shrink_timeout))
>>> +		return;
>>> +	/* In case we reached zero bounce back to 512 pages. */
>>> +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
>>> +	pool->cur_max_size = min(pool->cur_max_size,
>>> +				 _manager->options.max_size);
>>> +}
>>> +
>>>  /**
>>>   * Free pages from pool.
>>>   *
>>> @@ -407,6 +421,9 @@ ttm_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>>>  		if (shrink_pages == 0)
>>>  			break;
>>>  		pool = &_manager->pools[(i + pool_offset)%NUM_POOLS];
>>> +		/* No matter what make sure the pool do not grow in next second. */
>>> +		pool->cur_max_size = pool->cur_max_size >> 1;
>>> +		pool->shrink_timeout = jiffies + HZ;
>>>  		shrink_pages = ttm_page_pool_free(pool, nr_free,
>>>  						  sc->gfp_mask);
>>>  		freed += nr_free - shrink_pages;
>>> @@ -701,13 +718,12 @@ static void ttm_put_pages(struct page **pages, unsigned npages, int flags,
>>>  	}
>>>  	/* Check that we don't go over the pool limit */
>>>  	npages = 0;
>>> -	if (pool->npages > _manager->options.max_size) {
>>> -		npages = pool->npages - _manager->options.max_size;
>>> -		/* free at least NUM_PAGES_TO_ALLOC number of pages
>>> -		 * to reduce calls to set_memory_wb */
>>> -		if (npages < NUM_PAGES_TO_ALLOC)
>>> -			npages = NUM_PAGES_TO_ALLOC;
>>> -	}
>>> +	/*
>>> +	 * Free at least NUM_PAGES_TO_ALLOC number of pages to reduce calls to
>>> +	 * set_memory_wb.
>>> +	 */
>>> +	if (pool->npages > (pool->cur_max_size + NUM_PAGES_TO_ALLOC))
>>> +		npages = pool->npages - pool->cur_max_size;
>>>  	spin_unlock_irqrestore(&pool->lock, irq_flags);
>>>  	if (npages)
>>>  		ttm_page_pool_free(pool, npages, GFP_KERNEL);
>>> @@ -751,6 +767,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
>>>  		return 0;
>>>  	}
>>>  
>>> +	/* Update pool size in case shrinker limited it. */
>>> +	ttm_pool_update_max_size(pool);
>>> +
>>>  	/* combine zero flag to pool flags */
>>>  	gfp_flags |= pool->gfp_flags;
>>>  
>>> @@ -803,6 +822,8 @@ static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, gfp_t flags,
>>>  	pool->npages = pool->nfrees = 0;
>>>  	pool->gfp_flags = flags;
>>>  	pool->name = name;
>>> +	pool->cur_max_size = _manager->options.max_size;
>>> +	pool->shrink_timeout = jiffies;
>>>  }
>>>  
>>>  int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
>>> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>>> index a076ff3..80b10aa 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
>>> @@ -93,6 +93,8 @@ enum pool_type {
>>>   * @size: Size used during DMA allocation.
>>>   * @npages_free: Count of available pages for re-use.
>>>   * @npages_in_use: Count of pages that are in use.
>>> + * @cur_max_size: Current maximum size for the pool.
>>> + * @shrink_timeout: Timeout for pool maximum size restriction.
>>>   * @nfrees: Stats when pool is shrinking.
>>>   * @nrefills: Stats when the pool is grown.
>>>   * @gfp_flags: Flags to pass for alloc_page.
>>> @@ -110,6 +112,8 @@ struct dma_pool {
>>>  	unsigned size;
>>>  	unsigned npages_free;
>>>  	unsigned npages_in_use;
>>> +	unsigned cur_max_size;
>>> +	unsigned long last_shrink;
>>>  	unsigned long nfrees; /* Stats when shrunk. */
>>>  	unsigned long nrefills; /* Stats when grown. */
>>>  	gfp_t gfp_flags;
>>> @@ -331,6 +335,17 @@ static void __ttm_dma_free_page(struct dma_pool *pool, struct dma_page *d_page)
>>>  	kfree(d_page);
>>>  	d_page = NULL;
>>>  }
>>> +
>>> +static inline void ttm_dma_pool_update_max_size(struct dma_pool *pool)
>>> +{
>>> +	if (time_before(jiffies, pool->shrink_timeout))
>>> +		return;
>>> +	/* In case we reached zero bounce back to 512 pages. */
>>> +	pool->cur_max_size = max(pool->cur_max_size << 1, 512);
>>> +	pool->cur_max_size = min(pool->cur_max_size,
>>> +				 _manager->options.max_size);
>>> +}
>>> +
>>>  static struct dma_page *__ttm_dma_alloc_page(struct dma_pool *pool)
>>>  {
>>>  	struct dma_page *d_page;
>>> @@ -606,6 +621,8 @@ static struct dma_pool *ttm_dma_pool_init(struct device *dev, gfp_t flags,
>>>  	pool->size = PAGE_SIZE;
>>>  	pool->type = type;
>>>  	pool->nrefills = 0;
>>> +	pool->cur_max_size = _manager->options.max_size;
>>> +	pool->shrink_timeout = jiffies;
>>>  	p = pool->name;
>>>  	for (i = 0; i < 5; i++) {
>>>  		if (type & t[i]) {
>>> @@ -892,6 +909,9 @@ int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev)
>>>  		}
>>>  	}
>>>  
>>> +	/* Update pool size in case shrinker limited it. */
>>> +	ttm_dma_pool_update_max_size(pool);
>>> +
>>>  	INIT_LIST_HEAD(&ttm_dma->pages_list);
>>>  	for (i = 0; i < ttm->num_pages; ++i) {
>>>  		ret = ttm_dma_pool_get_pages(pool, ttm_dma, i);
>>> @@ -953,9 +973,9 @@ void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev)
>>>  	} else {
>>>  		pool->npages_free += count;
>>>  		list_splice(&ttm_dma->pages_list, &pool->free_list);
>>> -		if (pool->npages_free >= (_manager->options.max_size +
>>> +		if (pool->npages_free >= (pool->cur_max_size +
>>>  					  NUM_PAGES_TO_ALLOC))
>>> -			npages = pool->npages_free - _manager->options.max_size;
>>> +			npages = pool->npages_free - pool->cur_max_size;
>>>  	}
>>>  	spin_unlock_irqrestore(&pool->lock, irq_flags);
>>>  
>>> @@ -1024,6 +1044,9 @@ ttm_dma_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>>>  		/* Do it in round-robin fashion. */
>>>  		if (++idx < pool_offset)
>>>  			continue;
>>> +		/* No matter what make sure the pool do not grow in next second. */
>>> +		p->pool->cur_max_size = p->pool->cur_max_size >> 1;
>>> +		p->pool->shrink_timeout = jiffies + HZ;
>>>  		nr_free = shrink_pages;
>>>  		shrink_pages = ttm_dma_page_pool_free(p->pool, nr_free,
>>>  						      sc->gfp_mask);
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
> >> http://lists.freedesktop.org/mailman/listinfo/dri-devel
David Herrmann Aug. 13, 2014, 12:40 p.m. UTC | #5
Hi

On Wed, Aug 13, 2014 at 2:35 PM, Thomas Hellstrom <thellstrom@vmware.com> wrote:
> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>
>>>> When experiencing memory pressure we want to minimize pool size so that
>>>> memory we just shrinked is not added back again just as the next thing.
>>>>
>>>> This will divide by 2 the maximum pool size for each device each time
>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>> happen after one second since the last shrink. The one second delay is
>>>> obviously an arbitrary choice.
>>> Jérôme,
>>>
>>> I don't like this patch. It adds extra complexity and its usefulness is
>>> highly questionable.
>>> There are a number of caches in the system, and if all of them added
>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>> impossible-to-debug unpredictable performance issues.
>>>
>>> We should let the memory subsystem decide when to reclaim pages from
>>> caches and what caches to reclaim them from.
>> Yeah, artificially limiting your cache from growing when your shrinker
>> gets called will just break the equal-memory pressure the core mm uses to
>> rebalance between all caches when workload changes. In i915 we let
>> everything grow without artificial bounds and only rely upon the shrinker
>> callbacks to ensure we don't consume more than our fair share of available
>> memory overall.
>> -Daniel
>
> Now when you bring i915 memory usage up, Daniel,
> I can't refrain from bringing up the old user-space unreclaimable kernel
> memory issue, for which gem open is a good example ;) Each time
> user-space opens a gem handle, some un-reclaimable kernel memory is
> allocated, for which there is no accounting, so theoretically I think a
> user can bring a system to unusability this way.
>
> Typically there are various limits on unreclaimable objects like this,
> like open file descriptors, and IIRC the kernel even has an internal
> limit on the number of struct files you initialize, based on the
> available system memory, so dma-buf / prime should already have some
> sort of protection.

gem->filp points to a fresh shmem file, which itself is limited like
dmabuf. That should suffice, right?

Thanks
David
Thomas Hellstrom Aug. 13, 2014, 12:48 p.m. UTC | #6
On 08/13/2014 02:40 PM, David Herrmann wrote:
> Hi
>
> On Wed, Aug 13, 2014 at 2:35 PM, Thomas Hellstrom <thellstrom@vmware.com> wrote:
>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>
>>>>> When experiencing memory pressure we want to minimize pool size so that
>>>>> memory we just shrinked is not added back again just as the next thing.
>>>>>
>>>>> This will divide by 2 the maximum pool size for each device each time
>>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>>> happen after one second since the last shrink. The one second delay is
>>>>> obviously an arbitrary choice.
>>>> Jérôme,
>>>>
>>>> I don't like this patch. It adds extra complexity and its usefulness is
>>>> highly questionable.
>>>> There are a number of caches in the system, and if all of them added
>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>> impossible-to-debug unpredictable performance issues.
>>>>
>>>> We should let the memory subsystem decide when to reclaim pages from
>>>> caches and what caches to reclaim them from.
>>> Yeah, artificially limiting your cache from growing when your shrinker
>>> gets called will just break the equal-memory pressure the core mm uses to
>>> rebalance between all caches when workload changes. In i915 we let
>>> everything grow without artificial bounds and only rely upon the shrinker
>>> callbacks to ensure we don't consume more than our fair share of available
>>> memory overall.
>>> -Daniel
>> Now when you bring i915 memory usage up, Daniel,
>> I can't refrain from bringing up the old user-space unreclaimable kernel
>> memory issue, for which gem open is a good example ;) Each time
>> user-space opens a gem handle, some un-reclaimable kernel memory is
>> allocated, for which there is no accounting, so theoretically I think a
>> user can bring a system to unusability this way.
>>
>> Typically there are various limits on unreclaimable objects like this,
>> like open file descriptors, and IIRC the kernel even has an internal
>> limit on the number of struct files you initialize, based on the
>> available system memory, so dma-buf / prime should already have some
>> sort of protection.
> gem->filp points to a fresh shmem file, which itself is limited like
> dmabuf. That should suffice, right?
>
> Thanks
> David
I'm thinking of situations where you have a gem name and open a new
handle. It allocates a new unaccounted idr object. Admittedly you'd have
to open a hell of a lot of new handles to stress the system, but that's
an example of the situation I'm thinking of. Similarly perhaps if you
create a gem handle from a prime file-descriptor but I haven't looked at
that code in detail.

Thanks

/Thomas
Daniel Vetter Aug. 13, 2014, 1:01 p.m. UTC | #7
On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> > On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> >> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> >>> From: Jérôme Glisse <jglisse@redhat.com>
> >>>
> >>> When experiencing memory pressure we want to minimize pool size so that
> >>> memory we just shrinked is not added back again just as the next thing.
> >>>
> >>> This will divide by 2 the maximum pool size for each device each time
> >>> the pool have to shrink. The limit is bumped again is next allocation
> >>> happen after one second since the last shrink. The one second delay is
> >>> obviously an arbitrary choice.
> >> Jérôme,
> >>
> >> I don't like this patch. It adds extra complexity and its usefulness is
> >> highly questionable.
> >> There are a number of caches in the system, and if all of them added
> >> some sort of voluntary shrink heuristics like this, we'd end up with
> >> impossible-to-debug unpredictable performance issues.
> >>
> >> We should let the memory subsystem decide when to reclaim pages from
> >> caches and what caches to reclaim them from.
> > Yeah, artificially limiting your cache from growing when your shrinker
> > gets called will just break the equal-memory pressure the core mm uses to
> > rebalance between all caches when workload changes. In i915 we let
> > everything grow without artificial bounds and only rely upon the shrinker
> > callbacks to ensure we don't consume more than our fair share of available
> > memory overall.
> > -Daniel
> 
> Now when you bring i915 memory usage up, Daniel,
> I can't refrain from bringing up the old user-space unreclaimable kernel
> memory issue, for which gem open is a good example ;) Each time
> user-space opens a gem handle, some un-reclaimable kernel memory is
> allocated, for which there is no accounting, so theoretically I think a
> user can bring a system to unusability this way.
> 
> Typically there are various limits on unreclaimable objects like this,
> like open file descriptors, and IIRC the kernel even has an internal
> limit on the number of struct files you initialize, based on the
> available system memory, so dma-buf / prime should already have some
> sort of protection.

Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
so there's not really a way to isolate gpu memory usage in a sane way for
specific processes. But there's also zero limits on actual gpu usage
itself (timeslices or whatever) so I guess no one asked for this yet.

My comment really was about balancing mm users under the assumption that
they're all unlimited.
-Daniel
Oded Gabbay Aug. 13, 2014, 2:09 p.m. UTC | #8
On 13/08/14 16:01, Daniel Vetter wrote:
> On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>
>>>>> When experiencing memory pressure we want to minimize pool size so that
>>>>> memory we just shrinked is not added back again just as the next thing.
>>>>>
>>>>> This will divide by 2 the maximum pool size for each device each time
>>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>>> happen after one second since the last shrink. The one second delay is
>>>>> obviously an arbitrary choice.
>>>> Jérôme,
>>>>
>>>> I don't like this patch. It adds extra complexity and its usefulness is
>>>> highly questionable.
>>>> There are a number of caches in the system, and if all of them added
>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>> impossible-to-debug unpredictable performance issues.
>>>>
>>>> We should let the memory subsystem decide when to reclaim pages from
>>>> caches and what caches to reclaim them from.
>>> Yeah, artificially limiting your cache from growing when your shrinker
>>> gets called will just break the equal-memory pressure the core mm uses to
>>> rebalance between all caches when workload changes. In i915 we let
>>> everything grow without artificial bounds and only rely upon the shrinker
>>> callbacks to ensure we don't consume more than our fair share of available
>>> memory overall.
>>> -Daniel
>>
>> Now when you bring i915 memory usage up, Daniel,
>> I can't refrain from bringing up the old user-space unreclaimable kernel
>> memory issue, for which gem open is a good example ;) Each time
>> user-space opens a gem handle, some un-reclaimable kernel memory is
>> allocated, for which there is no accounting, so theoretically I think a
>> user can bring a system to unusability this way.
>>
>> Typically there are various limits on unreclaimable objects like this,
>> like open file descriptors, and IIRC the kernel even has an internal
>> limit on the number of struct files you initialize, based on the
>> available system memory, so dma-buf / prime should already have some
>> sort of protection.
>
> Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> so there's not really a way to isolate gpu memory usage in a sane way for
> specific processes. But there's also zero limits on actual gpu usage
> itself (timeslices or whatever) so I guess no one asked for this yet.
>
> My comment really was about balancing mm users under the assumption that
> they're all unlimited.
> -Daniel
>
I think the point you brought up becomes very important for compute (HSA) 
processes. I still don't know how to distinguish between legitimate use of GPU 
local memory and misbehaving/malicious processes.

We have a requirement that HSA processes will be allowed to allocate and pin GPU 
local memory. They do it through an ioctl.
In the kernel driver, we have an accounting of those memory allocations, meaning 
that I can print a list of all the objects that were allocated by a certain 
process, per device.
Therefore, in theory, I can reclaim any object, but that will probably break the 
userspace app. If the app is misbehaving/malicious then that's ok, I guess. But 
how do I know that? And what prevents that malicious app from re-spawning and 
doing the same allocation again?

	Oded
Thomas Hellstrom Aug. 13, 2014, 3:13 p.m. UTC | #9
On 08/13/2014 03:01 PM, Daniel Vetter wrote:
> On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>
>>>>> When experiencing memory pressure we want to minimize pool size so that
>>>>> memory we just shrinked is not added back again just as the next thing.
>>>>>
>>>>> This will divide by 2 the maximum pool size for each device each time
>>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>>> happen after one second since the last shrink. The one second delay is
>>>>> obviously an arbitrary choice.
>>>> Jérôme,
>>>>
>>>> I don't like this patch. It adds extra complexity and its usefulness is
>>>> highly questionable.
>>>> There are a number of caches in the system, and if all of them added
>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>> impossible-to-debug unpredictable performance issues.
>>>>
>>>> We should let the memory subsystem decide when to reclaim pages from
>>>> caches and what caches to reclaim them from.
>>> Yeah, artificially limiting your cache from growing when your shrinker
>>> gets called will just break the equal-memory pressure the core mm uses to
>>> rebalance between all caches when workload changes. In i915 we let
>>> everything grow without artificial bounds and only rely upon the shrinker
>>> callbacks to ensure we don't consume more than our fair share of available
>>> memory overall.
>>> -Daniel
>> Now when you bring i915 memory usage up, Daniel,
>> I can't refrain from bringing up the old user-space unreclaimable kernel
>> memory issue, for which gem open is a good example ;) Each time
>> user-space opens a gem handle, some un-reclaimable kernel memory is
>> allocated, for which there is no accounting, so theoretically I think a
>> user can bring a system to unusability this way.
>>
>> Typically there are various limits on unreclaimable objects like this,
>> like open file descriptors, and IIRC the kernel even has an internal
>> limit on the number of struct files you initialize, based on the
>> available system memory, so dma-buf / prime should already have some
>> sort of protection.
> Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> so there's not really a way to isolate gpu memory usage in a sane way for
> specific processes. But there's also zero limits on actual gpu usage
> itself (timeslices or whatever) so I guess no one asked for this yet.

In its simplest form (like in TTM if correctly implemented by drivers)
this type of accounting stops non-privileged malicious GPU-users from
exhausting all system physical memory causing grief for other kernel
systems, but not from causing grief for other GPU users. I think that's
the minimum level that's intended, for example, for the struct file
accounting as well.

> My comment really was about balancing mm users under the assumption that
> they're all unlimited.

Yeah, sorry for stealing the thread. I usually bring this up now and
again but nowadays with an exponential backoff.


> -Daniel

Thomas
Thomas Hellstrom Aug. 13, 2014, 3:19 p.m. UTC | #10
On 08/13/2014 04:09 PM, Oded Gabbay wrote:
>
>
> On 13/08/14 16:01, Daniel Vetter wrote:
>> On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>>
>>>>>> When experiencing memory pressure we want to minimize pool size
>>>>>> so that
>>>>>> memory we just shrinked is not added back again just as the next
>>>>>> thing.
>>>>>>
>>>>>> This will divide by 2 the maximum pool size for each device each
>>>>>> time
>>>>>> the pool have to shrink. The limit is bumped again is next
>>>>>> allocation
>>>>>> happen after one second since the last shrink. The one second
>>>>>> delay is
>>>>>> obviously an arbitrary choice.
>>>>> Jérôme,
>>>>>
>>>>> I don't like this patch. It adds extra complexity and its
>>>>> usefulness is
>>>>> highly questionable.
>>>>> There are a number of caches in the system, and if all of them added
>>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>>> impossible-to-debug unpredictable performance issues.
>>>>>
>>>>> We should let the memory subsystem decide when to reclaim pages from
>>>>> caches and what caches to reclaim them from.
>>>> Yeah, artificially limiting your cache from growing when your shrinker
>>>> gets called will just break the equal-memory pressure the core mm
>>>> uses to
>>>> rebalance between all caches when workload changes. In i915 we let
>>>> everything grow without artificial bounds and only rely upon the
>>>> shrinker
>>>> callbacks to ensure we don't consume more than our fair share of
>>>> available
>>>> memory overall.
>>>> -Daniel
>>>
>>> Now when you bring i915 memory usage up, Daniel,
>>> I can't refrain from bringing up the old user-space unreclaimable
>>> kernel
>>> memory issue, for which gem open is a good example ;) Each time
>>> user-space opens a gem handle, some un-reclaimable kernel memory is
>>> allocated, for which there is no accounting, so theoretically I think a
>>> user can bring a system to unusability this way.
>>>
>>> Typically there are various limits on unreclaimable objects like this,
>>> like open file descriptors, and IIRC the kernel even has an internal
>>> limit on the number of struct files you initialize, based on the
>>> available system memory, so dma-buf / prime should already have some
>>> sort of protection.
>>
>> Oh yeah, we have zero cgroups limits or similar stuff for gem
>> allocations,
>> so there's not really a way to isolate gpu memory usage in a sane way
>> for
>> specific processes. But there's also zero limits on actual gpu usage
>> itself (timeslices or whatever) so I guess no one asked for this yet.
>>
>> My comment really was about balancing mm users under the assumption that
>> they're all unlimited.
>> -Daniel
>>
> I think the point you brought up becomes very important for compute
> (HSA) processes. I still don't know how to distinguish between
> legitimate use of GPU local memory and misbehaving/malicious processes.
>
> We have a requirement that HSA processes will be allowed to allocate
> and pin GPU local memory. They do it through an ioctl.
> In the kernel driver, we have an accounting of those memory
> allocations, meaning that I can print a list of all the objects that
> were allocated by a certain process, per device.
> Therefore, in theory, I can reclaim any object, but that will probably
> break the userspace app. If the app is misbehaving/malicious than
> that's ok, I guess. But how do I know that ? And what prevents that
> malicious app to re-spawn and do the same allocation again ?

If you have per-process accounting of all those memory allocations and
you need to reclaim and there's no nice way of doing it, you should
probably do it like the kernel OOM killer: You simply kill the process
that is most likely to bring back most memory (or use any other
heuristic). Typically the kernel OOM killer does that when all swap
space is exhausted, probably assuming that the process that uses most
swap space is most likely to be malicious, if there are any malicious
processes.

If the process respawns, and you run out of resources again, repeat the
process.

/Thomas


>
>     Oded
Daniel Vetter Aug. 13, 2014, 4:24 p.m. UTC | #11
On Wed, Aug 13, 2014 at 05:13:56PM +0200, Thomas Hellstrom wrote:
> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
> > On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
> >> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> >>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> >>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> >>>>> From: Jérôme Glisse <jglisse@redhat.com>
> >>>>>
> >>>>> When experiencing memory pressure we want to minimize pool size so that
> >>>>> memory we just shrinked is not added back again just as the next thing.
> >>>>>
> >>>>> This will divide by 2 the maximum pool size for each device each time
> >>>>> the pool have to shrink. The limit is bumped again is next allocation
> >>>>> happen after one second since the last shrink. The one second delay is
> >>>>> obviously an arbitrary choice.
> >>>> Jérôme,
> >>>>
> >>>> I don't like this patch. It adds extra complexity and its usefulness is
> >>>> highly questionable.
> >>>> There are a number of caches in the system, and if all of them added
> >>>> some sort of voluntary shrink heuristics like this, we'd end up with
> >>>> impossible-to-debug unpredictable performance issues.
> >>>>
> >>>> We should let the memory subsystem decide when to reclaim pages from
> >>>> caches and what caches to reclaim them from.
> >>> Yeah, artificially limiting your cache from growing when your shrinker
> >>> gets called will just break the equal-memory pressure the core mm uses to
> >>> rebalance between all caches when workload changes. In i915 we let
> >>> everything grow without artificial bounds and only rely upon the shrinker
> >>> callbacks to ensure we don't consume more than our fair share of available
> >>> memory overall.
> >>> -Daniel
> >> Now when you bring i915 memory usage up, Daniel,
> >> I can't refrain from bringing up the old user-space unreclaimable kernel
> >> memory issue, for which gem open is a good example ;) Each time
> >> user-space opens a gem handle, some un-reclaimable kernel memory is
> >> allocated, for which there is no accounting, so theoretically I think a
> >> user can bring a system to unusability this way.
> >>
> >> Typically there are various limits on unreclaimable objects like this,
> >> like open file descriptors, and IIRC the kernel even has an internal
> >> limit on the number of struct files you initialize, based on the
> >> available system memory, so dma-buf / prime should already have some
> >> sort of protection.
> > Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> > so there's not really a way to isolate gpu memory usage in a sane way for
> > specific processes. But there's also zero limits on actual gpu usage
> > itself (timeslices or whatever) so I guess no one asked for this yet.
> 
> In its simplest form (like in TTM if correctly implemented by drivers)
> this type of accounting stops non-privileged malicious GPU-users from
> exhausting all system physical memory causing grief for other kernel
> systems but not from causing grief for other GPU users. I think that's
> the minimum level that's intended also for example also for the struct
> file accounting.

I think in i915 we're fairly close on that minimal standard - interactions
with shrinkers and oom logic work decently. It starts to fall apart though
when we've actually run out of memory - if the real memory hog is a gpu
process the oom killer won't notice all that memory since it's not
accounted against processes correctly.

I don't agree that gpu processes should be punished in general compared to
other subsystems in the kernel. If the user wants to use 90% of all memory
for gpu tasks then I want to make that possible, even if it means that
everything else thrashes horribly, as long as the system recovers and
rebalances after that gpu memory hog is gone. IIRC ttm currently has a
fairly arbitrary (tunable) setting to limit system memory consumption, but
I might be wrong on that.

> > My comment really was about balancing mm users under the assumption that
> > they're all unlimited.
> 
> Yeah, sorry for stealing the thread. I usually bring this up now and
> again but nowadays with an exponential backoff.

Oh I'd love to see some cgroups or similar tracking so that server users
could set sane per-process/user/task limits on how much memory/gpu time
that group is allowed to consume. It's just that I haven't seen real
demand for this and so couldn't make the time available to implement it.
So thus far my goal is to make everything work nicely for unlimited tasks
right up to the point where the OOM killer needs to step in. Past that
everything starts to fall apart, but thus far that was good enough for
desktop usage.

Maybe WebGL will finally make this important enough so that we can fix it
for real ...
-Daniel
Daniel Vetter Aug. 13, 2014, 4:30 p.m. UTC | #12
On Wed, Aug 13, 2014 at 05:09:49PM +0300, Oded Gabbay wrote:
> 
> 
> On 13/08/14 16:01, Daniel Vetter wrote:
> >On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
> >>On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> >>>On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> >>>>On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> >>>>>From: Jérôme Glisse <jglisse@redhat.com>
> >>>>>
> >>>>>When experiencing memory pressure we want to minimize pool size so that
> >>>>>memory we just shrinked is not added back again just as the next thing.
> >>>>>
> >>>>>This will divide by 2 the maximum pool size for each device each time
> >>>>>the pool have to shrink. The limit is bumped again is next allocation
> >>>>>happen after one second since the last shrink. The one second delay is
> >>>>>obviously an arbitrary choice.
> >>>>Jérôme,
> >>>>
> >>>>I don't like this patch. It adds extra complexity and its usefulness is
> >>>>highly questionable.
> >>>>There are a number of caches in the system, and if all of them added
> >>>>some sort of voluntary shrink heuristics like this, we'd end up with
> >>>>impossible-to-debug unpredictable performance issues.
> >>>>
> >>>>We should let the memory subsystem decide when to reclaim pages from
> >>>>caches and what caches to reclaim them from.
> >>>Yeah, artificially limiting your cache from growing when your shrinker
> >>>gets called will just break the equal-memory pressure the core mm uses to
> >>>rebalance between all caches when workload changes. In i915 we let
> >>>everything grow without artificial bounds and only rely upon the shrinker
> >>>callbacks to ensure we don't consume more than our fair share of available
> >>>memory overall.
> >>>-Daniel
> >>
> >>Now when you bring i915 memory usage up, Daniel,
> >>I can't refrain from bringing up the old user-space unreclaimable kernel
> >>memory issue, for which gem open is a good example ;) Each time
> >>user-space opens a gem handle, some un-reclaimable kernel memory is
> >>allocated, for which there is no accounting, so theoretically I think a
> >>user can bring a system to unusability this way.
> >>
> >>Typically there are various limits on unreclaimable objects like this,
> >>like open file descriptors, and IIRC the kernel even has an internal
> >>limit on the number of struct files you initialize, based on the
> >>available system memory, so dma-buf / prime should already have some
> >>sort of protection.
> >
> >Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> >so there's not really a way to isolate gpu memory usage in a sane way for
> >specific processes. But there's also zero limits on actual gpu usage
> >itself (timeslices or whatever) so I guess no one asked for this yet.
> >
> >My comment really was about balancing mm users under the assumption that
> >they're all unlimited.
> >-Daniel
> >
> I think the point you brought up becomes very important for compute (HSA)
> processes. I still don't know how to distinguish between legitimate use of
> GPU local memory and misbehaving/malicious processes.
> 
> We have a requirement that HSA processes will be allowed to allocate and pin
> GPU local memory. They do it through an ioctl.
> In the kernel driver, we have an accounting of those memory allocations,
> meaning that I can print a list of all the objects that were allocated by a
> certain process, per device.
> Therefore, in theory, I can reclaim any object, but that will probably break
> the userspace app. If the app is misbehaving/malicious than that's ok, I
> guess. But how do I know that ? And what prevents that malicious app to
> re-spawn and do the same allocation again ?

You can't do that in the kernel; these are policy decisions, which are
userspace's job. What we need to enable instead is proper tracking of
memory allocations so that memory limits can be set with cgroups. With SVM
you get that for free. Without SVM we need some work in that area since
currently the memory accounting for gem/ttm drivers is broken.

The other bit is limits for wasting gpu time, and I guess for that we want
a new gpu time cgroup system so that users can set soft/hard limits for
different gpgpu tasks on servers.
-Daniel
Alex Deucher Aug. 13, 2014, 4:30 p.m. UTC | #13
On Wed, Aug 13, 2014 at 12:24 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Wed, Aug 13, 2014 at 05:13:56PM +0200, Thomas Hellstrom wrote:
>> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
>> > On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>> >> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>> >>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>> >>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>> >>>>> From: Jérôme Glisse <jglisse@redhat.com>
>> >>>>>
>> >>>>> When experiencing memory pressure we want to minimize pool size so that
>> >>>>> memory we just shrinked is not added back again just as the next thing.
>> >>>>>
>> >>>>> This will divide by 2 the maximum pool size for each device each time
>> >>>>> the pool have to shrink. The limit is bumped again is next allocation
>> >>>>> happen after one second since the last shrink. The one second delay is
>> >>>>> obviously an arbitrary choice.
>> >>>> Jérôme,
>> >>>>
>> >>>> I don't like this patch. It adds extra complexity and its usefulness is
>> >>>> highly questionable.
>> >>>> There are a number of caches in the system, and if all of them added
>> >>>> some sort of voluntary shrink heuristics like this, we'd end up with
>> >>>> impossible-to-debug unpredictable performance issues.
>> >>>>
>> >>>> We should let the memory subsystem decide when to reclaim pages from
>> >>>> caches and what caches to reclaim them from.
>> >>> Yeah, artificially limiting your cache from growing when your shrinker
>> >>> gets called will just break the equal-memory pressure the core mm uses to
>> >>> rebalance between all caches when workload changes. In i915 we let
>> >>> everything grow without artificial bounds and only rely upon the shrinker
>> >>> callbacks to ensure we don't consume more than our fair share of available
>> >>> memory overall.
>> >>> -Daniel
>> >> Now when you bring i915 memory usage up, Daniel,
>> >> I can't refrain from bringing up the old user-space unreclaimable kernel
>> >> memory issue, for which gem open is a good example ;) Each time
>> >> user-space opens a gem handle, some un-reclaimable kernel memory is
>> >> allocated, for which there is no accounting, so theoretically I think a
>> >> user can bring a system to unusability this way.
>> >>
>> >> Typically there are various limits on unreclaimable objects like this,
>> >> like open file descriptors, and IIRC the kernel even has an internal
>> >> limit on the number of struct files you initialize, based on the
>> >> available system memory, so dma-buf / prime should already have some
>> >> sort of protection.
>> > Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
>> > so there's not really a way to isolate gpu memory usage in a sane way for
>> > specific processes. But there's also zero limits on actual gpu usage
>> > itself (timeslices or whatever) so I guess no one asked for this yet.
>>
>> In its simplest form (like in TTM if correctly implemented by drivers)
>> this type of accounting stops non-privileged malicious GPU-users from
>> exhausting all system physical memory causing grief for other kernel
>> systems but not from causing grief for other GPU users. I think that's
>> the minimum level that's intended also for example also for the struct
>> file accounting.
>
> I think in i915 we're fairly close on that minimal standard - interactions
> with shrinkers and oom logic work decently. It starts to fall apart though
> when we've actually run out of memory - if the real memory hog is a gpu
> process the oom killer won't notice all that memory since it's not
> accounted against processes correctly.
>
> I don't agree that gpu process should be punished in general compared to
> other subsystems in the kernel. If the user wants to use 90% of all memory
> for gpu tasks then I want to make that possible, even if it means that
> everything else thrashes horribly. And as long as the system recovers and
> rebalances after that gpu memory hog is gone ofc. Iirc ttm currently has a
> fairly arbitrary (tunable) setting to limit system memory consumption, but
> I might be wrong on that.

Yes, it currently limits you to half of memory, but at least we would
like to make it tuneable since there are a lot of use cases where the
user wants to use 90% of memory for GPU tasks at the expense of
everything else.

Alex

>
>> > My comment really was about balancing mm users under the assumption that
>> > they're all unlimited.
>>
>> Yeah, sorry for stealing the thread. I usually bring this up now and
>> again but nowadays with an exponential backoff.
>
> Oh I'd love to see some cgroups or similar tracking so that server users
> could set sane per-process/user/task limits on how much memory/gpu time
> that group is allowed to consume. It's just that I haven't seen real
> demand for this and so couldn't make the time available to implement it.
> So thus far my goal is to make everything work nicely for unlimited tasks
> right up to the point where the OOM killer needs to step in. Past that
> everything starts to fall apart, but thus far that was good enough for
> desktop usage.
>
> Maybe WebGL will finally make this important enough so that we can fix it
> for real ...
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
Daniel Vetter Aug. 13, 2014, 4:38 p.m. UTC | #14
On Wed, Aug 13, 2014 at 12:30:45PM -0400, Alex Deucher wrote:
> On Wed, Aug 13, 2014 at 12:24 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Wed, Aug 13, 2014 at 05:13:56PM +0200, Thomas Hellstrom wrote:
> >> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
> >> > On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
> >> >> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> >> >>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> >> >>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> >> >>>>> From: Jérôme Glisse <jglisse@redhat.com>
> >> >>>>>
> >> >>>>> When experiencing memory pressure we want to minimize pool size so that
> >> >>>>> memory we just shrinked is not added back again just as the next thing.
> >> >>>>>
> >> >>>>> This will divide by 2 the maximum pool size for each device each time
> >> >>>>> the pool have to shrink. The limit is bumped again is next allocation
> >> >>>>> happen after one second since the last shrink. The one second delay is
> >> >>>>> obviously an arbitrary choice.
> >> >>>> Jérôme,
> >> >>>>
> >> >>>> I don't like this patch. It adds extra complexity and its usefulness is
> >> >>>> highly questionable.
> >> >>>> There are a number of caches in the system, and if all of them added
> >> >>>> some sort of voluntary shrink heuristics like this, we'd end up with
> >> >>>> impossible-to-debug unpredictable performance issues.
> >> >>>>
> >> >>>> We should let the memory subsystem decide when to reclaim pages from
> >> >>>> caches and what caches to reclaim them from.
> >> >>> Yeah, artificially limiting your cache from growing when your shrinker
> >> >>> gets called will just break the equal-memory pressure the core mm uses to
> >> >>> rebalance between all caches when workload changes. In i915 we let
> >> >>> everything grow without artificial bounds and only rely upon the shrinker
> >> >>> callbacks to ensure we don't consume more than our fair share of available
> >> >>> memory overall.
> >> >>> -Daniel
> >> >> Now when you bring i915 memory usage up, Daniel,
> >> >> I can't refrain from bringing up the old user-space unreclaimable kernel
> >> >> memory issue, for which gem open is a good example ;) Each time
> >> >> user-space opens a gem handle, some un-reclaimable kernel memory is
> >> >> allocated, for which there is no accounting, so theoretically I think a
> >> >> user can bring a system to unusability this way.
> >> >>
> >> >> Typically there are various limits on unreclaimable objects like this,
> >> >> like open file descriptors, and IIRC the kernel even has an internal
> >> >> limit on the number of struct files you initialize, based on the
> >> >> available system memory, so dma-buf / prime should already have some
> >> >> sort of protection.
> >> > Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> >> > so there's not really a way to isolate gpu memory usage in a sane way for
> >> > specific processes. But there's also zero limits on actual gpu usage
> >> > itself (timeslices or whatever) so I guess no one asked for this yet.
> >>
> >> In its simplest form (like in TTM if correctly implemented by drivers)
> >> this type of accounting stops non-privileged malicious GPU-users from
> >> exhausting all system physical memory causing grief for other kernel
> >> systems but not from causing grief for other GPU users. I think that's
> >> the minimum level that's intended also for example also for the struct
> >> file accounting.
> >
> > I think in i915 we're fairly close on that minimal standard - interactions
> > with shrinkers and oom logic work decently. It starts to fall apart though
> > when we've actually run out of memory - if the real memory hog is a gpu
> > process the oom killer won't notice all that memory since it's not
> > accounted against processes correctly.
> >
> > I don't agree that gpu process should be punished in general compared to
> > other subsystems in the kernel. If the user wants to use 90% of all memory
> > for gpu tasks then I want to make that possible, even if it means that
> > everything else thrashes horribly. And as long as the system recovers and
> > rebalances after that gpu memory hog is gone ofc. Iirc ttm currently has a
> > fairly arbitrary (tunable) setting to limit system memory consumption, but
> > I might be wrong on that.
> 
> Yes, it currently limits you to half of memory, but at least we would
> like to make it tuneable since there are a lot of user cases where the
> user wants to use 90% of memory for GPU tasks at the expense of
> everything else.

IME a lot of fun stuff starts to happen when you go there. We have piles
of memory-thrashing testcases and have generally had lots of fun with our
shrinker, so I think until you've really beaten on those paths in
ttm+radeon I'd keep the limit where it is.
-Daniel
Daniel Vetter Aug. 13, 2014, 4:45 p.m. UTC | #15
On Wed, Aug 13, 2014 at 6:38 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> Yes, it currently limits you to half of memory, but at least we would
>> like to make it tuneable since there are a lot of user cases where the
>> user wants to use 90% of memory for GPU tasks at the expense of
>> everything else.
>
> Ime a lot of fun stuff starts to happen when you go there. We have piles
> of memory thrashing testcases and generally had lots of fun with our
> shrinker, so I think until you've really beaten onto those paths in
> ttm+radeon I'd keep the limit where it is.

One example that already bites if you go above 50%: by default the dirty
pagecache is limited to 40% of memory. Above that you start
to stall in writeback, but gpus are really fast at re-dirtying memory.
So we've seen cases where the core mm OOMed with essentially 90% of
memory on writeback and piles of free swap. Waiting a few seconds for
the SSD to catch up would have gotten it out of that tight spot
without killing any process. One side-effect of such fun is that
memory allocations start to fail in really interesting places, and you
need to pile in hacks to make it all a bit more synchronous to avoid
the core mm freaking out.
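
(If you want to see where those writeback thresholds sit on a given box,
the quick sketch below will do. vm.dirty_ratio / vm.dirty_background_ratio
are the usual knobs behind the behaviour I'm describing, assuming nobody
has switched the system over to the *_bytes variants.)

#include <stdio.h>

/* Print the writeback thresholds as a percentage of reclaimable memory. */
static void show(const char *path)
{
	FILE *f = fopen(path, "r");
	int pct;

	if (f && fscanf(f, "%d", &pct) == 1)
		printf("%s = %d%%\n", path, pct);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/proc/sys/vm/dirty_ratio");		/* writers start stalling here */
	show("/proc/sys/vm/dirty_background_ratio");	/* background writeback kicks in here */
	return 0;
}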
-Daniel
Thomas Hellstrom Aug. 13, 2014, 5:20 p.m. UTC | #16
On 08/13/2014 06:24 PM, Daniel Vetter wrote:
> On Wed, Aug 13, 2014 at 05:13:56PM +0200, Thomas Hellstrom wrote:
>> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
>>> On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>>>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>>>
>>>>>>> When experiencing memory pressure we want to minimize pool size so that
>>>>>>> memory we just shrinked is not added back again just as the next thing.
>>>>>>>
>>>>>>> This will divide by 2 the maximum pool size for each device each time
>>>>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>>>>> happen after one second since the last shrink. The one second delay is
>>>>>>> obviously an arbitrary choice.
>>>>>> Jérôme,
>>>>>>
>>>>>> I don't like this patch. It adds extra complexity and its usefulness is
>>>>>> highly questionable.
>>>>>> There are a number of caches in the system, and if all of them added
>>>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>>>> impossible-to-debug unpredictable performance issues.
>>>>>>
>>>>>> We should let the memory subsystem decide when to reclaim pages from
>>>>>> caches and what caches to reclaim them from.
>>>>> Yeah, artificially limiting your cache from growing when your shrinker
>>>>> gets called will just break the equal-memory pressure the core mm uses to
>>>>> rebalance between all caches when workload changes. In i915 we let
>>>>> everything grow without artificial bounds and only rely upon the shrinker
>>>>> callbacks to ensure we don't consume more than our fair share of available
>>>>> memory overall.
>>>>> -Daniel
>>>> Now when you bring i915 memory usage up, Daniel,
>>>> I can't refrain from bringing up the old user-space unreclaimable kernel
>>>> memory issue, for which gem open is a good example ;) Each time
>>>> user-space opens a gem handle, some un-reclaimable kernel memory is
>>>> allocated, for which there is no accounting, so theoretically I think a
>>>> user can bring a system to unusability this way.
>>>>
>>>> Typically there are various limits on unreclaimable objects like this,
>>>> like open file descriptors, and IIRC the kernel even has an internal
>>>> limit on the number of struct files you initialize, based on the
>>>> available system memory, so dma-buf / prime should already have some
>>>> sort of protection.
>>> Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
>>> so there's not really a way to isolate gpu memory usage in a sane way for
>>> specific processes. But there's also zero limits on actual gpu usage
>>> itself (timeslices or whatever) so I guess no one asked for this yet.
>> In its simplest form (like in TTM if correctly implemented by drivers)
>> this type of accounting stops non-privileged malicious GPU-users from
>> exhausting all system physical memory causing grief for other kernel
>> systems but not from causing grief for other GPU users. I think that's
>> the minimum level that's intended also for example also for the struct
>> file accounting.
> I think in i915 we're fairly close on that minimal standard - interactions
> with shrinkers and oom logic work decently. It starts to fall apart though
> when we've actually run out of memory - if the real memory hog is a gpu
> process the oom killer won't notice all that memory since it's not
> accounted against processes correctly.
>
> I don't agree that gpu process should be punished in general compared to
> other subsystems in the kernel. If the user wants to use 90% of all memory
> for gpu tasks then I want to make that possible, even if it means that
> everything else thrashes horribly. And as long as the system recovers and
> rebalances after that gpu memory hog is gone ofc. Iirc ttm currently has a
> fairly arbitrary (tunable) setting to limit system memory consumption, but
> I might be wrong on that.

No, that's correct, or rather it's intended to limit pinned
unreclaimable system memory (though part of what's unreclaimable could
actually be made reclaimable if we'd implement another shrinker level).

>>> My comment really was about balancing mm users under the assumption that
>>> they're all unlimited.
>> Yeah, sorry for stealing the thread. I usually bring this up now and
>> again but nowadays with an exponential backoff.
> Oh I'd love to see some cgroups or similar tracking so that server users
> could set sane per-process/user/task limits on how much memory/gpu time
> that group is allowed to consume. It's just that I haven't seen real
> demand for this and so couldn't make the time available to implement it.
> So thus far my goal is to make everything work nicely for unlimited tasks
> right up to the point where the OOM killer needs to step in. Past that
> everything starts to fall apart, but thus far that was good enough for
> desktop usage.

Well I'm not sure if things have changed but last time (a couple of
years ago) I looked at this situation (kernel out of physical memory but
a fair amount of swap space left) the OOM killer was never invoked, so a
number of more or less critical kernel systems (disk I/O, paging,
networking) were getting -ENOMEM and hitting rarely tested error paths.
A state you don't want to have the kernel in. Now the OOM algorithm may
of course have changed since then.

My point is that with unaccounted constructs like gem-open-from-name it
should be easy for any unprivileged authenticated gem client to pin all
kernel physical memory, put the kernel in that state and keep it there,
and IMO a kernel-user space interface shouldn't allow that.

/Thomas

>
> Maybe WebGL will finally make this important enough so that we can fix it
> for real ...
> -Daniel
Thomas Hellstrom Aug. 13, 2014, 5:38 p.m. UTC | #17
On 08/13/2014 06:30 PM, Alex Deucher wrote:
> On Wed, Aug 13, 2014 at 12:24 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> On Wed, Aug 13, 2014 at 05:13:56PM +0200, Thomas Hellstrom wrote:
>>> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
>>>> On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
>>>>> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
>>>>>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
>>>>>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
>>>>>>>> From: Jérôme Glisse <jglisse@redhat.com>
>>>>>>>>
>>>>>>>> When experiencing memory pressure we want to minimize pool size so that
>>>>>>>> memory we just shrinked is not added back again just as the next thing.
>>>>>>>>
>>>>>>>> This will divide by 2 the maximum pool size for each device each time
>>>>>>>> the pool have to shrink. The limit is bumped again is next allocation
>>>>>>>> happen after one second since the last shrink. The one second delay is
>>>>>>>> obviously an arbitrary choice.
>>>>>>> Jérôme,
>>>>>>>
>>>>>>> I don't like this patch. It adds extra complexity and its usefulness is
>>>>>>> highly questionable.
>>>>>>> There are a number of caches in the system, and if all of them added
>>>>>>> some sort of voluntary shrink heuristics like this, we'd end up with
>>>>>>> impossible-to-debug unpredictable performance issues.
>>>>>>>
>>>>>>> We should let the memory subsystem decide when to reclaim pages from
>>>>>>> caches and what caches to reclaim them from.
>>>>>> Yeah, artificially limiting your cache from growing when your shrinker
>>>>>> gets called will just break the equal-memory pressure the core mm uses to
>>>>>> rebalance between all caches when workload changes. In i915 we let
>>>>>> everything grow without artificial bounds and only rely upon the shrinker
>>>>>> callbacks to ensure we don't consume more than our fair share of available
>>>>>> memory overall.
>>>>>> -Daniel
>>>>> Now when you bring i915 memory usage up, Daniel,
>>>>> I can't refrain from bringing up the old user-space unreclaimable kernel
>>>>> memory issue, for which gem open is a good example ;) Each time
>>>>> user-space opens a gem handle, some un-reclaimable kernel memory is
>>>>> allocated, for which there is no accounting, so theoretically I think a
>>>>> user can bring a system to unusability this way.
>>>>>
>>>>> Typically there are various limits on unreclaimable objects like this,
>>>>> like open file descriptors, and IIRC the kernel even has an internal
>>>>> limit on the number of struct files you initialize, based on the
>>>>> available system memory, so dma-buf / prime should already have some
>>>>> sort of protection.
>>>> Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
>>>> so there's not really a way to isolate gpu memory usage in a sane way for
>>>> specific processes. But there's also zero limits on actual gpu usage
>>>> itself (timeslices or whatever) so I guess no one asked for this yet.
>>> In its simplest form (like in TTM if correctly implemented by drivers)
>>> this type of accounting stops non-privileged malicious GPU-users from
>>> exhausting all system physical memory causing grief for other kernel
>>> systems but not from causing grief for other GPU users. I think that's
>>> the minimum level that's intended also for example also for the struct
>>> file accounting.
>> I think in i915 we're fairly close on that minimal standard - interactions
>> with shrinkers and oom logic work decently. It starts to fall apart though
>> when we've actually run out of memory - if the real memory hog is a gpu
>> process the oom killer won't notice all that memory since it's not
>> accounted against processes correctly.
>>
>> I don't agree that gpu process should be punished in general compared to
>> other subsystems in the kernel. If the user wants to use 90% of all memory
>> for gpu tasks then I want to make that possible, even if it means that
>> everything else thrashes horribly. And as long as the system recovers and
>> rebalances after that gpu memory hog is gone ofc. Iirc ttm currently has a
>> fairly arbitrary (tunable) setting to limit system memory consumption, but
>> I might be wrong on that.
> Yes, it currently limits you to half of memory, but at least we would
> like to make it tuneable since there are a lot of user cases where the
> user wants to use 90% of memory for GPU tasks at the expense of
> everything else.
>
> Alex
>

It's in /sys/devices/virtual/drm/ttm/memory_accounting/*

Run-time tunable, but you should probably write an app to tune it if you
want to hand this out to users, since if you raise the limit you probably
want to modify a number of related values; a rough sketch of such an app
follows the list below.

zone_memory: ro: Total memory in the zone.
used_memory: ro: Currently pinned memory.
available_memory: rw: Allocation limit.
emergency_memory: rw: Allocation limit for CAP_SYS_ADMIN.
swap_limit: rw: Swapper thread starts at this limit.
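
Something like the untested sketch below would do for such a tuning app.
Note two assumptions on my part that you should double-check against your
kernel: that the knobs live in a per-zone subdirectory (I use "kernel"
here) and that they take plain decimal values.

#include <stdio.h>
#include <stdlib.h>

#define ZONE_DIR "/sys/devices/virtual/drm/ttm/memory_accounting/kernel"

static unsigned long read_knob(const char *name)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", ZONE_DIR, name);
	f = fopen(path, "r");
	if (!f || fscanf(f, "%lu", &val) != 1)
		fprintf(stderr, "failed to read %s\n", path);
	if (f)
		fclose(f);
	return val;
}

static void write_knob(const char *name, unsigned long val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", ZONE_DIR, name);
	f = fopen(path, "w");
	if (!f || fprintf(f, "%lu\n", val) < 0)
		fprintf(stderr, "failed to write %s\n", path);
	if (f)
		fclose(f);
}

int main(int argc, char **argv)
{
	/* Fraction of the zone to allow for pinned allocations (default 90%). */
	unsigned long percent = argc > 1 ? strtoul(argv[1], NULL, 10) : 90;
	unsigned long zone = read_knob("zone_memory");
	unsigned long limit = zone * percent / 100;

	/* Scale the related limits together; the exact relationship between
	 * these knobs is an assumption here, not something stated above. */
	write_knob("swap_limit", limit - limit / 10);
	write_knob("available_memory", limit);
	write_knob("emergency_memory", limit + limit / 20);
	return 0;
}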

/Thomas
Jesse Barnes Aug. 14, 2014, 10:29 p.m. UTC | #18
On Wed, 13 Aug 2014 17:13:56 +0200
Thomas Hellstrom <thellstrom@vmware.com> wrote:

> On 08/13/2014 03:01 PM, Daniel Vetter wrote:
> > On Wed, Aug 13, 2014 at 02:35:52PM +0200, Thomas Hellstrom wrote:
> >> On 08/13/2014 12:42 PM, Daniel Vetter wrote:
> >>> On Wed, Aug 13, 2014 at 11:06:25AM +0200, Thomas Hellstrom wrote:
> >>>> On 08/13/2014 05:52 AM, Jérôme Glisse wrote:
> >>>>> From: Jérôme Glisse <jglisse@redhat.com>
> >>>>>
> >>>>> When experiencing memory pressure we want to minimize pool size so that
> >>>>> memory we just shrinked is not added back again just as the next thing.
> >>>>>
> >>>>> This will divide by 2 the maximum pool size for each device each time
> >>>>> the pool have to shrink. The limit is bumped again is next allocation
> >>>>> happen after one second since the last shrink. The one second delay is
> >>>>> obviously an arbitrary choice.
> >>>> Jérôme,
> >>>>
> >>>> I don't like this patch. It adds extra complexity and its usefulness is
> >>>> highly questionable.
> >>>> There are a number of caches in the system, and if all of them added
> >>>> some sort of voluntary shrink heuristics like this, we'd end up with
> >>>> impossible-to-debug unpredictable performance issues.
> >>>>
> >>>> We should let the memory subsystem decide when to reclaim pages from
> >>>> caches and what caches to reclaim them from.
> >>> Yeah, artificially limiting your cache from growing when your shrinker
> >>> gets called will just break the equal-memory pressure the core mm uses to
> >>> rebalance between all caches when workload changes. In i915 we let
> >>> everything grow without artificial bounds and only rely upon the shrinker
> >>> callbacks to ensure we don't consume more than our fair share of available
> >>> memory overall.
> >>> -Daniel
> >> Now when you bring i915 memory usage up, Daniel,
> >> I can't refrain from bringing up the old user-space unreclaimable kernel
> >> memory issue, for which gem open is a good example ;) Each time
> >> user-space opens a gem handle, some un-reclaimable kernel memory is
> >> allocated, for which there is no accounting, so theoretically I think a
> >> user can bring a system to unusability this way.
> >>
> >> Typically there are various limits on unreclaimable objects like this,
> >> like open file descriptors, and IIRC the kernel even has an internal
> >> limit on the number of struct files you initialize, based on the
> >> available system memory, so dma-buf / prime should already have some
> >> sort of protection.
> > Oh yeah, we have zero cgroups limits or similar stuff for gem allocations,
> > so there's not really a way to isolate gpu memory usage in a sane way for
> > specific processes. But there's also zero limits on actual gpu usage
> > itself (timeslices or whatever) so I guess no one asked for this yet.
> 
> In its simplest form (like in TTM if correctly implemented by drivers)
> this type of accounting stops non-privileged malicious GPU-users from
> exhausting all system physical memory causing grief for other kernel
> systems but not from causing grief for other GPU users. I think that's
> the minimum level that's intended also for example also for the struct
> file accounting.
> 
> > My comment really was about balancing mm users under the assumption that
> > they're all unlimited.
> 
> Yeah, sorry for stealing the thread. I usually bring this up now and
> again but nowadays with an exponential backoff.

Yeah, I agree we're probably missing some good limits infrastructure in
i915 and DRM in general.  Chris started looking at this a while back, but
I haven't seen anything recently.  Tying into ulimits/rlimits might
make sense, and at the very least we need to account for things
properly so we can add new limits where needed.
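
The closest existing per-process analogue for pinned memory is probably
RLIMIT_MEMLOCK; the snippet below is only an illustration of consulting it
from userspace, not something any of the patches here touch. A GPU-specific
limit would still need a new rlimit or a cgroup controller.

#include <stdio.h>
#include <sys/resource.h>

/* Illustrative only: query the locked-memory rlimit, the closest existing
 * per-process knob to "how much memory may this task pin". */
int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}
	printf("RLIMIT_MEMLOCK soft=%llu hard=%llu bytes\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)rl.rlim_max);
	return 0;
}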
diff mbox

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 09874d6..ab41adf 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -68,6 +68,8 @@ 
  * @list: Pool of free uc/wc pages for fast reuse.
  * @gfp_flags: Flags to pass for alloc_page.
  * @npages: Number of pages in pool.
+ * @cur_max_size: Current maximum size for the pool.
+ * @shrink_timeout: Timeout for pool maximum size restriction.
  */
 struct ttm_page_pool {
 	spinlock_t		lock;
@@ -76,6 +78,8 @@  struct ttm_page_pool {
 	gfp_t			gfp_flags;
 	unsigned		npages;
 	char			*name;
+	unsigned		cur_max_size;
+	unsigned long		shrink_timeout;
 	unsigned long		nfrees;
 	unsigned long		nrefills;
 };
@@ -289,6 +293,16 @@  static void ttm_pool_update_free_locked(struct ttm_page_pool *pool,
 	pool->nfrees += freed_pages;
 }
 
+static inline void ttm_pool_update_max_size(struct ttm_page_pool *pool)
+{
+	if (time_before(jiffies, pool->shrink_timeout))
+		return;
+	/* In case we reached zero, bounce back to 512 pages. */
+	pool->cur_max_size = max(pool->cur_max_size << 1, 512u);
+	pool->cur_max_size = min(pool->cur_max_size,
+				 _manager->options.max_size);
+}
+
 /**
  * Free pages from pool.
  *
@@ -407,6 +421,9 @@  ttm_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		if (shrink_pages == 0)
 			break;
 		pool = &_manager->pools[(i + pool_offset)%NUM_POOLS];
+		/* No matter what, make sure the pool does not grow in the next second. */
+		pool->cur_max_size = pool->cur_max_size >> 1;
+		pool->shrink_timeout = jiffies + HZ;
 		shrink_pages = ttm_page_pool_free(pool, nr_free,
 						  sc->gfp_mask);
 		freed += nr_free - shrink_pages;
@@ -701,13 +718,12 @@  static void ttm_put_pages(struct page **pages, unsigned npages, int flags,
 	}
 	/* Check that we don't go over the pool limit */
 	npages = 0;
-	if (pool->npages > _manager->options.max_size) {
-		npages = pool->npages - _manager->options.max_size;
-		/* free at least NUM_PAGES_TO_ALLOC number of pages
-		 * to reduce calls to set_memory_wb */
-		if (npages < NUM_PAGES_TO_ALLOC)
-			npages = NUM_PAGES_TO_ALLOC;
-	}
+	/*
+	 * Free at least NUM_PAGES_TO_ALLOC number of pages to reduce calls to
+	 * set_memory_wb.
+	 */
+	if (pool->npages > (pool->cur_max_size + NUM_PAGES_TO_ALLOC))
+		npages = pool->npages - pool->cur_max_size;
 	spin_unlock_irqrestore(&pool->lock, irq_flags);
 	if (npages)
 		ttm_page_pool_free(pool, npages, GFP_KERNEL);
@@ -751,6 +767,9 @@  static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		return 0;
 	}
 
+	/* Update pool size in case shrinker limited it. */
+	ttm_pool_update_max_size(pool);
+
 	/* combine zero flag to pool flags */
 	gfp_flags |= pool->gfp_flags;
 
@@ -803,6 +822,8 @@  static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, gfp_t flags,
 	pool->npages = pool->nfrees = 0;
 	pool->gfp_flags = flags;
 	pool->name = name;
+	pool->cur_max_size = _manager->options.max_size;
+	pool->shrink_timeout = jiffies;
 }
 
 int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages)
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index a076ff3..80b10aa 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -93,6 +93,8 @@  enum pool_type {
  * @size: Size used during DMA allocation.
  * @npages_free: Count of available pages for re-use.
  * @npages_in_use: Count of pages that are in use.
+ * @cur_max_size: Current maximum size for the pool.
+ * @shrink_timeout: Timeout for pool maximum size restriction.
  * @nfrees: Stats when pool is shrinking.
  * @nrefills: Stats when the pool is grown.
  * @gfp_flags: Flags to pass for alloc_page.
@@ -110,6 +112,8 @@  struct dma_pool {
 	unsigned size;
 	unsigned npages_free;
 	unsigned npages_in_use;
+	unsigned cur_max_size;
+	unsigned long shrink_timeout;
 	unsigned long nfrees; /* Stats when shrunk. */
 	unsigned long nrefills; /* Stats when grown. */
 	gfp_t gfp_flags;
@@ -331,6 +335,17 @@  static void __ttm_dma_free_page(struct dma_pool *pool, struct dma_page *d_page)
 	kfree(d_page);
 	d_page = NULL;
 }
+
+static inline void ttm_dma_pool_update_max_size(struct dma_pool *pool)
+{
+	if (time_before(jiffies, pool->shrink_timeout))
+		return;
+	/* In case we reached zero, bounce back to 512 pages. */
+	pool->cur_max_size = max(pool->cur_max_size << 1, 512u);
+	pool->cur_max_size = min(pool->cur_max_size,
+				 _manager->options.max_size);
+}
+
 static struct dma_page *__ttm_dma_alloc_page(struct dma_pool *pool)
 {
 	struct dma_page *d_page;
@@ -606,6 +621,8 @@  static struct dma_pool *ttm_dma_pool_init(struct device *dev, gfp_t flags,
 	pool->size = PAGE_SIZE;
 	pool->type = type;
 	pool->nrefills = 0;
+	pool->cur_max_size = _manager->options.max_size;
+	pool->shrink_timeout = jiffies;
 	p = pool->name;
 	for (i = 0; i < 5; i++) {
 		if (type & t[i]) {
@@ -892,6 +909,9 @@  int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev)
 		}
 	}
 
+	/* Update pool size in case shrinker limited it. */
+	ttm_dma_pool_update_max_size(pool);
+
 	INIT_LIST_HEAD(&ttm_dma->pages_list);
 	for (i = 0; i < ttm->num_pages; ++i) {
 		ret = ttm_dma_pool_get_pages(pool, ttm_dma, i);
@@ -953,9 +973,9 @@  void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev)
 	} else {
 		pool->npages_free += count;
 		list_splice(&ttm_dma->pages_list, &pool->free_list);
-		if (pool->npages_free >= (_manager->options.max_size +
+		if (pool->npages_free >= (pool->cur_max_size +
 					  NUM_PAGES_TO_ALLOC))
-			npages = pool->npages_free - _manager->options.max_size;
+			npages = pool->npages_free - pool->cur_max_size;
 	}
 	spin_unlock_irqrestore(&pool->lock, irq_flags);
 
@@ -1024,6 +1044,9 @@  ttm_dma_pool_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		/* Do it in round-robin fashion. */
 		if (++idx < pool_offset)
 			continue;
+		/* No matter what, make sure the pool does not grow in the next second. */
+		p->pool->cur_max_size = p->pool->cur_max_size >> 1;
+		p->pool->shrink_timeout = jiffies + HZ;
 		nr_free = shrink_pages;
 		shrink_pages = ttm_dma_page_pool_free(p->pool, nr_free,
 						      sc->gfp_mask);
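
For readers skimming the diff: the policy boils down to halving cur_max_size
every time the shrinker runs, and letting allocations double it again, clamped
between 512 pages and the configured maximum, once a second has passed since
the last shrink. The stand-alone model below is only an illustrative sketch of
that policy; the simulated clock and the POOL_MAX_SIZE constant are stand-ins,
not values taken from the patch.

/* Stand-alone model of the pool-limit backoff: halve the cap when the
 * shrinker runs, double it again (floored at 512 pages, capped at the
 * configured maximum) once a second has passed since the last shrink.
 * Time is simulated here; the kernel code uses jiffies + HZ. */
#include <stdio.h>

#define POOL_MAX_SIZE	4096u	/* stand-in for _manager->options.max_size */

struct pool_model {
	unsigned cur_max_size;
	unsigned long shrink_timeout;	/* simulated "jiffies" deadline */
};

static void model_shrink(struct pool_model *p, unsigned long now)
{
	p->cur_max_size >>= 1;		/* no growth for the next second */
	p->shrink_timeout = now + 1;
}

static void model_alloc(struct pool_model *p, unsigned long now)
{
	if (now < p->shrink_timeout)
		return;
	/* In case we reached zero, bounce back to 512 pages. */
	p->cur_max_size <<= 1;
	if (p->cur_max_size < 512)
		p->cur_max_size = 512;
	if (p->cur_max_size > POOL_MAX_SIZE)
		p->cur_max_size = POOL_MAX_SIZE;
}

int main(void)
{
	struct pool_model p = { .cur_max_size = POOL_MAX_SIZE };
	unsigned long t;

	for (t = 0; t < 6; t++) {
		model_shrink(&p, t);	/* memory pressure every "second" */
		printf("t=%lu after shrink: cap=%u\n", t, p.cur_max_size);
	}
	for (; t < 12; t++) {
		model_alloc(&p, t);	/* pressure gone, allocations resume */
		printf("t=%lu after alloc:  cap=%u\n", t, p.cur_max_size);
	}
	return 0;
}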