Message ID | 20210312154331.32229-8-mgorman@techsingularity.net (mailing list archive) |
---|---|
State | New, archived |
Series | Introduce a bulk order-0 page allocator with two in-tree users |
On Fri, Mar 12, 2021 at 7:43 AM Mel Gorman <mgorman@techsingularity.net> wrote:
>
> From: Jesper Dangaard Brouer <brouer@redhat.com>
>
> There are cases where the page_pool need to refill with pages from the
> page allocator. Some workloads cause the page_pool to release pages
> instead of recycling these pages.
>
> For these workload it can improve performance to bulk alloc pages from
> the page-allocator to refill the alloc cache.
>
> For XDP-redirect workload with 100G mlx5 driver (that use page_pool)
> redirecting xdp_frame packets into a veth, that does XDP_PASS to create
> an SKB from the xdp_frame, which then cannot return the page to the
> page_pool. In this case, we saw[1] an improvement of 18.8% from using
> the alloc_pages_bulk API (3,677,958 pps -> 4,368,926 pps).
>
> [1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
> ---
>  net/core/page_pool.c | 62 ++++++++++++++++++++++++++++----------------
>  1 file changed, 39 insertions(+), 23 deletions(-)
>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 40e1b2beaa6c..a5889f1b86aa 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -208,44 +208,60 @@ noinline
>  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>                                                   gfp_t _gfp)
>  {
> +       const int bulk = PP_ALLOC_CACHE_REFILL;
> +       struct page *page, *next, *first_page;
>         unsigned int pp_flags = pool->p.flags;
> -       struct page *page;
> +       unsigned int pp_order = pool->p.order;
> +       int pp_nid = pool->p.nid;
> +       LIST_HEAD(page_list);
>         gfp_t gfp = _gfp;
>
> -       /* We could always set __GFP_COMP, and avoid this branch, as
> -        * prep_new_page() can handle order-0 with __GFP_COMP.
> -        */
> -       if (pool->p.order)
> +       /* Don't support bulk alloc for high-order pages */
> +       if (unlikely(pp_order)) {
>                 gfp |= __GFP_COMP;
> +               first_page = alloc_pages_node(pp_nid, gfp, pp_order);
> +               if (unlikely(!first_page))
> +                       return NULL;
> +               goto out;
> +       }
>
> -       /* FUTURE development:
> -        *
> -        * Current slow-path essentially falls back to single page
> -        * allocations, which doesn't improve performance. This code
> -        * need bulk allocation support from the page allocator code.
> -        */
> -
> -       /* Cache was empty, do real allocation */
> -#ifdef CONFIG_NUMA
> -       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
> -#else
> -       page = alloc_pages(gfp, pool->p.order);
> -#endif
> -       if (!page)
> +       if (unlikely(!__alloc_pages_bulk(gfp, pp_nid, NULL, bulk, &page_list)))
>                 return NULL;
>
> +       /* First page is extracted and returned to caller */
> +       first_page = list_first_entry(&page_list, struct page, lru);
> +       list_del(&first_page->lru);
> +

This seems kind of broken to me. If you pull the first page and then
cannot map it you end up returning NULL even if you placed a number of
pages in the cache.

It might make more sense to have the loop below record a pointer to the
last page you processed and handle things in two stages so that on the
first iteration you map one page. So something along the lines of:
1. Initialize last_page to NULL
for each page in the list
2. Map page
3. If last_page is non-NULL, move to cache
4. Assign page to last_page
5. Return to step 2 for each page in list
6. return last_page

> +       /* Remaining pages store in alloc.cache */
> +       list_for_each_entry_safe(page, next, &page_list, lru) {
> +               list_del(&page->lru);
> +               if ((pp_flags & PP_FLAG_DMA_MAP) &&
> +                   unlikely(!page_pool_dma_map(pool, page))) {
> +                       put_page(page);
> +                       continue;
> +               }

So if you added a last_page pointer what you could do is check for it
here and assign it to the alloc cache. If last_page is not set the
block would be skipped.

> +               if (likely(pool->alloc.count < PP_ALLOC_CACHE_SIZE)) {
> +                       pool->alloc.cache[pool->alloc.count++] = page;
> +                       pool->pages_state_hold_cnt++;
> +                       trace_page_pool_state_hold(pool, page,
> +                                                  pool->pages_state_hold_cnt);
> +               } else {
> +                       put_page(page);

If you are just calling put_page here aren't you leaking DMA mappings?
Wouldn't you need to potentially unmap the page before you call
put_page on it?

> +               }
> +       }
> +out:
>         if ((pp_flags & PP_FLAG_DMA_MAP) &&
> -           unlikely(!page_pool_dma_map(pool, page))) {
> -               put_page(page);
> +           unlikely(!page_pool_dma_map(pool, first_page))) {
> +               put_page(first_page);

I would probably move this block up and make it a part of the pp_order
block above. Also since you are doing this in 2 spots it might make
sense to look at possibly making this an inline function.

>                 return NULL;
>         }
>
>         /* Track how many pages are held 'in-flight' */
>         pool->pages_state_hold_cnt++;
> -       trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
> +       trace_page_pool_state_hold(pool, first_page, pool->pages_state_hold_cnt);
>
>         /* When page just alloc'ed is should/must have refcnt 1. */
> -       return page;
> +       return first_page;
>  }
>
>  /* For using page_pool replace: alloc_pages() API calls, but provide
> --
> 2.26.2
>
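To make the suggestion concrete, a rough sketch of that two-stage loop, using only the names from the quoted patch (pool, page_list, pp_flags, page_pool_dma_map(), etc.); this is an illustration of the idea, not the code that was eventually merged. The cache-capacity check is omitted because, as noted later in the thread, the alloc cache is empty when this slow path runs and the bulk count fits within it.

	struct page *page, *next, *last_page = NULL;

	list_for_each_entry_safe(page, next, &page_list, lru) {
		list_del(&page->lru);

		/* Stage 1: map the page; on failure drop it and move on */
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		/* Stage 2: push the previously mapped page into the cache,
		 * so the last successfully mapped page is left over and
		 * returned to the caller.
		 */
		if (last_page) {
			pool->alloc.cache[pool->alloc.count++] = last_page;
			pool->pages_state_hold_cnt++;
			trace_page_pool_state_hold(pool, last_page,
						   pool->pages_state_hold_cnt);
		}
		last_page = page;
	}

	/* NULL only if every page failed to map; otherwise account the
	 * returned page as in-flight as well.
	 */
	if (last_page) {
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, last_page,
					   pool->pages_state_hold_cnt);
	}
	return last_page;

With this shape, a single DMA-mapping failure no longer discards pages that were already placed in the cache.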
[...]
> 6. return last_page
>
> > +       /* Remaining pages store in alloc.cache */
> > +       list_for_each_entry_safe(page, next, &page_list, lru) {
> > +               list_del(&page->lru);
> > +               if ((pp_flags & PP_FLAG_DMA_MAP) &&
> > +                   unlikely(!page_pool_dma_map(pool, page))) {
> > +                       put_page(page);
> > +                       continue;
> > +               }
>
> So if you added a last_page pointer what you could do is check for it
> here and assign it to the alloc cache. If last_page is not set the
> block would be skipped.
>
> > +               if (likely(pool->alloc.count < PP_ALLOC_CACHE_SIZE)) {
> > +                       pool->alloc.cache[pool->alloc.count++] = page;
> > +                       pool->pages_state_hold_cnt++;
> > +                       trace_page_pool_state_hold(pool, page,
> > +                                                  pool->pages_state_hold_cnt);
> > +               } else {
> > +                       put_page(page);
>
> If you are just calling put_page here aren't you leaking DMA mappings?
> Wouldn't you need to potentially unmap the page before you call
> put_page on it?

Oops, I completely missed that. Alexander is right here.

> > +               }
> > +       }
> > +out:
> >         if ((pp_flags & PP_FLAG_DMA_MAP) &&
> > -           unlikely(!page_pool_dma_map(pool, page))) {
> > -               put_page(page);
> > +           unlikely(!page_pool_dma_map(pool, first_page))) {
> > +               put_page(first_page);
>
> I would probably move this block up and make it a part of the pp_order
> block above. Also since you are doing this in 2 spots it might make
> sense to look at possibly making this an inline function.
>
> >                 return NULL;
> >         }
> >
> >         /* Track how many pages are held 'in-flight' */
> >         pool->pages_state_hold_cnt++;
> > -       trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
> > +       trace_page_pool_state_hold(pool, first_page, pool->pages_state_hold_cnt);
> >
> >         /* When page just alloc'ed is should/must have refcnt 1. */
> > -       return page;
> > +       return first_page;
> >  }
> >
> >  /* For using page_pool replace: alloc_pages() API calls, but provide
> > --
> > 2.26.2
> >

Cheers
/Ilias
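Alexander's "make this an inline function" suggestion, for the DMA-map-or-drop block that appears in two spots, could look roughly like the following. The helper name is invented for illustration and is not part of the posted patch:

	/* Hypothetical helper: map the page for DMA if the pool requires it.
	 * On failure the page reference is dropped and false is returned, so
	 * the single-page path can bail out and the bulk loop can skip the page.
	 */
	static inline bool page_pool_dma_map_or_put(struct page_pool *pool,
						    struct page *page)
	{
		if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			return false;
		}
		return true;
	}

Both the high-order branch and the bulk refill loop could then share it, e.g. "if (!page_pool_dma_map_or_put(pool, first_page)) return NULL;".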
On Fri, Mar 12, 2021 at 11:44:09AM -0800, Alexander Duyck wrote:
> > -       /* FUTURE development:
> > -        *
> > -        * Current slow-path essentially falls back to single page
> > -        * allocations, which doesn't improve performance. This code
> > -        * need bulk allocation support from the page allocator code.
> > -        */
> > -
> > -       /* Cache was empty, do real allocation */
> > -#ifdef CONFIG_NUMA
> > -       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
> > -#else
> > -       page = alloc_pages(gfp, pool->p.order);
> > -#endif
> > -       if (!page)
> > +       if (unlikely(!__alloc_pages_bulk(gfp, pp_nid, NULL, bulk, &page_list)))
> >                 return NULL;
> >
> > +       /* First page is extracted and returned to caller */
> > +       first_page = list_first_entry(&page_list, struct page, lru);
> > +       list_del(&first_page->lru);
> > +
>
> This seems kind of broken to me. If you pull the first page and then
> cannot map it you end up returning NULL even if you placed a number of
> pages in the cache.
>

I think you're right but I'm punting this to Jesper to fix. He's more
familiar with this particular code and can verify the performance is
still ok for high speed networks.
On Fri, 12 Mar 2021 22:05:45 +0200
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> [...]
> > 6. return last_page
> >
> > > +       /* Remaining pages store in alloc.cache */
> > > +       list_for_each_entry_safe(page, next, &page_list, lru) {
> > > +               list_del(&page->lru);
> > > +               if ((pp_flags & PP_FLAG_DMA_MAP) &&
> > > +                   unlikely(!page_pool_dma_map(pool, page))) {
> > > +                       put_page(page);
> > > +                       continue;
> > > +               }
> >
> > So if you added a last_page pointer what you could do is check for it
> > here and assign it to the alloc cache. If last_page is not set the
> > block would be skipped.
> >
> > > +               if (likely(pool->alloc.count < PP_ALLOC_CACHE_SIZE)) {
> > > +                       pool->alloc.cache[pool->alloc.count++] = page;
> > > +                       pool->pages_state_hold_cnt++;
> > > +                       trace_page_pool_state_hold(pool, page,
> > > +                                                  pool->pages_state_hold_cnt);
> > > +               } else {
> > > +                       put_page(page);
> >
> > If you are just calling put_page here aren't you leaking DMA mappings?
> > Wouldn't you need to potentially unmap the page before you call
> > put_page on it?
>
> Oops, I completely missed that. Alexander is right here.

Well, the put_page() case can never happen as the pool->alloc.cache[]
is known to be empty when this function is called. I do agree that the
code looks cumbersome and should free the DMA mapping, if it could
happen.
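If that overflow branch ever did trigger, the DMA mapping would need to be released before the final put_page(). A minimal sketch, assuming the pool fields and the page_pool_get_dma_addr() accessor as they exist in the page_pool code of this period; the helper name below is hypothetical, not the merged fix:

	/* Release the DMA mapping (if any) set up by page_pool_dma_map()
	 * before dropping the last reference, so the mapping is not leaked.
	 */
	static void page_pool_put_unmapped_page(struct page_pool *pool,
						struct page *page)
	{
		if (pool->p.flags & PP_FLAG_DMA_MAP)
			dma_unmap_page_attrs(pool->p.dev,
					     page_pool_get_dma_addr(page),
					     PAGE_SIZE << pool->p.order,
					     pool->p.dma_dir,
					     DMA_ATTR_SKIP_CPU_SYNC);
		put_page(page);
	}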
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 40e1b2beaa6c..a5889f1b86aa 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -208,44 +208,60 @@ noinline
 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
                                                  gfp_t _gfp)
 {
+       const int bulk = PP_ALLOC_CACHE_REFILL;
+       struct page *page, *next, *first_page;
        unsigned int pp_flags = pool->p.flags;
-       struct page *page;
+       unsigned int pp_order = pool->p.order;
+       int pp_nid = pool->p.nid;
+       LIST_HEAD(page_list);
        gfp_t gfp = _gfp;

-       /* We could always set __GFP_COMP, and avoid this branch, as
-        * prep_new_page() can handle order-0 with __GFP_COMP.
-        */
-       if (pool->p.order)
+       /* Don't support bulk alloc for high-order pages */
+       if (unlikely(pp_order)) {
                gfp |= __GFP_COMP;
+               first_page = alloc_pages_node(pp_nid, gfp, pp_order);
+               if (unlikely(!first_page))
+                       return NULL;
+               goto out;
+       }

-       /* FUTURE development:
-        *
-        * Current slow-path essentially falls back to single page
-        * allocations, which doesn't improve performance. This code
-        * need bulk allocation support from the page allocator code.
-        */
-
-       /* Cache was empty, do real allocation */
-#ifdef CONFIG_NUMA
-       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
-#else
-       page = alloc_pages(gfp, pool->p.order);
-#endif
-       if (!page)
+       if (unlikely(!__alloc_pages_bulk(gfp, pp_nid, NULL, bulk, &page_list)))
                return NULL;

+       /* First page is extracted and returned to caller */
+       first_page = list_first_entry(&page_list, struct page, lru);
+       list_del(&first_page->lru);
+
+       /* Remaining pages store in alloc.cache */
+       list_for_each_entry_safe(page, next, &page_list, lru) {
+               list_del(&page->lru);
+               if ((pp_flags & PP_FLAG_DMA_MAP) &&
+                   unlikely(!page_pool_dma_map(pool, page))) {
+                       put_page(page);
+                       continue;
+               }
+               if (likely(pool->alloc.count < PP_ALLOC_CACHE_SIZE)) {
+                       pool->alloc.cache[pool->alloc.count++] = page;
+                       pool->pages_state_hold_cnt++;
+                       trace_page_pool_state_hold(pool, page,
+                                                  pool->pages_state_hold_cnt);
+               } else {
+                       put_page(page);
+               }
+       }
+out:
        if ((pp_flags & PP_FLAG_DMA_MAP) &&
-           unlikely(!page_pool_dma_map(pool, page))) {
-               put_page(page);
+           unlikely(!page_pool_dma_map(pool, first_page))) {
+               put_page(first_page);
                return NULL;
        }

        /* Track how many pages are held 'in-flight' */
        pool->pages_state_hold_cnt++;
-       trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+       trace_page_pool_state_hold(pool, first_page, pool->pages_state_hold_cnt);

        /* When page just alloc'ed is should/must have refcnt 1. */
-       return page;
+       return first_page;
 }

 /* For using page_pool replace: alloc_pages() API calls, but provide
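For context on why the overflow case cannot trigger: the slow path above is only entered once the alloc cache (and the ptr_ring behind it) is empty. The caller side in net/core/page_pool.c looks roughly like this; paraphrased from memory of the kernel source of this period, so treat the exact shape as an assumption:

	struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
	{
		struct page *page;

		/* Fast-path: take a page from the alloc cache, which is
		 * refilled from the ptr_ring when it runs dry.
		 */
		page = __page_pool_get_cached(pool);
		if (page)
			return page;

		/* Slow-path: cache empty, do real allocation */
		page = __page_pool_alloc_pages_slow(pool, gfp);
		return page;
	}

So when __page_pool_alloc_pages_slow() runs, pool->alloc.count is zero and a refill of PP_ALLOC_CACHE_REFILL pages fits in the cache.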