| Message ID | 162fe40c387cd395d633729fa4f2b5245531514a.1663879752.git.pabeni@redhat.com (mailing list archive) |
|---|---|
| State | Superseded |
| Delegated to | Netdev Maintainers |
| Series | [net-next,v2] net: skb: introduce and use a single page frag cache |
On Thu, Sep 22, 2022 at 2:01 PM Paolo Abeni <pabeni@redhat.com> wrote: > > After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation > for tiny skbs") we are observing 10-20% regressions in performance > tests with small packets. The perf trace points to high pressure on > the slab allocator. > > This change tries to improve the allocation schema for small packets > using an idea originally suggested by Eric: a new per CPU page frag is > introduced and used in __napi_alloc_skb to cope with small allocation > requests. > > To ensure that the above does not lead to excessive truesize > underestimation, the frag size for small allocation is inflated to 1K > and all the above is restricted to build with 4K page size. > > Note that we need to update accordingly the run-time check introduced > with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper"). > > Alex suggested a smart page refcount schema to reduce the number > of atomic operations and deal properly with pfmemalloc pages. > > Under small packet UDP flood, I measure a 15% peak tput increases. > > Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> > Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com> Please update my email to <alexanderduyck@fb.com>. > Signed-off-by: Paolo Abeni <pabeni@redhat.com> > --- > v1 -> v2: > - better page_frag_alloc_1k() (Alex & Eric) > - avoid duplicate code and gfp_flags misuse in __napi_alloc_skb() (Alex) > --- > include/linux/netdevice.h | 1 + > net/core/dev.c | 17 ------ > net/core/skbuff.c | 106 ++++++++++++++++++++++++++++++++++++-- > 3 files changed, 102 insertions(+), 22 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 9f42fc871c3b..a1938560192a 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head); > gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); > void napi_gro_flush(struct napi_struct *napi, bool flush_old); > struct sk_buff *napi_get_frags(struct napi_struct *napi); > +void napi_get_frags_check(struct napi_struct *napi); > gro_result_t napi_gro_frags(struct napi_struct *napi); > struct packet_offload *gro_find_receive_by_type(__be16 type); > struct packet_offload *gro_find_complete_by_type(__be16 type); > diff --git a/net/core/dev.c b/net/core/dev.c > index d66c73c1c734..fa53830d0683 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded) > } > EXPORT_SYMBOL(dev_set_threaded); > > -/* Double check that napi_get_frags() allocates skbs with > - * skb->head being backed by slab, not a page fragment. > - * This is to make sure bug fixed in 3226b158e67c > - * ("net: avoid 32 x truesize under-estimation for tiny skbs") > - * does not accidentally come back. 
> - */ > -static void napi_get_frags_check(struct napi_struct *napi) > -{ > - struct sk_buff *skb; > - > - local_bh_disable(); > - skb = napi_get_frags(napi); > - WARN_ON_ONCE(skb && skb->head_frag); > - napi_free_frags(napi); > - local_bh_enable(); > -} > - > void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, > int (*poll)(struct napi_struct *, int), int weight) > { > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index f1b8b20fc20b..00340b0cf6eb 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -134,8 +134,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) > #define NAPI_SKB_CACHE_BULK 16 > #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) > > +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we > + * can imply such condition checking the double word and MAX_HEADER size > + */ > +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64) > + > +#define NAPI_HAS_SMALL_PAGE_FRAG 1 > + > +/* specializzed page frag allocator using a single order 0 page > + * and slicing it into 1K sized fragment. Constrained to system > + * with: > + * - a very limited amount of 1K fragments fitting a single > + * page - to avoid excessive truesize underestimation > + * - reasonably high truesize value for napi_get_frags() > + * allocation - to avoid memory usage increased compared > + * to kalloc, see __napi_alloc_skb() > + */ > + > +struct page_frag_1k { > + void *va; > + u16 offset; > + bool pfmemalloc; > +}; > + > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) > +{ > + struct page *page; > + int offset; > + > + offset = nc->offset - SZ_1K; > + if (likely(offset >= 0)) > + goto use_frag; > + > + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); > + if (!page) > + return NULL; > + > + nc->va = page_address(page); > + nc->pfmemalloc = page_is_pfmemalloc(page); > + offset = PAGE_SIZE - SZ_1K; > + page_ref_add(page, offset / SZ_1K); > + > +use_frag: > + nc->offset = offset; > + return nc->va + offset; > +} > +#else > +#define NAPI_HAS_SMALL_PAGE_FRAG 0 > + > +struct page_frag_1k { > +}; > + > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) > +{ > + return NULL; > +} > + > +#endif > + > struct napi_alloc_cache { > struct page_frag_cache page; > + struct page_frag_1k page_small; My suggestion earlier was to just make the 1k cache a page_frag_cache. It will allow you to reuse the same structure members and a single pointer to track them. Waste would be minimal since the only real difference between the structures is about 8B for the structure, and odds are the napi_alloc_cache allocation is being rounded up anyway. > unsigned int skb_count; > void *skb_cache[NAPI_SKB_CACHE_SIZE]; > }; > @@ -143,6 +202,23 @@ struct napi_alloc_cache { > static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); > static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); > > +/* Double check that napi_get_frags() allocates skbs with > + * skb->head being backed by slab, not a page fragment. > + * This is to make sure bug fixed in 3226b158e67c > + * ("net: avoid 32 x truesize under-estimation for tiny skbs") > + * does not accidentally come back. 
> + */ > +void napi_get_frags_check(struct napi_struct *napi) > +{ > + struct sk_buff *skb; > + > + local_bh_disable(); > + skb = napi_get_frags(napi); > + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); > + napi_free_frags(napi); > + local_bh_enable(); > +} > + > void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) > { > struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > { > struct napi_alloc_cache *nc; > struct sk_buff *skb; > + bool pfmemalloc; Rather than adding this I think you would be better off adding a struct page_frag_cache pointer. I will reference it here as "pfc". > void *data; > > DEBUG_NET_WARN_ON_ONCE(!in_softirq()); > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > /* If requested length is either too small or too big, > * we use kmalloc() for skb->head allocation. > + * When the small frag allocator is available, prefer it over kmalloc > + * for small fragments > */ > - if (len <= SKB_WITH_OVERHEAD(1024) || > + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || > len > SKB_WITH_OVERHEAD(PAGE_SIZE) || > (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { > skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > } > > nc = this_cpu_ptr(&napi_alloc_cache); > - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > - len = SKB_DATA_ALIGN(len); > > if (sk_memalloc_socks()) > gfp_mask |= __GFP_MEMALLOC; > > - data = page_frag_alloc(&nc->page, len, gfp_mask); > + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { Then here you would add a line that would be: pfc = &nc->page_small; > + /* we are artificially inflating the allocation size, but > + * that is not as bad as it may look like, as: > + * - 'len' less then GRO_MAX_HEAD makes little sense > + * - larger 'len' values lead to fragment size above 512 bytes > + * as per NAPI_HAS_SMALL_PAGE_FRAG definition > + * - kmalloc would use the kmalloc-1k slab for such values > + */ > + len = SZ_1K; > + > + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); > + pfmemalloc = nc->page_small.pfmemalloc; Instead of setting pfmemalloc you could just update the line below. In addition you would just be passing pfc as the parameter. > + } else { Likewise here you would have the line: pfc = &nc->page; > + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > + len = SKB_DATA_ALIGN(len); > + > + data = page_frag_alloc(&nc->page, len, gfp_mask); > + pfmemalloc = nc->page.pfmemalloc; Again no need for the pfmemalloc and the alloc could just come from pfc > + } > + > if (unlikely(!data)) > return NULL; > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > return NULL; > } > > - if (nc->page.pfmemalloc) > + if (pfmemalloc) Instead of passing pfmemalloc you could just check pfc->pfmemalloc. Alternatively I wonder if it wouldn't be faster to just set the value directly based on frag_cache->pfmemalloc and avoid the conditional check entirely. > skb->pfmemalloc = 1; > skb->head_frag = 1; > > -- > 2.37.3 >
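For reference, the single-pointer variant Alexander describes above would look roughly as follows inside __napi_alloc_skb(). This is only an illustrative sketch of the suggestion, not code from the posted patch: it assumes page_small is turned into a full struct page_frag_cache and that page_frag_alloc_1k() is adapted to take one.

        struct page_frag_cache *pfc;
        ...
        if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
                pfc = &nc->page_small;          /* the per-CPU 1K cache */
                len = SZ_1K;
                data = page_frag_alloc_1k(pfc, gfp_mask);
        } else {
                pfc = &nc->page;
                len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
                len = SKB_DATA_ALIGN(len);
                data = page_frag_alloc(pfc, len, gfp_mask);
        }
        ...
        /* skb is built from data as in the patch, then: */
        if (pfc->pfmemalloc)                    /* single flag read, no extra local */
                skb->pfmemalloc = 1;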
Hello, On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote: [...] > My suggestion earlier was to just make the 1k cache a page_frag_cache. > It will allow you to reuse the same structure members and a single > pointer to track them. Waste would be minimal since the only real > difference between the structures is about 8B for the structure, and > odds are the napi_alloc_cache allocation is being rounded up anyway. > > > unsigned int skb_count; > > void *skb_cache[NAPI_SKB_CACHE_SIZE]; > > }; > > @@ -143,6 +202,23 @@ struct napi_alloc_cache { > > static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); > > static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); > > > > +/* Double check that napi_get_frags() allocates skbs with > > + * skb->head being backed by slab, not a page fragment. > > + * This is to make sure bug fixed in 3226b158e67c > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs") > > + * does not accidentally come back. > > + */ > > +void napi_get_frags_check(struct napi_struct *napi) > > +{ > > + struct sk_buff *skb; > > + > > + local_bh_disable(); > > + skb = napi_get_frags(napi); > > + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); > > + napi_free_frags(napi); > > + local_bh_enable(); > > +} > > + > > void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) > > { > > struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); > > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > { > > struct napi_alloc_cache *nc; > > struct sk_buff *skb; > > + bool pfmemalloc; > > Rather than adding this I think you would be better off adding a > struct page_frag_cache pointer. I will reference it here as "pfc". > > > void *data; > > > > DEBUG_NET_WARN_ON_ONCE(!in_softirq()); > > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > /* If requested length is either too small or too big, > > * we use kmalloc() for skb->head allocation. > > + * When the small frag allocator is available, prefer it over kmalloc > > + * for small fragments > > */ > > - if (len <= SKB_WITH_OVERHEAD(1024) || > > + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || > > len > SKB_WITH_OVERHEAD(PAGE_SIZE) || > > (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { > > skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, > > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > } > > > > nc = this_cpu_ptr(&napi_alloc_cache); > > - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > > - len = SKB_DATA_ALIGN(len); > > > > if (sk_memalloc_socks()) > > gfp_mask |= __GFP_MEMALLOC; > > > > - data = page_frag_alloc(&nc->page, len, gfp_mask); > > + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { > > Then here you would add a line that would be: > pfc = &nc->page_small; > > > + /* we are artificially inflating the allocation size, but > > + * that is not as bad as it may look like, as: > > + * - 'len' less then GRO_MAX_HEAD makes little sense > > + * - larger 'len' values lead to fragment size above 512 bytes > > + * as per NAPI_HAS_SMALL_PAGE_FRAG definition > > + * - kmalloc would use the kmalloc-1k slab for such values > > + */ > > + len = SZ_1K; > > + > > + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); > > + pfmemalloc = nc->page_small.pfmemalloc; > > Instead of setting pfmemalloc you could just update the line below. 
In > addition you would just be passing pfc as the parameter. > > > + } else { > > Likewise here you would have the line: > pfc = &nc->page; Probably I was not clear in my previous email, sorry: before posting this version I tried locally exactly all the above, and the generated code with gcc 11.3.1 is a little bigger (a couple of instructions) than what this version produces (with gcc 11.3.1-2). It has the same number of conditionals and a slightly larger napi_alloc_cache. Additionally the suggested alternative needs more pre-processor conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case - avoiding adding a 2nd, unused in that case, page_frag_cache. [...] > > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > return NULL; > > } > > > > - if (nc->page.pfmemalloc) > > + if (pfmemalloc) > > Instead of passing pfmemalloc you could just check pfc->pfmemalloc. > Alternatively I wonder if it wouldn't be faster to just set the value > directly based on frag_cache->pfmemalloc and avoid the conditional > check entirely. Note that: skb->pfmemalloc = pfmemalloc; avoids a branch but requires more instructions than the current code (verified with gcc). The gain looks doubtful?!? Additionally we have statement alike: if (<pfmemalloc>) skb->pfmemalloc = 1; in other functions in skbuff.c - still fast-path - and would be better updating all the places together for consistency - if that is really considered an improvement. IMHO it should at least land in a different patch. I'll post a v3 with your updated email address, but I think the current code is the better option. Cheers, Paolo
Hi Paolo, I love your patch! Yet something to improve: [auto build test ERROR on net-next/master] url: https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/net-skb-introduce-and-use-a-single-page-frag-cache/20220923-050251 base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 2b9977470b39e011ee5fbc01ca55411a7768fb9d config: arc-randconfig-r043-20220922 (https://download.01.org/0day-ci/archive/20220923/202209231747.KOTeKw3E-lkp@intel.com/config) compiler: arceb-elf-gcc (GCC) 12.1.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/40ceb68029b30e158de46d21f73d0439ab8c2277 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Paolo-Abeni/net-skb-introduce-and-use-a-single-page-frag-cache/20220923-050251 git checkout 40ceb68029b30e158de46d21f73d0439ab8c2277 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash net/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> All errors (new ones prefixed by >>): net/core/skbuff.c: In function '__napi_alloc_skb': >> net/core/skbuff.c:677:44: error: 'struct page_frag_1k' has no member named 'pfmemalloc' 677 | pfmemalloc = nc->page_small.pfmemalloc; | ^ vim +677 net/core/skbuff.c 621 622 /** 623 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance 624 * @napi: napi instance this buffer was allocated for 625 * @len: length to allocate 626 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages 627 * 628 * Allocate a new sk_buff for use in NAPI receive. This buffer will 629 * attempt to allocate the head from a special reserved region used 630 * only for NAPI Rx allocation. By doing this we can save several 631 * CPU cycles by avoiding having to disable and re-enable IRQs. 632 * 633 * %NULL is returned if there is no free memory. 634 */ 635 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, 636 gfp_t gfp_mask) 637 { 638 struct napi_alloc_cache *nc; 639 struct sk_buff *skb; 640 bool pfmemalloc; 641 void *data; 642 643 DEBUG_NET_WARN_ON_ONCE(!in_softirq()); 644 len += NET_SKB_PAD + NET_IP_ALIGN; 645 646 /* If requested length is either too small or too big, 647 * we use kmalloc() for skb->head allocation. 
648 * When the small frag allocator is available, prefer it over kmalloc 649 * for small fragments 650 */ 651 if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || 652 len > SKB_WITH_OVERHEAD(PAGE_SIZE) || 653 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { 654 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, 655 NUMA_NO_NODE); 656 if (!skb) 657 goto skb_fail; 658 goto skb_success; 659 } 660 661 nc = this_cpu_ptr(&napi_alloc_cache); 662 663 if (sk_memalloc_socks()) 664 gfp_mask |= __GFP_MEMALLOC; 665 666 if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { 667 /* we are artificially inflating the allocation size, but 668 * that is not as bad as it may look like, as: 669 * - 'len' less then GRO_MAX_HEAD makes little sense 670 * - larger 'len' values lead to fragment size above 512 bytes 671 * as per NAPI_HAS_SMALL_PAGE_FRAG definition 672 * - kmalloc would use the kmalloc-1k slab for such values 673 */ 674 len = SZ_1K; 675 676 data = page_frag_alloc_1k(&nc->page_small, gfp_mask); > 677 pfmemalloc = nc->page_small.pfmemalloc; 678 } else { 679 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 680 len = SKB_DATA_ALIGN(len); 681 682 data = page_frag_alloc(&nc->page, len, gfp_mask); 683 pfmemalloc = nc->page.pfmemalloc; 684 } 685 686 if (unlikely(!data)) 687 return NULL; 688 689 skb = __napi_build_skb(data, len); 690 if (unlikely(!skb)) { 691 skb_free_frag(data); 692 return NULL; 693 } 694 695 if (pfmemalloc) 696 skb->pfmemalloc = 1; 697 skb->head_frag = 1; 698 699 skb_success: 700 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); 701 skb->dev = napi->dev; 702 703 skb_fail: 704 return skb; 705 } 706 EXPORT_SYMBOL(__napi_alloc_skb); 707
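The error above comes from the !NAPI_HAS_SMALL_PAGE_FRAG fallback, where struct page_frag_1k is empty, so nc->page_small.pfmemalloc does not exist on configs such as this arc randconfig. One way to keep that path compiling (a sketch of a possible fix, not necessarily what the next revision ended up doing) is to hide the field access behind a helper macro that collapses to false when the small-page allocator is compiled out:

#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
/* ... existing NAPI_HAS_SMALL_PAGE_FRAG == 1 definitions ... */
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)  ((nc).pfmemalloc)
#else
#define NAPI_HAS_SMALL_PAGE_FRAG 0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
        return NULL;
}
#endif

with __napi_alloc_skb() then reading the flag through the macro:

                data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
                pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);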
On Fri, Sep 23, 2022 at 12:41 AM Paolo Abeni <pabeni@redhat.com> wrote: > > Hello, > > On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote: > [...] > > My suggestion earlier was to just make the 1k cache a page_frag_cache. > > It will allow you to reuse the same structure members and a single > > pointer to track them. Waste would be minimal since the only real > > difference between the structures is about 8B for the structure, and > > odds are the napi_alloc_cache allocation is being rounded up anyway. > > > > > unsigned int skb_count; > > > void *skb_cache[NAPI_SKB_CACHE_SIZE]; > > > }; > > > @@ -143,6 +202,23 @@ struct napi_alloc_cache { > > > static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); > > > static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); > > > > > > +/* Double check that napi_get_frags() allocates skbs with > > > + * skb->head being backed by slab, not a page fragment. > > > + * This is to make sure bug fixed in 3226b158e67c > > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs") > > > + * does not accidentally come back. > > > + */ > > > +void napi_get_frags_check(struct napi_struct *napi) > > > +{ > > > + struct sk_buff *skb; > > > + > > > + local_bh_disable(); > > > + skb = napi_get_frags(napi); > > > + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); > > > + napi_free_frags(napi); > > > + local_bh_enable(); > > > +} > > > + > > > void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) > > > { > > > struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); > > > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > { > > > struct napi_alloc_cache *nc; > > > struct sk_buff *skb; > > > + bool pfmemalloc; > > > > Rather than adding this I think you would be better off adding a > > struct page_frag_cache pointer. I will reference it here as "pfc". > > > > > void *data; > > > > > > DEBUG_NET_WARN_ON_ONCE(!in_softirq()); > > > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > > > /* If requested length is either too small or too big, > > > * we use kmalloc() for skb->head allocation. 
> > > + * When the small frag allocator is available, prefer it over kmalloc > > > + * for small fragments > > > */ > > > - if (len <= SKB_WITH_OVERHEAD(1024) || > > > + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || > > > len > SKB_WITH_OVERHEAD(PAGE_SIZE) || > > > (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { > > > skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, > > > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > } > > > > > > nc = this_cpu_ptr(&napi_alloc_cache); > > > - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > > > - len = SKB_DATA_ALIGN(len); > > > > > > if (sk_memalloc_socks()) > > > gfp_mask |= __GFP_MEMALLOC; > > > > > > - data = page_frag_alloc(&nc->page, len, gfp_mask); > > > + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { > > > > Then here you would add a line that would be: > > pfc = &nc->page_small; > > > > > + /* we are artificially inflating the allocation size, but > > > + * that is not as bad as it may look like, as: > > > + * - 'len' less then GRO_MAX_HEAD makes little sense > > > + * - larger 'len' values lead to fragment size above 512 bytes > > > + * as per NAPI_HAS_SMALL_PAGE_FRAG definition > > > + * - kmalloc would use the kmalloc-1k slab for such values > > > + */ > > > + len = SZ_1K; > > > + > > > + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); > > > + pfmemalloc = nc->page_small.pfmemalloc; > > > > Instead of setting pfmemalloc you could just update the line below. In > > addition you would just be passing pfc as the parameter. > > > > > + } else { > > > > Likewise here you would have the line: > > pfc = &nc->page; > > Probaly I was not clear in my previois email, sorry: before posting > this version I tried locally exactly all the above, and the generated > code with gcc 11.3.1 is a little bigger (a couple of instructions) than > what this version produces (with gcc 11.3.1-2). It has the same number > of conditionals and a slightly larger napi_alloc_cache. > > Additionally the suggested alternative needs more pre-processor > conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case - avoiding > adding a 2nd, unused in that case, page_frag_cache. > > [...] Why would that be? You should still be using the pointer to the page_frag_cache in the standard case. Like I was saying what you are doing is essentially replacing the use of napi_alloc_cache with the page_frag_cache, so for example with the existing setup all references to "nc->page" would become "pfc->" so there shouldn't be any extra unused variables in such a case since it would be used for both the frag allocation and the pfmemalloc check. One alternate way that occured to me to handle this would be to look at possibly having napi_alloc_cache contain an array of page_frag_cache structures. With that approach you could just size the array and have it stick with a size of 1 in the case that small doesn't exist, and support a size of 2 if it does. You could define them via an enum so that the max would vary depending on if you add a small frag cache or not. With that you could just bump the pointer with a ++ so it goes from large to small and you wouldn't have any warnings about items not existing in your structures, and the code with the ++ could be kept from being called in the !NAPI_HAS_SMALL_PAGE_FRAG case. 
> > > > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > return NULL; > > > } > > > > > > - if (nc->page.pfmemalloc) > > > + if (pfmemalloc) > > > > Instead of passing pfmemalloc you could just check pfc->pfmemalloc. > > Alternatively I wonder if it wouldn't be faster to just set the value > > directly based on frag_cache->pfmemalloc and avoid the conditional > > heck entirely. > > Note that: > > skb->pfmemalloc = pfmemalloc; > > avoids a branch but requires more instructions than the current code > (verified with gcc). The gain looks doubtful?!? Additionally we have > statement alike: > > if (<pfmemalloc>) > skb->pfmemalloc = 1; > > in other functions in skbuff.c - still fast-path - and would be better > updating all the places together for consistency - if that is really > considered an improvement. IMHO it should at least land in a different > patch. We cannot really use "skb->pfmemalloc = pfmemalloc" because one is a bitfield and the other is a boolean value. I suspect the complexity would be greatly reduced if we converted the pfmemalloc to a bitfield similar to skb->pfmemalloc. It isn't important though. I was mostly just speculating on possible future optimizations. > I'll post a v3 with your updated email address, but I think the current > code is the better option. That's fine. Like I mentioned I am mostly just trying to think things out and identify any possible gaps we may have missed. I will keep an eye out for the next version.
On Fri, 2022-09-23 at 08:22 -0700, Alexander Duyck wrote: > On Fri, Sep 23, 2022 at 12:41 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > Hello, > > > > On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote: > > [...] > > > My suggestion earlier was to just make the 1k cache a page_frag_cache. > > > It will allow you to reuse the same structure members and a single > > > pointer to track them. Waste would be minimal since the only real > > > difference between the structures is about 8B for the structure, and > > > odds are the napi_alloc_cache allocation is being rounded up anyway. > > > > > > > unsigned int skb_count; > > > > void *skb_cache[NAPI_SKB_CACHE_SIZE]; > > > > }; > > > > @@ -143,6 +202,23 @@ struct napi_alloc_cache { > > > > static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); > > > > static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); > > > > > > > > +/* Double check that napi_get_frags() allocates skbs with > > > > + * skb->head being backed by slab, not a page fragment. > > > > + * This is to make sure bug fixed in 3226b158e67c > > > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs") > > > > + * does not accidentally come back. > > > > + */ > > > > +void napi_get_frags_check(struct napi_struct *napi) > > > > +{ > > > > + struct sk_buff *skb; > > > > + > > > > + local_bh_disable(); > > > > + skb = napi_get_frags(napi); > > > > + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); > > > > + napi_free_frags(napi); > > > > + local_bh_enable(); > > > > +} > > > > + > > > > void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) > > > > { > > > > struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); > > > > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > { > > > > struct napi_alloc_cache *nc; > > > > struct sk_buff *skb; > > > > + bool pfmemalloc; > > > > > > Rather than adding this I think you would be better off adding a > > > struct page_frag_cache pointer. I will reference it here as "pfc". > > > > > > > void *data; > > > > > > > > DEBUG_NET_WARN_ON_ONCE(!in_softirq()); > > > > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > > > > > /* If requested length is either too small or too big, > > > > * we use kmalloc() for skb->head allocation. 
> > > > + * When the small frag allocator is available, prefer it over kmalloc > > > > + * for small fragments > > > > */ > > > > - if (len <= SKB_WITH_OVERHEAD(1024) || > > > > + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || > > > > len > SKB_WITH_OVERHEAD(PAGE_SIZE) || > > > > (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { > > > > skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, > > > > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > } > > > > > > > > nc = this_cpu_ptr(&napi_alloc_cache); > > > > - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); > > > > - len = SKB_DATA_ALIGN(len); > > > > > > > > if (sk_memalloc_socks()) > > > > gfp_mask |= __GFP_MEMALLOC; > > > > > > > > - data = page_frag_alloc(&nc->page, len, gfp_mask); > > > > + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { > > > > > > Then here you would add a line that would be: > > > pfc = &nc->page_small; > > > > > > > + /* we are artificially inflating the allocation size, but > > > > + * that is not as bad as it may look like, as: > > > > + * - 'len' less then GRO_MAX_HEAD makes little sense > > > > + * - larger 'len' values lead to fragment size above 512 bytes > > > > + * as per NAPI_HAS_SMALL_PAGE_FRAG definition > > > > + * - kmalloc would use the kmalloc-1k slab for such values > > > > + */ > > > > + len = SZ_1K; > > > > + > > > > + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); > > > > + pfmemalloc = nc->page_small.pfmemalloc; > > > > > > Instead of setting pfmemalloc you could just update the line below. In > > > addition you would just be passing pfc as the parameter. > > > > > > > + } else { > > > > > > Likewise here you would have the line: > > > pfc = &nc->page; > > > > Probaly I was not clear in my previois email, sorry: before posting > > this version I tried locally exactly all the above, and the generated > > code with gcc 11.3.1 is a little bigger (a couple of instructions) than > > what this version produces (with gcc 11.3.1-2). It has the same number > > of conditionals and a slightly larger napi_alloc_cache. > > > > Additionally the suggested alternative needs more pre-processor > > conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case - avoiding > > adding a 2nd, unused in that case, page_frag_cache. > > > > [...] > > Why would that be? You should still be using the pointer to the > page_frag_cache in the standard case. Like I was saying what you are > doing is essentially replacing the use of napi_alloc_cache with the > page_frag_cache, so for example with the existing setup all references > to "nc->page" would become "pfc->" so there shouldn't be any extra > unused variables in such a case since it would be used for both the > frag allocation and the pfmemalloc check. The problem is that regardless of the NAPI_HAS_SMALL_PAGE_FRAG value, under the branch: if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { gcc sees nc->page_small so we need page_small to exists and be 0 bytes wide for !NAPI_HAS_SMALL_PAGE_FRAG. > One alternate way that occured to me to handle this would be to look > at possibly having napi_alloc_cache contain an array of > page_frag_cache structures. With that approach you could just size the > array and have it stick with a size of 1 in the case that small > doesn't exist, and support a size of 2 if it does. You could define > them via an enum so that the max would vary depending on if you add a > small frag cache or not. 
With that you could just bump the pointer > with a ++ so it goes from large to small and you wouldn't have any > warnings about items not existing in your structures, and the code > with the ++ could be kept from being called in the > !NAPI_HAS_SMALL_PAGE_FRAG case. Yes, the above works nicely from a 'no additional preprocessor conditionals' perspective, and the generated code for __napi_alloc_skb() is 3 bytes shorter than my variant - which boils down to nothing due to alignment - but the most critical path (small allocations) requires more instructions and the diff is IMHO less readable, touching all the other nc->page users. Allocations >1K should be a less relevant code path, as e.g. there is still a ~ 32x truesize underestimation there... > > > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, > > > > return NULL; > > > > } > > > > > > > > - if (nc->page.pfmemalloc) > > > > + if (pfmemalloc) > > > > > > Instead of passing pfmemalloc you could just check pfc->pfmemalloc. > > > Alternatively I wonder if it wouldn't be faster to just set the value > > > directly based on frag_cache->pfmemalloc and avoid the conditional > > > check entirely. > > > > Note that: > > > > skb->pfmemalloc = pfmemalloc; > > > > avoids a branch but requires more instructions than the current code > > (verified with gcc). The gain looks doubtful?!? Additionally we have > > statement alike: > > > > if (<pfmemalloc>) > > skb->pfmemalloc = 1; > > > > in other functions in skbuff.c - still fast-path - and would be better > > updating all the places together for consistency - if that is really > > considered an improvement. IMHO it should at least land in a different > > patch. > > We cannot really use "skb->pfmemalloc = pfmemalloc" because one is a > bitfield and the other is a boolean value. I suspect the complexity > would be greatly reduced if we converted the pfmemalloc to a bitfield > similar to skb->pfmemalloc. It isn't important though. I was mostly > just speculating on possible future optimizations. > > > I'll post a v3 with your updated email address, but I think the current > > code is the better option. > > That's fine. Like I mentioned I am mostly just trying to think things > out and identify any possible gaps we may have missed. I will keep an > eye out for the next version. Yep, let me go ahead with this... Thanks, Paolo
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f42fc871c3b..a1938560192a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); diff --git a/net/core/dev.c b/net/core/dev.c index d66c73c1c734..fa53830d0683 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -static void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f1b8b20fc20b..00340b0cf6eb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -134,8 +134,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we + * can imply such condition checking the double word and MAX_HEADER size + */ +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64) + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 + +/* specializzed page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. 
Constrained to system + * with: + * - a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + * - reasonably high truesize value for napi_get_frags() + * allocation - to avoid memory usage increased compared + * to kalloc, see __napi_alloc_skb() + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else +#define NAPI_HAS_SMALL_PAGE_FRAG 0 + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -143,6 +202,23 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, { struct napi_alloc_cache *nc; struct sk_buff *skb; + bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
+ * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } nc = this_cpu_ptr(&napi_alloc_cache); - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = page_frag_alloc(&nc->page, len, gfp_mask); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less then GRO_MAX_HEAD makes little sense + * - larger 'len' values lead to fragment size above 512 bytes + * as per NAPI_HAS_SMALL_PAGE_FRAG definition + * - kmalloc would use the kmalloc-1k slab for such values + */ + len = SZ_1K; + + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = nc->page_small.pfmemalloc; + } else { + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = nc->page.pfmemalloc; + } + if (unlikely(!data)) return NULL; @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - if (nc->page.pfmemalloc) + if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1;
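For readers following the refcount trick in page_frag_alloc_1k() above: the page is pre-charged with one reference per 1K slice when it is first allocated, so the hot allocation path never touches the refcount again. Roughly, assuming a 4K page:

        page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);  /* refcount == 1 */
        nc->va = page_address(page);
        nc->pfmemalloc = page_is_pfmemalloc(page);
        offset = PAGE_SIZE - SZ_1K;                     /* 3K */
        page_ref_add(page, offset / SZ_1K);             /* refcount == 4 */

        /* The first allocation from a fresh page returns the fragment at
         * offset 3K; the next three calls hand out 2K, 1K and 0 without any
         * atomic operation. Each of the four fragments owns one of the four
         * references and drops it through skb_free_frag() when the skb head
         * is released, so the page is freed only after the last fragment is
         * gone - including pfmemalloc pages, whose state is recorded in
         * nc->pfmemalloc at refill time.
         */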
After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") we are observing 10-20% regressions in performance tests with small packets. The perf trace points to high pressure on the slab allocator.

This change tries to improve the allocation schema for small packets using an idea originally suggested by Eric: a new per-CPU page frag is introduced and used in __napi_alloc_skb to cope with small allocation requests.

To ensure that the above does not lead to excessive truesize underestimation, the frag size for small allocations is inflated to 1K and all the above is restricted to builds with 4K page size.

Note that we accordingly need to update the run-time check introduced with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").

Alex suggested a smart page refcount schema to reduce the number of atomic operations and deal properly with pfmemalloc pages.

Under small packet UDP flood, I measure a 15% peak tput increase.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
- better page_frag_alloc_1k() (Alex & Eric)
- avoid duplicate code and gfp_flags misuse in __napi_alloc_skb() (Alex)
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c | 17 ------
 net/core/skbuff.c | 106 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 102 insertions(+), 22 deletions(-)
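As a usage note, drivers need no changes to benefit: any NAPI receive path that already builds small skbs via napi_alloc_skb() picks up the new 1K frag cache transparently. A hypothetical rx handler, with all driver-side names invented for illustration:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct demo_priv {
        struct napi_struct napi;
        struct net_device *netdev;
};

static void demo_rx_one(struct demo_priv *priv, const void *rx_buf,
                        unsigned int pkt_len)
{
        struct sk_buff *skb;

        /* for short frames skb->head now comes from the per-CPU 1K page
         * frag cache instead of the kmalloc-1k slab
         */
        skb = napi_alloc_skb(&priv->napi, pkt_len);
        if (unlikely(!skb))
                return;

        skb_put_data(skb, rx_buf, pkt_len);
        skb->protocol = eth_type_trans(skb, priv->netdev);
        napi_gro_receive(&priv->napi, skb);
}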