
[net-next,v2] net: skb: introduce and use a single page frag cache

Message ID 162fe40c387cd395d633729fa4f2b5245531514a.1663879752.git.pabeni@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series [net-next,v2] net: skb: introduce and use a single page frag cache

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4377 this patch: 4377
netdev/cc_maintainers warning 1 maintainers not CCed: petrm@nvidia.com
netdev/build_clang success Errors and warnings before: 1059 this patch: 1059
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4566 this patch: 4566
netdev/checkpatch warning WARNING: Comparisons should place the constant on the right side of the test
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni Sept. 22, 2022, 9:01 p.m. UTC
After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
for tiny skbs") we are observing 10-20% regressions in performance
tests with small packets. The perf trace points to high pressure on
the slab allocator.

This change tries to improve the allocation schema for small packets,
using an idea originally suggested by Eric: a new per-CPU page frag is
introduced and used in __napi_alloc_skb() to cope with small allocation
requests.

To ensure that the above does not lead to excessive truesize
underestimation, the frag size for small allocations is inflated to 1K
and all the above is restricted to builds with a 4K page size.

Note that we need to update the run-time check introduced with
commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper") accordingly.

Alex suggested a smart page refcount schema to reduce the number
of atomic operations and deal properly with pfmemalloc pages.

Under small packet UDP flood, I measure a 15% peak tput increase.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
 - better page_frag_alloc_1k() (Alex & Eric)
 - avoid duplicate code and gfp_flags misuse in __napi_alloc_skb() (Alex)
---
 include/linux/netdevice.h |   1 +
 net/core/dev.c            |  17 ------
 net/core/skbuff.c         | 106 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 102 insertions(+), 22 deletions(-)

Comments

Alexander Duyck Sept. 22, 2022, 9:17 p.m. UTC | #1
On Thu, Sep 22, 2022 at 2:01 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> for tiny skbs") we are observing 10-20% regressions in performance
> tests with small packets. The perf trace points to high pressure on
> the slab allocator.
>
> This change tries to improve the allocation schema for small packets
> using an idea originally suggested by Eric: a new per CPU page frag is
> introduced and used in __napi_alloc_skb to cope with small allocation
> requests.
>
> To ensure that the above does not lead to excessive truesize
> underestimation, the frag size for small allocation is inflated to 1K
> and all the above is restricted to build with 4K page size.
>
> Note that we need to update accordingly the run-time check introduced
> with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
>
> Alex suggested a smart page refcount schema to reduce the number
> of atomic operations and deal properly with pfmemalloc pages.
>
> Under small packet UDP flood, I measure a 15% peak tput increases.
>
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>

Please update my email to <alexanderduyck@fb.com>.

> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
>  - better page_frag_alloc_1k() (Alex & Eric)
>  - avoid duplicate code and gfp_flags misuse in __napi_alloc_skb() (Alex)
> ---
>  include/linux/netdevice.h |   1 +
>  net/core/dev.c            |  17 ------
>  net/core/skbuff.c         | 106 ++++++++++++++++++++++++++++++++++++--
>  3 files changed, 102 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 9f42fc871c3b..a1938560192a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
>  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
>  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
>  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> +void napi_get_frags_check(struct napi_struct *napi);
>  gro_result_t napi_gro_frags(struct napi_struct *napi);
>  struct packet_offload *gro_find_receive_by_type(__be16 type);
>  struct packet_offload *gro_find_complete_by_type(__be16 type);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d66c73c1c734..fa53830d0683 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
>  }
>  EXPORT_SYMBOL(dev_set_threaded);
>
> -/* Double check that napi_get_frags() allocates skbs with
> - * skb->head being backed by slab, not a page fragment.
> - * This is to make sure bug fixed in 3226b158e67c
> - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> - * does not accidentally come back.
> - */
> -static void napi_get_frags_check(struct napi_struct *napi)
> -{
> -       struct sk_buff *skb;
> -
> -       local_bh_disable();
> -       skb = napi_get_frags(napi);
> -       WARN_ON_ONCE(skb && skb->head_frag);
> -       napi_free_frags(napi);
> -       local_bh_enable();
> -}
> -
>  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
>                            int (*poll)(struct napi_struct *, int), int weight)
>  {
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f1b8b20fc20b..00340b0cf6eb 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -134,8 +134,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
>  #define NAPI_SKB_CACHE_BULK    16
>  #define NAPI_SKB_CACHE_HALF    (NAPI_SKB_CACHE_SIZE / 2)
>
> +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> + * can infer that condition by checking the double word and MAX_HEADER sizes
> + */
> +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> +
> +#define NAPI_HAS_SMALL_PAGE_FRAG       1
> +
> +/* specialized page frag allocator using a single order 0 page
> + * and slicing it into 1K sized fragments. Constrained to systems
> + * with:
> + * - a very limited amount of 1K fragments fitting a single
> + *   page - to avoid excessive truesize underestimation
> + * - a reasonably high truesize value for napi_get_frags()
> + *   allocations - to avoid increased memory usage compared
> + *   to kmalloc, see __napi_alloc_skb()
> + */
> +
> +struct page_frag_1k {
> +       void *va;
> +       u16 offset;
> +       bool pfmemalloc;
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> +{
> +       struct page *page;
> +       int offset;
> +
> +       offset = nc->offset - SZ_1K;
> +       if (likely(offset >= 0))
> +               goto use_frag;
> +
> +       page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +       if (!page)
> +               return NULL;
> +
> +       nc->va = page_address(page);
> +       nc->pfmemalloc = page_is_pfmemalloc(page);
> +       offset = PAGE_SIZE - SZ_1K;
> +       page_ref_add(page, offset / SZ_1K);
> +
> +use_frag:
> +       nc->offset = offset;
> +       return nc->va + offset;
> +}
> +#else
> +#define NAPI_HAS_SMALL_PAGE_FRAG       0
> +
> +struct page_frag_1k {
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
> +{
> +       return NULL;
> +}
> +
> +#endif
> +
>  struct napi_alloc_cache {
>         struct page_frag_cache page;
> +       struct page_frag_1k page_small;

My suggestion earlier was to just make the 1k cache a page_frag_cache.
It will allow you to reuse the same structure members and a single
pointer to track them. Waste would be minimal since the only real
difference between the structures is about 8B for the structure, and
odds are the napi_alloc_cache allocation is being rounded up anyway.
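
For illustration, a rough sketch of the shape being suggested here - an
assumption about the alternative, not code from this series - would keep
both caches as struct page_frag_cache and track the chosen one through a
single pointer:

	struct napi_alloc_cache {
		struct page_frag_cache page;
		struct page_frag_cache page_small;	/* same type as the regular cache */
		unsigned int skb_count;
		void *skb_cache[NAPI_SKB_CACHE_SIZE];
	};

	/* in __napi_alloc_skb(); this assumes page_frag_alloc_1k() is reworked
	 * to take a struct page_frag_cache * instead of a struct page_frag_1k *
	 */
	struct page_frag_cache *pfc;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		pfc = &nc->page_small;
		len = SZ_1K;
		data = page_frag_alloc_1k(pfc, gfp_mask);
	} else {
		pfc = &nc->page;
		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		len = SKB_DATA_ALIGN(len);
		data = page_frag_alloc(pfc, len, gfp_mask);
	}

	/* the skb is then built from 'data' as in the posted patch */
	if (pfc->pfmemalloc)
		skb->pfmemalloc = 1;

The per-CPU cache would grow by roughly the 8B mentioned above, since
struct page_frag_cache also tracks size and pagecnt_bias.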

>         unsigned int skb_count;
>         void *skb_cache[NAPI_SKB_CACHE_SIZE];
>  };
> @@ -143,6 +202,23 @@ struct napi_alloc_cache {
>  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
>  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
>
> +/* Double check that napi_get_frags() allocates skbs with
> + * skb->head being backed by slab, not a page fragment.
> + * This is to make sure bug fixed in 3226b158e67c
> + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> + * does not accidentally come back.
> + */
> +void napi_get_frags_check(struct napi_struct *napi)
> +{
> +       struct sk_buff *skb;
> +
> +       local_bh_disable();
> +       skb = napi_get_frags(napi);
> +       WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> +       napi_free_frags(napi);
> +       local_bh_enable();
> +}
> +
>  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
>  {
>         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  {
>         struct napi_alloc_cache *nc;
>         struct sk_buff *skb;
> +       bool pfmemalloc;

Rather than adding this I think you would be better off adding a
struct page_frag_cache pointer. I will reference it here as "pfc".

>         void *data;
>
>         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>
>         /* If requested length is either too small or too big,
>          * we use kmalloc() for skb->head allocation.
> +        * When the small frag allocator is available, prefer it over kmalloc
> +        * for small fragments
>          */
> -       if (len <= SKB_WITH_OVERHEAD(1024) ||
> +       if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
>             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
>             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
>                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>         }
>
>         nc = this_cpu_ptr(&napi_alloc_cache);
> -       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -       len = SKB_DATA_ALIGN(len);
>
>         if (sk_memalloc_socks())
>                 gfp_mask |= __GFP_MEMALLOC;
>
> -       data = page_frag_alloc(&nc->page, len, gfp_mask);
> +       if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {

Then here you would add a line that would be:
pfc = &nc->page_small;

> +               /* we are artificially inflating the allocation size, but
> +                * that is not as bad as it may look, as:
> +                * - 'len' less than GRO_MAX_HEAD makes little sense
> +                * - larger 'len' values lead to fragment size above 512 bytes
> +                *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> +                * - kmalloc would use the kmalloc-1k slab for such values
> +                */
> +               len = SZ_1K;
> +
> +               data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> +               pfmemalloc = nc->page_small.pfmemalloc;

Instead of setting pfmemalloc you could just update the line below. In
addition you would just be passing pfc as the parameter.

> +       } else {

Likewise here you would have the line:
pfc = &nc->page;

> +               len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +               len = SKB_DATA_ALIGN(len);
> +
> +               data = page_frag_alloc(&nc->page, len, gfp_mask);
> +               pfmemalloc = nc->page.pfmemalloc;

Again no need for the pfmemalloc and the alloc could just come from pfc

> +       }
> +
>         if (unlikely(!data))
>                 return NULL;
>
> @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>                 return NULL;
>         }
>
> -       if (nc->page.pfmemalloc)
> +       if (pfmemalloc)

Instead of passing pfmemalloc you could just check pfc->pfmemalloc.
Alternatively I wonder if it wouldn't be faster to just set the value
directly based on frag_cache->pfmemalloc and avoid the conditional
check entirely.

>                 skb->pfmemalloc = 1;
>         skb->head_frag = 1;
>
> --
> 2.37.3
>
Paolo Abeni Sept. 23, 2022, 7:41 a.m. UTC | #2
Hello,

On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote:
[...]
> My suggestion earlier was to just make the 1k cache a page_frag_cache.
> It will allow you to reuse the same structure members and a single
> pointer to track them. Waste would be minimal since the only real
> difference between the structures is about 8B for the structure, and
> odds are the napi_alloc_cache allocation is being rounded up anyway.
> 
> >         unsigned int skb_count;
> >         void *skb_cache[NAPI_SKB_CACHE_SIZE];
> >  };
> > @@ -143,6 +202,23 @@ struct napi_alloc_cache {
> >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
> >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
> > 
> > +/* Double check that napi_get_frags() allocates skbs with
> > + * skb->head being backed by slab, not a page fragment.
> > + * This is to make sure bug fixed in 3226b158e67c
> > + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > + * does not accidentally come back.
> > + */
> > +void napi_get_frags_check(struct napi_struct *napi)
> > +{
> > +       struct sk_buff *skb;
> > +
> > +       local_bh_disable();
> > +       skb = napi_get_frags(napi);
> > +       WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> > +       napi_free_frags(napi);
> > +       local_bh_enable();
> > +}
> > +
> >  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
> >  {
> >         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >  {
> >         struct napi_alloc_cache *nc;
> >         struct sk_buff *skb;
> > +       bool pfmemalloc;
> 
> Rather than adding this I think you would be better off adding a
> struct page_frag_cache pointer. I will reference it here as "pfc".
> 
> >         void *data;
> > 
> >         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > 
> >         /* If requested length is either too small or too big,
> >          * we use kmalloc() for skb->head allocation.
> > +        * When the small frag allocator is available, prefer it over kmalloc
> > +        * for small fragments
> >          */
> > -       if (len <= SKB_WITH_OVERHEAD(1024) ||
> > +       if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
> >             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
> >             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
> >                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >         }
> > 
> >         nc = this_cpu_ptr(&napi_alloc_cache);
> > -       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > -       len = SKB_DATA_ALIGN(len);
> > 
> >         if (sk_memalloc_socks())
> >                 gfp_mask |= __GFP_MEMALLOC;
> > 
> > -       data = page_frag_alloc(&nc->page, len, gfp_mask);
> > +       if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> 
> Then here you would add a line that would be:
> pfc = &nc->page_small;
> 
> > +               /* we are artificially inflating the allocation size, but
> > +                * that is not as bad as it may look like, as:
> > +                * - 'len' less then GRO_MAX_HEAD makes little sense
> > +                * - larger 'len' values lead to fragment size above 512 bytes
> > +                *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > +                * - kmalloc would use the kmalloc-1k slab for such values
> > +                */
> > +               len = SZ_1K;
> > +
> > +               data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > +               pfmemalloc = nc->page_small.pfmemalloc;
> 
> Instead of setting pfmemalloc you could just update the line below. In
> addition you would just be passing pfc as the parameter.
> 
> > +       } else {
> 
> Likewise here you would have the line:
> pfc = &nc->page;

Probably I was not clear in my previous email, sorry: before posting
this version I tried exactly all of the above locally, and the generated
code with gcc 11.3.1 is a little bigger (a couple of instructions) than
what this version produces (with gcc 11.3.1-2). It has the same number
of conditionals and a slightly larger napi_alloc_cache.

Additionally, the suggested alternative needs more preprocessor
conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case, to avoid
adding a second page_frag_cache that would be unused in that case.
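
To make the preprocessor point concrete, a sketch of roughly what the
alternative would require in napi_alloc_cache (an assumption about its
shape, not posted code):

	struct napi_alloc_cache {
		struct page_frag_cache page;
	#if NAPI_HAS_SMALL_PAGE_FRAG
		/* a second full page_frag_cache would otherwise be dead weight
		 * in every per-CPU instance when the small allocator is out
		 */
		struct page_frag_cache page_small;
	#endif
		unsigned int skb_count;
		void *skb_cache[NAPI_SKB_CACHE_SIZE];
	};

Every nc->page_small access in __napi_alloc_skb() would then need a
matching guard, while the posted version gets away with an empty
struct page_frag_1k.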

[...]
> 
> > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >                 return NULL;
> >         }
> > 
> > -       if (nc->page.pfmemalloc)
> > +       if (pfmemalloc)
> 
> Instead of passing pfmemalloc you could just check pfc->pfmemalloc.
> Alternatively I wonder if it wouldn't be faster to just set the value
> directly based on frag_cache->pfmemalloc and avoid the conditional
> check entirely.

Note that:

	skb->pfmemalloc = pfmemalloc;

avoids a branch but requires more instructions than the current code
(verified with gcc), so the gain looks doubtful. Additionally, we have
statements like:

	if (<pfmemalloc>)
		skb->pfmemalloc = 1;

in other functions in skbuff.c - still fast-path - and it would be better
to update all the places together for consistency, if that is really
considered an improvement. IMHO it should at least land in a different
patch.
	
I'll post a v3 with your updated email address, but I think the current
code is the better option.

Cheers,

Paolo
kernel test robot Sept. 23, 2022, 9:55 a.m. UTC | #3
Hi Paolo,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/net-skb-introduce-and-use-a-single-page-frag-cache/20220923-050251
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 2b9977470b39e011ee5fbc01ca55411a7768fb9d
config: arc-randconfig-r043-20220922 (https://download.01.org/0day-ci/archive/20220923/202209231747.KOTeKw3E-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/40ceb68029b30e158de46d21f73d0439ab8c2277
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Paolo-Abeni/net-skb-introduce-and-use-a-single-page-frag-cache/20220923-050251
        git checkout 40ceb68029b30e158de46d21f73d0439ab8c2277
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/core/skbuff.c: In function '__napi_alloc_skb':
>> net/core/skbuff.c:677:44: error: 'struct page_frag_1k' has no member named 'pfmemalloc'
     677 |                 pfmemalloc = nc->page_small.pfmemalloc;
         |                                            ^


vim +677 net/core/skbuff.c

   621	
   622	/**
   623	 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
   624	 *	@napi: napi instance this buffer was allocated for
   625	 *	@len: length to allocate
   626	 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
   627	 *
   628	 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
   629	 *	attempt to allocate the head from a special reserved region used
   630	 *	only for NAPI Rx allocation.  By doing this we can save several
   631	 *	CPU cycles by avoiding having to disable and re-enable IRQs.
   632	 *
   633	 *	%NULL is returned if there is no free memory.
   634	 */
   635	struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
   636					 gfp_t gfp_mask)
   637	{
   638		struct napi_alloc_cache *nc;
   639		struct sk_buff *skb;
   640		bool pfmemalloc;
   641		void *data;
   642	
   643		DEBUG_NET_WARN_ON_ONCE(!in_softirq());
   644		len += NET_SKB_PAD + NET_IP_ALIGN;
   645	
   646		/* If requested length is either too small or too big,
   647		 * we use kmalloc() for skb->head allocation.
   648		 * When the small frag allocator is available, prefer it over kmalloc
   649		 * for small fragments
   650		 */
   651		if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
   652		    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
   653		    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
   654			skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
   655					  NUMA_NO_NODE);
   656			if (!skb)
   657				goto skb_fail;
   658			goto skb_success;
   659		}
   660	
   661		nc = this_cpu_ptr(&napi_alloc_cache);
   662	
   663		if (sk_memalloc_socks())
   664			gfp_mask |= __GFP_MEMALLOC;
   665	
   666		if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
   667			/* we are artificially inflating the allocation size, but
   668			 * that is not as bad as it may look like, as:
   669			 * - 'len' less then GRO_MAX_HEAD makes little sense
   670			 * - larger 'len' values lead to fragment size above 512 bytes
   671			 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
   672			 * - kmalloc would use the kmalloc-1k slab for such values
   673			 */
   674			len = SZ_1K;
   675	
   676			data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
 > 677			pfmemalloc = nc->page_small.pfmemalloc;
   678		} else {
   679			len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
   680			len = SKB_DATA_ALIGN(len);
   681	
   682			data = page_frag_alloc(&nc->page, len, gfp_mask);
   683			pfmemalloc = nc->page.pfmemalloc;
   684		}
   685	
   686		if (unlikely(!data))
   687			return NULL;
   688	
   689		skb = __napi_build_skb(data, len);
   690		if (unlikely(!skb)) {
   691			skb_free_frag(data);
   692			return NULL;
   693		}
   694	
   695		if (pfmemalloc)
   696			skb->pfmemalloc = 1;
   697		skb->head_frag = 1;
   698	
   699	skb_success:
   700		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
   701		skb->dev = napi->dev;
   702	
   703	skb_fail:
   704		return skb;
   705	}
   706	EXPORT_SYMBOL(__napi_alloc_skb);
   707
Alexander Duyck Sept. 23, 2022, 3:22 p.m. UTC | #4
On Fri, Sep 23, 2022 at 12:41 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hello,
>
> On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote:
> [...]
> > My suggestion earlier was to just make the 1k cache a page_frag_cache.
> > It will allow you to reuse the same structure members and a single
> > pointer to track them. Waste would be minimal since the only real
> > difference between the structures is about 8B for the structure, and
> > odds are the napi_alloc_cache allocation is being rounded up anyway.
> >
> > >         unsigned int skb_count;
> > >         void *skb_cache[NAPI_SKB_CACHE_SIZE];
> > >  };
> > > @@ -143,6 +202,23 @@ struct napi_alloc_cache {
> > >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
> > >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
> > >
> > > +/* Double check that napi_get_frags() allocates skbs with
> > > + * skb->head being backed by slab, not a page fragment.
> > > + * This is to make sure bug fixed in 3226b158e67c
> > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > > + * does not accidentally come back.
> > > + */
> > > +void napi_get_frags_check(struct napi_struct *napi)
> > > +{
> > > +       struct sk_buff *skb;
> > > +
> > > +       local_bh_disable();
> > > +       skb = napi_get_frags(napi);
> > > +       WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> > > +       napi_free_frags(napi);
> > > +       local_bh_enable();
> > > +}
> > > +
> > >  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
> > >  {
> > >         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> > > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >  {
> > >         struct napi_alloc_cache *nc;
> > >         struct sk_buff *skb;
> > > +       bool pfmemalloc;
> >
> > Rather than adding this I think you would be better off adding a
> > struct page_frag_cache pointer. I will reference it here as "pfc".
> >
> > >         void *data;
> > >
> > >         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> > > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >
> > >         /* If requested length is either too small or too big,
> > >          * we use kmalloc() for skb->head allocation.
> > > +        * When the small frag allocator is available, prefer it over kmalloc
> > > +        * for small fragments
> > >          */
> > > -       if (len <= SKB_WITH_OVERHEAD(1024) ||
> > > +       if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
> > >             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
> > >             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
> > >                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> > > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >         }
> > >
> > >         nc = this_cpu_ptr(&napi_alloc_cache);
> > > -       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > > -       len = SKB_DATA_ALIGN(len);
> > >
> > >         if (sk_memalloc_socks())
> > >                 gfp_mask |= __GFP_MEMALLOC;
> > >
> > > -       data = page_frag_alloc(&nc->page, len, gfp_mask);
> > > +       if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> >
> > Then here you would add a line that would be:
> > pfc = &nc->page_small;
> >
> > > +               /* we are artificially inflating the allocation size, but
> > > +                * that is not as bad as it may look like, as:
> > > +                * - 'len' less then GRO_MAX_HEAD makes little sense
> > > +                * - larger 'len' values lead to fragment size above 512 bytes
> > > +                *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > > +                * - kmalloc would use the kmalloc-1k slab for such values
> > > +                */
> > > +               len = SZ_1K;
> > > +
> > > +               data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > > +               pfmemalloc = nc->page_small.pfmemalloc;
> >
> > Instead of setting pfmemalloc you could just update the line below. In
> > addition you would just be passing pfc as the parameter.
> >
> > > +       } else {
> >
> > Likewise here you would have the line:
> > pfc = &nc->page;
>
> Probaly I was not clear in my previois email, sorry: before posting
> this version I tried locally exactly all the above, and the generated
> code with gcc 11.3.1 is a little bigger (a couple of instructions) than
> what this version produces (with gcc 11.3.1-2). It has the same number
> of conditionals and a slightly larger napi_alloc_cache.
>
> Additionally the suggested alternative needs more pre-processor
> conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case - avoiding
> adding a 2nd, unused in that case, page_frag_cache.
>
> [...]

Why would that be? You should still be using the pointer to the
page_frag_cache in the standard case. Like I was saying, what you are
doing is essentially replacing the use of napi_alloc_cache with the
page_frag_cache, so for example with the existing setup all references
to "nc->page" would become "pfc->", and there shouldn't be any extra
unused variables in such a case since the pointer would be used for both
the frag allocation and the pfmemalloc check.

One alternate way that occurred to me to handle this would be to have
napi_alloc_cache contain an array of page_frag_cache structures. With
that approach you could just size the array so it sticks with a size of
1 in the case that small doesn't exist, and supports a size of 2 if it
does. You could define the indices via an enum so that the max would
vary depending on whether you add a small frag cache or not. With that
you could just bump the pointer with a ++ so it goes from large to
small; you wouldn't have any warnings about items not existing in your
structures, and the code with the ++ could be kept from being called in
the !NAPI_HAS_SMALL_PAGE_FRAG case.
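
A rough sketch of that array/enum layout, purely to illustrate the idea
(the names are invented here, not proposed code):

	enum {
		NAPI_FRAG_CACHE_LARGE,
	#if NAPI_HAS_SMALL_PAGE_FRAG
		NAPI_FRAG_CACHE_SMALL,
	#endif
		NAPI_FRAG_CACHE_MAX
	};

	struct napi_alloc_cache {
		struct page_frag_cache page[NAPI_FRAG_CACHE_MAX];
		unsigned int skb_count;
		void *skb_cache[NAPI_SKB_CACHE_SIZE];
	};

	/* in __napi_alloc_skb(): start at the large cache and bump the
	 * pointer to the small one only when it exists
	 */
	struct page_frag_cache *pfc = nc->page;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024))
		pfc++;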

> >
> > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >                 return NULL;
> > >         }
> > >
> > > -       if (nc->page.pfmemalloc)
> > > +       if (pfmemalloc)
> >
> > Instead of passing pfmemalloc you could just check pfc->pfmemalloc.
> > Alternatively I wonder if it wouldn't be faster to just set the value
> > directly based on frag_cache->pfmemalloc and avoid the conditional
> > check entirely.
>
> Note that:
>
>         skb->pfmemalloc = pfmemalloc;
>
> avoids a branch but requires more instructions than the current code
> (verified with gcc). The gain looks doubtful?!? Additionally we have
> statement alike:
>
>         if (<pfmemalloc>)
>                 skb->pfmemalloc = 1;
>
> in other functions in skbuff.c - still fast-path - and would be better
> updating all the places together for consistency - if that is really
> considered an improvement. IMHO it should at least land in a different
> patch.

We cannot really use "skb->pfmemalloc = pfmemalloc" because one is a
bitfield and the other is a boolean value. I suspect the complexity
would be greatly reduced if we converted the pfmemalloc to a bitfield
similar to skb->pfmemalloc. It isn't important though. I was mostly
just speculating on possible future optimizations.
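
As an illustration, the speculated follow-up would be something along
these lines (an assumption, not part of this series):

	/* in struct page_frag_cache: a one-bit field instead of a bool */
	unsigned int		pfmemalloc:1;

	/* which would let __napi_alloc_skb() copy the flag unconditionally,
	 * with 'pfc' being the chosen cache as discussed above
	 */
	skb->pfmemalloc = pfc->pfmemalloc;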

> I'll post a v3 with your updated email address, but I think the current
> code is the better option.

That's fine. Like I mentioned I am mostly just trying to think things
out and identify any possible gaps we may have missed. I will keep an
eye out for the next version.
Paolo Abeni Sept. 23, 2022, 4:19 p.m. UTC | #5
On Fri, 2022-09-23 at 08:22 -0700, Alexander Duyck wrote:
> On Fri, Sep 23, 2022 at 12:41 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > Hello,
> > 
> > On Thu, 2022-09-22 at 14:17 -0700, Alexander Duyck wrote:
> > [...]
> > > My suggestion earlier was to just make the 1k cache a page_frag_cache.
> > > It will allow you to reuse the same structure members and a single
> > > pointer to track them. Waste would be minimal since the only real
> > > difference between the structures is about 8B for the structure, and
> > > odds are the napi_alloc_cache allocation is being rounded up anyway.
> > > 
> > > >         unsigned int skb_count;
> > > >         void *skb_cache[NAPI_SKB_CACHE_SIZE];
> > > >  };
> > > > @@ -143,6 +202,23 @@ struct napi_alloc_cache {
> > > >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
> > > >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
> > > > 
> > > > +/* Double check that napi_get_frags() allocates skbs with
> > > > + * skb->head being backed by slab, not a page fragment.
> > > > + * This is to make sure bug fixed in 3226b158e67c
> > > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > > > + * does not accidentally come back.
> > > > + */
> > > > +void napi_get_frags_check(struct napi_struct *napi)
> > > > +{
> > > > +       struct sk_buff *skb;
> > > > +
> > > > +       local_bh_disable();
> > > > +       skb = napi_get_frags(napi);
> > > > +       WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> > > > +       napi_free_frags(napi);
> > > > +       local_bh_enable();
> > > > +}
> > > > +
> > > >  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
> > > >  {
> > > >         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> > > > @@ -561,6 +637,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > > >  {
> > > >         struct napi_alloc_cache *nc;
> > > >         struct sk_buff *skb;
> > > > +       bool pfmemalloc;
> > > 
> > > Rather than adding this I think you would be better off adding a
> > > struct page_frag_cache pointer. I will reference it here as "pfc".
> > > 
> > > >         void *data;
> > > > 
> > > >         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> > > > @@ -568,8 +645,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > > > 
> > > >         /* If requested length is either too small or too big,
> > > >          * we use kmalloc() for skb->head allocation.
> > > > +        * When the small frag allocator is available, prefer it over kmalloc
> > > > +        * for small fragments
> > > >          */
> > > > -       if (len <= SKB_WITH_OVERHEAD(1024) ||
> > > > +       if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
> > > >             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
> > > >             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
> > > >                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> > > > @@ -580,13 +659,30 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > > >         }
> > > > 
> > > >         nc = this_cpu_ptr(&napi_alloc_cache);
> > > > -       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > > > -       len = SKB_DATA_ALIGN(len);
> > > > 
> > > >         if (sk_memalloc_socks())
> > > >                 gfp_mask |= __GFP_MEMALLOC;
> > > > 
> > > > -       data = page_frag_alloc(&nc->page, len, gfp_mask);
> > > > +       if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> > > 
> > > Then here you would add a line that would be:
> > > pfc = &nc->page_small;
> > > 
> > > > +               /* we are artificially inflating the allocation size, but
> > > > +                * that is not as bad as it may look like, as:
> > > > +                * - 'len' less then GRO_MAX_HEAD makes little sense
> > > > +                * - larger 'len' values lead to fragment size above 512 bytes
> > > > +                *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > > > +                * - kmalloc would use the kmalloc-1k slab for such values
> > > > +                */
> > > > +               len = SZ_1K;
> > > > +
> > > > +               data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > > > +               pfmemalloc = nc->page_small.pfmemalloc;
> > > 
> > > Instead of setting pfmemalloc you could just update the line below. In
> > > addition you would just be passing pfc as the parameter.
> > > 
> > > > +       } else {
> > > 
> > > Likewise here you would have the line:
> > > pfc = &nc->page;
> > 
> > Probaly I was not clear in my previois email, sorry: before posting
> > this version I tried locally exactly all the above, and the generated
> > code with gcc 11.3.1 is a little bigger (a couple of instructions) than
> > what this version produces (with gcc 11.3.1-2). It has the same number
> > of conditionals and a slightly larger napi_alloc_cache.
> > 
> > Additionally the suggested alternative needs more pre-processor
> > conditionals to handle the !NAPI_HAS_SMALL_PAGE_FRAG case - avoiding
> > adding a 2nd, unused in that case, page_frag_cache.
> > 
> > [...]
> 
> Why would that be? You should still be using the pointer to the
> page_frag_cache in the standard case. Like I was saying what you are
> doing is essentially replacing the use of napi_alloc_cache with the
> page_frag_cache, so for example with the existing setup all references
> to "nc->page" would become "pfc->" so there shouldn't be any extra
> unused variables in such a case since it would be used for both the
> frag allocation and the pfmemalloc check.

The problem is that regardless of the NAPI_HAS_SMALL_PAGE_FRAG value,
under the branch:
	
	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {

gcc still sees nc->page_small, so we need page_small to exist and be 0
bytes wide for !NAPI_HAS_SMALL_PAGE_FRAG.

> One alternate way that occured to me to handle this would be to look
> at possibly having napi_alloc_cache contain an array of
> page_frag_cache structures. With that approach you could just size the
> array and have it stick with a size of 1 in the case that small
> doesn't exist, and support a size of 2 if it does. You could define
> them via an enum so that the max would vary depending on if you add a
> small frag cache or not. With that you could just bump the pointer
> with a ++ so it goes from large to small and you wouldn't have any
> warnings about items not existing in your structures, and the code
> with the ++ could be kept from being called in the
> !NAPI_HAS_SMALL_PAGE_FRAG case.

Yes, the above works nicely from a 'no additional preprocessor
conditionals' perspective, and the generated code for __napi_alloc_skb()
is 3 bytes shorter than my variant - which boils down to nothing due to
alignment - but the most critical path (small allocations) requires
more instructions, and the diff is IMHO less readable, touching all the
other nc->page users.

Allocations >1K should be a less relevant code path, as e.g. there is
still a ~32x truesize underestimation there...

> > > > @@ -596,7 +692,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > > >                 return NULL;
> > > >         }
> > > > 
> > > > -       if (nc->page.pfmemalloc)
> > > > +       if (pfmemalloc)
> > > 
> > > Instead of passing pfmemalloc you could just check pfc->pfmemalloc.
> > > Alternatively I wonder if it wouldn't be faster to just set the value
> > > directly based on frag_cache->pfmemalloc and avoid the conditional
> > > check entirely.
> > 
> > Note that:
> > 
> >         skb->pfmemalloc = pfmemalloc;
> > 
> > avoids a branch but requires more instructions than the current code
> > (verified with gcc). The gain looks doubtful?!? Additionally we have
> > statement alike:
> > 
> >         if (<pfmemalloc>)
> >                 skb->pfmemalloc = 1;
> > 
> > in other functions in skbuff.c - still fast-path - and would be better
> > updating all the places together for consistency - if that is really
> > considered an improvement. IMHO it should at least land in a different
> > patch.
> 
> We cannot really use "skb->pfmemalloc = pfmemalloc" because one is a
> bitfield and the other is a boolean value. I suspect the complexity
> would be greatly reduced if we converted the pfmemalloc to a bitfield
> similar to skb->pfmemalloc. It isn't important though. I was mostly
> just speculating on possible future optimizations.
> 
> > I'll post a v3 with your updated email address, but I think the current
> > code is the better option.
> 
> That's fine. Like I mentioned I am mostly just trying to think things
> out and identify any possible gaps we may have missed. I will keep an
> eye out for the next version.

Yep, let me go ahead with this...

Thanks,

Paolo

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9f42fc871c3b..a1938560192a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3822,6 +3822,7 @@  void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
diff --git a/net/core/dev.c b/net/core/dev.c
index d66c73c1c734..fa53830d0683 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6358,23 +6358,6 @@  int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-static void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f1b8b20fc20b..00340b0cf6eb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -134,8 +134,67 @@  static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
+ * can infer that condition by checking the double word and MAX_HEADER sizes
+ */
+#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
+
+#define NAPI_HAS_SMALL_PAGE_FRAG	1
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragments. Constrained to systems
+ * with:
+ * - a very limited amount of 1K fragments fitting a single
+ *   page - to avoid excessive truesize underestimation
+ * - a reasonably high truesize value for napi_get_frags()
+ *   allocations - to avoid increased memory usage compared
+ *   to kmalloc, see __napi_alloc_skb()
+ */
+
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	offset = nc->offset - SZ_1K;
+	if (likely(offset >= 0))
+		goto use_frag;
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page)
+		return NULL;
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	offset = PAGE_SIZE - SZ_1K;
+	page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+#define NAPI_HAS_SMALL_PAGE_FRAG	0
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -143,6 +202,23 @@  struct napi_alloc_cache {
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -561,6 +637,7 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 {
 	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
+	bool pfmemalloc;
 	void *data;
 
 	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
@@ -568,8 +645,10 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
+	 * When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -580,13 +659,30 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	nc = this_cpu_ptr(&napi_alloc_cache);
-	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	len = SKB_DATA_ALIGN(len);
 
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	data = page_frag_alloc(&nc->page, len, gfp_mask);
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - larger 'len' values lead to fragment size above 512 bytes
+		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 */
+		len = SZ_1K;
+
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = nc->page_small.pfmemalloc;
+	} else {
+		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		len = SKB_DATA_ALIGN(len);
+
+		data = page_frag_alloc(&nc->page, len, gfp_mask);
+		pfmemalloc = nc->page.pfmemalloc;
+	}
+
 	if (unlikely(!data))
 		return NULL;
 
@@ -596,7 +692,7 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 		return NULL;
 	}
 
-	if (nc->page.pfmemalloc)
+	if (pfmemalloc)
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;