
[net-next] net: skb: introduce and use a single page frag cache

Message ID 59a54c9a654fe19cc9fb7da5b2377029d93a181e.1663778475.git.pabeni@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series [net-next] net: skb: introduce and use a single page frag cache

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4377 this patch: 4377
netdev/cc_maintainers warning 1 maintainers not CCed: petrm@nvidia.com
netdev/build_clang success Errors and warnings before: 1059 this patch: 1059
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4566 this patch: 4566
netdev/checkpatch warning WARNING: Comparisons should place the constant on the right side of the test
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni Sept. 21, 2022, 4:41 p.m. UTC
After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
for tiny skbs") we are observing 10-20% regressions in performance
tests with small packets. The perf trace points to high pressure on
the slab allocator.

This change tries to improve the allocation scheme for small packets,
using an idea originally suggested by Eric: a new per-CPU page frag is
introduced and used in __napi_alloc_skb() to cope with small allocation
requests.

To ensure that the above does not lead to excessive truesize
underestimation, the frag size for small allocations is inflated to 1K
and all of the above is restricted to builds with a 4K page size.

Note that we need to update the run-time check introduced with commit
fd9ea57f4e95 ("net: add napi_get_frags_check() helper") accordingly.

Alex suggested a smart page refcount scheme to reduce the number of
atomic operations and deal properly with pfmemalloc pages.

Under a small-packet UDP flood, I measure a 15% peak throughput
increase.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
@Eric, @Alex please let me know if you are comfortable with the
attribution
---
 include/linux/netdevice.h |   1 +
 net/core/dev.c            |  17 ------
 net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 113 insertions(+), 20 deletions(-)
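
For context on the 1K figure above, the truesize math works out roughly as
follows (a sketch assuming the usual 32KB page_frag_cache chunk size; the
"32 x" figure comes from the commit cited in the changelog):

	before 3226b158e67c:  tiny heads came from the shared page_frag_cache
	                      (up to order-3, 32KB), so a ~1KB-truesize skb
	                      could pin up to 32KB of memory   -> ~32x worst case
	with this patch:      small heads come from a dedicated order-0 (4KB)
	                      page sliced into 1KB fragments, so at most four
	                      skbs can pin a single page       -> ~4x worst case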

Comments

Eric Dumazet Sept. 21, 2022, 5:18 p.m. UTC | #1
On Wed, Sep 21, 2022 at 9:42 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> for tiny skbs") we are observing 10-20% regressions in performance
> tests with small packets. The perf trace points to high pressure on
> the slab allocator.
>
> This change tries to improve the allocation schema for small packets
> using an idea originally suggested by Eric: a new per CPU page frag is
> introduced and used in __napi_alloc_skb to cope with small allocation
> requests.
>
> To ensure that the above does not lead to excessive truesize
> underestimation, the frag size for small allocation is inflated to 1K
> and all the above is restricted to build with 4K page size.
>
> Note that we need to update accordingly the run-time check introduced
> with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
>
> Alex suggested a smart page refcount schema to reduce the number
> of atomic operations and deal properly with pfmemalloc pages.
>
> Under small packet UDP flood, I measure a 15% peak tput increases.
>
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> @Eric, @Alex please let me know if you are comfortable with the
> attribution
> ---
>  include/linux/netdevice.h |   1 +
>  net/core/dev.c            |  17 ------
>  net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
>  3 files changed, 113 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 9f42fc871c3b..a1938560192a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
>  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
>  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
>  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> +void napi_get_frags_check(struct napi_struct *napi);
>  gro_result_t napi_gro_frags(struct napi_struct *napi);
>  struct packet_offload *gro_find_receive_by_type(__be16 type);
>  struct packet_offload *gro_find_complete_by_type(__be16 type);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d66c73c1c734..fa53830d0683 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
>  }
>  EXPORT_SYMBOL(dev_set_threaded);
>
> -/* Double check that napi_get_frags() allocates skbs with
> - * skb->head being backed by slab, not a page fragment.
> - * This is to make sure bug fixed in 3226b158e67c
> - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> - * does not accidentally come back.
> - */
> -static void napi_get_frags_check(struct napi_struct *napi)
> -{
> -       struct sk_buff *skb;
> -
> -       local_bh_disable();
> -       skb = napi_get_frags(napi);
> -       WARN_ON_ONCE(skb && skb->head_frag);
> -       napi_free_frags(napi);
> -       local_bh_enable();
> -}
> -
>  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
>                            int (*poll)(struct napi_struct *, int), int weight)
>  {
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f1b8b20fc20b..2be11b487df1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -134,8 +134,73 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
>  #define NAPI_SKB_CACHE_BULK    16
>  #define NAPI_SKB_CACHE_HALF    (NAPI_SKB_CACHE_SIZE / 2)
>
> +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> + * can imply such condition checking the double word and MAX_HEADER size
> + */
> +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> +
> +#define NAPI_HAS_SMALL_PAGE_FRAG 1
> +
> +/* specializzed page frag allocator using a single order 0 page
> + * and slicing it into 1K sized fragment. Constrained to system
> + * with:
> + * - a very limited amount of 1K fragments fitting a single
> + *   page - to avoid excessive truesize underestimation
> + * - reasonably high truesize value for napi_get_frags()
> + *   allocation - to avoid memory usage increased compared
> + *   to kalloc, see __napi_alloc_skb()
> + *
> + */
> +struct page_frag_1k {
> +       void *va;
> +       u16 offset;
> +       bool pfmemalloc;
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> +{
> +       struct page *page;
> +       int offset;
> +
> +       if (likely(nc->va)) {
> +               offset = nc->offset - SZ_1K;
> +               if (likely(offset >= 0))
> +                       goto out;
> +
> +               put_page(virt_to_page(nc->va));

This probably can be removed if the page_ref_add() later is adjusted by one?

We know that for an exact chunk size of 1K, a 4K page is split in 4,
no matter what.
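
Spelling out the reference accounting that adjustment implies (a sketch,
assuming the exact 1K chunk size on a 4K page discussed here):

	alloc_pages_node()                         page refcount = 1
	page_ref_add(page, PAGE_SIZE / SZ_1K - 1)  page refcount = 4
	put_page() x 4, as the skbs consuming
	  the four fragments are freed             page refcount = 0, page freed

So the allocator itself never has to drop a reference: by the time a new page
is needed all four fragments have been handed out, and the last consumer
frees the page.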

> +       }
> +
> +       page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +       if (!page) {
> +               nc->va = NULL;
> +               return NULL;
> +       }
> +
> +       nc->va = page_address(page);
> +       nc->pfmemalloc = page_is_pfmemalloc(page);
> +       page_ref_add(page, PAGE_SIZE / SZ_1K);

page_ref_add(page, PAGE_SIZE / SZ_1K - 1);


> +       offset = PAGE_SIZE - SZ_1K;
> +
> +out:
> +       nc->offset = offset;
> +       return nc->va + offset;
> +}
> +#else
> +#define NAPI_HAS_SMALL_PAGE_FRAG 0
> +
> +struct page_frag_1k {
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
> +{
> +       return NULL;
> +}
> +
> +#endif
> +
>  struct napi_alloc_cache {
>         struct page_frag_cache page;
> +       struct page_frag_1k page_small;
>         unsigned int skb_count;
>         void *skb_cache[NAPI_SKB_CACHE_SIZE];
>  };
> @@ -143,6 +208,23 @@ struct napi_alloc_cache {
>  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
>  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
>
> +/* Double check that napi_get_frags() allocates skbs with
> + * skb->head being backed by slab, not a page fragment.
> + * This is to make sure bug fixed in 3226b158e67c
> + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> + * does not accidentally come back.
> + */
> +void napi_get_frags_check(struct napi_struct *napi)
> +{
> +       struct sk_buff *skb;
> +
> +       local_bh_disable();
> +       skb = napi_get_frags(napi);
> +       WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> +       napi_free_frags(napi);
> +       local_bh_enable();
> +}
> +
>  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
>  {
>         struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> @@ -561,15 +643,39 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  {
>         struct napi_alloc_cache *nc;
>         struct sk_buff *skb;
> +       bool pfmemalloc;
>         void *data;
>
>         DEBUG_NET_WARN_ON_ONCE(!in_softirq());
>         len += NET_SKB_PAD + NET_IP_ALIGN;
>
> +       /* When the small frag allocator is available, prefer it over kmalloc
> +        * for small fragments
> +        */
> +       if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> +               nc = this_cpu_ptr(&napi_alloc_cache);
> +
> +               if (sk_memalloc_socks())
> +                       gfp_mask |= __GFP_MEMALLOC;
> +
> +               /* we are artificially inflating the allocation size, but
> +                * that is not as bad as it may look like, as:
> +                * - 'len' less then GRO_MAX_HEAD makes little sense
> +                * - larger 'len' values lead to fragment size above 512 bytes
> +                *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> +                * - kmalloc would use the kmalloc-1k slab for such values
> +                */
> +               len = SZ_1K;
> +
> +               data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> +               pfmemalloc = nc->page_small.pfmemalloc;
> +               goto check_data;
> +       }
> +
>         /* If requested length is either too small or too big,
>          * we use kmalloc() for skb->head allocation.
>          */
> -       if (len <= SKB_WITH_OVERHEAD(1024) ||
> +       if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
>             len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
>             (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
>                 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> @@ -587,6 +693,9 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>                 gfp_mask |= __GFP_MEMALLOC;
>
>         data = page_frag_alloc(&nc->page, len, gfp_mask);
> +       pfmemalloc = nc->page.pfmemalloc;
> +
> +check_data:
>         if (unlikely(!data))
>                 return NULL;
>
> @@ -596,8 +705,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>                 return NULL;
>         }
>
> -       if (nc->page.pfmemalloc)
> -               skb->pfmemalloc = 1;
> +       if (pfmemalloc)
> +               skb->pfmemalloc = pfmemalloc;
>         skb->head_frag = 1;
>
>  skb_success:
> --
> 2.37.3
>
Paolo Abeni Sept. 21, 2022, 6:10 p.m. UTC | #2
On Wed, 2022-09-21 at 10:18 -0700, Eric Dumazet wrote:
> On Wed, Sep 21, 2022 at 9:42 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> > for tiny skbs") we are observing 10-20% regressions in performance
> > tests with small packets. The perf trace points to high pressure on
> > the slab allocator.
> > 
> > This change tries to improve the allocation schema for small packets
> > using an idea originally suggested by Eric: a new per CPU page frag is
> > introduced and used in __napi_alloc_skb to cope with small allocation
> > requests.
> > 
> > To ensure that the above does not lead to excessive truesize
> > underestimation, the frag size for small allocation is inflated to 1K
> > and all the above is restricted to build with 4K page size.
> > 
> > Note that we need to update accordingly the run-time check introduced
> > with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
> > 
> > Alex suggested a smart page refcount schema to reduce the number
> > of atomic operations and deal properly with pfmemalloc pages.
> > 
> > Under small packet UDP flood, I measure a 15% peak tput increases.
> > 
> > Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> > Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > @Eric, @Alex please let me know if you are comfortable with the
> > attribution
> > ---
> >  include/linux/netdevice.h |   1 +
> >  net/core/dev.c            |  17 ------
> >  net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 113 insertions(+), 20 deletions(-)
> > 
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 9f42fc871c3b..a1938560192a 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
> >  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
> >  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
> >  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> > +void napi_get_frags_check(struct napi_struct *napi);
> >  gro_result_t napi_gro_frags(struct napi_struct *napi);
> >  struct packet_offload *gro_find_receive_by_type(__be16 type);
> >  struct packet_offload *gro_find_complete_by_type(__be16 type);
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index d66c73c1c734..fa53830d0683 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
> >  }
> >  EXPORT_SYMBOL(dev_set_threaded);
> > 
> > -/* Double check that napi_get_frags() allocates skbs with
> > - * skb->head being backed by slab, not a page fragment.
> > - * This is to make sure bug fixed in 3226b158e67c
> > - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > - * does not accidentally come back.
> > - */
> > -static void napi_get_frags_check(struct napi_struct *napi)
> > -{
> > -       struct sk_buff *skb;
> > -
> > -       local_bh_disable();
> > -       skb = napi_get_frags(napi);
> > -       WARN_ON_ONCE(skb && skb->head_frag);
> > -       napi_free_frags(napi);
> > -       local_bh_enable();
> > -}
> > -
> >  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
> >                            int (*poll)(struct napi_struct *, int), int weight)
> >  {
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index f1b8b20fc20b..2be11b487df1 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -134,8 +134,73 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
> >  #define NAPI_SKB_CACHE_BULK    16
> >  #define NAPI_SKB_CACHE_HALF    (NAPI_SKB_CACHE_SIZE / 2)
> > 
> > +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> > + * can imply such condition checking the double word and MAX_HEADER size
> > + */
> > +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> > +
> > +#define NAPI_HAS_SMALL_PAGE_FRAG 1
> > +
> > +/* specializzed page frag allocator using a single order 0 page
> > + * and slicing it into 1K sized fragment. Constrained to system
> > + * with:
> > + * - a very limited amount of 1K fragments fitting a single
> > + *   page - to avoid excessive truesize underestimation
> > + * - reasonably high truesize value for napi_get_frags()
> > + *   allocation - to avoid memory usage increased compared
> > + *   to kalloc, see __napi_alloc_skb()
> > + *
> > + */
> > +struct page_frag_1k {
> > +       void *va;
> > +       u16 offset;
> > +       bool pfmemalloc;
> > +};
> > +
> > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> > +{
> > +       struct page *page;
> > +       int offset;
> > +
> > +       if (likely(nc->va)) {
> > +               offset = nc->offset - SZ_1K;
> > +               if (likely(offset >= 0))
> > +                       goto out;
> > +
> > +               put_page(virt_to_page(nc->va));
> 
> This probably can be removed, if the page_ref_add() later is adjusted by one ?

I think you are right. It looks like we never touch the page after the
last fragment is used. One less atomic operation :) And one less cold
cacheline accessed.

I read the above as you are somewhat ok with the overall size and
number of conditionals in this change, am I guessing too much?

Thanks!

Paolo
Alexander Duyck Sept. 21, 2022, 6:11 p.m. UTC | #3
On Wed, 2022-09-21 at 18:41 +0200, Paolo Abeni wrote:
> After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> for tiny skbs") we are observing 10-20% regressions in performance
> tests with small packets. The perf trace points to high pressure on
> the slab allocator.
> 
> This change tries to improve the allocation schema for small packets
> using an idea originally suggested by Eric: a new per CPU page frag is
> introduced and used in __napi_alloc_skb to cope with small allocation
> requests.
> 
> To ensure that the above does not lead to excessive truesize
> underestimation, the frag size for small allocation is inflated to 1K
> and all the above is restricted to build with 4K page size.
> 
> Note that we need to update accordingly the run-time check introduced
> with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
> 
> Alex suggested a smart page refcount schema to reduce the number
> of atomic operations and deal properly with pfmemalloc pages.
> 
> Under small packet UDP flood, I measure a 15% peak tput increases.
> 
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> @Eric, @Alex please let me know if you are comfortable with the
> attribution
> ---
>  include/linux/netdevice.h |   1 +
>  net/core/dev.c            |  17 ------
>  net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
>  3 files changed, 113 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 9f42fc871c3b..a1938560192a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
>  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
>  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
>  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> +void napi_get_frags_check(struct napi_struct *napi);
>  gro_result_t napi_gro_frags(struct napi_struct *napi);
>  struct packet_offload *gro_find_receive_by_type(__be16 type);
>  struct packet_offload *gro_find_complete_by_type(__be16 type);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d66c73c1c734..fa53830d0683 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
>  }
>  EXPORT_SYMBOL(dev_set_threaded);
>  
> -/* Double check that napi_get_frags() allocates skbs with
> - * skb->head being backed by slab, not a page fragment.
> - * This is to make sure bug fixed in 3226b158e67c
> - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> - * does not accidentally come back.
> - */
> -static void napi_get_frags_check(struct napi_struct *napi)
> -{
> -	struct sk_buff *skb;
> -
> -	local_bh_disable();
> -	skb = napi_get_frags(napi);
> -	WARN_ON_ONCE(skb && skb->head_frag);
> -	napi_free_frags(napi);
> -	local_bh_enable();
> -}
> -
>  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
>  			   int (*poll)(struct napi_struct *, int), int weight)
>  {
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f1b8b20fc20b..2be11b487df1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -134,8 +134,73 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
>  #define NAPI_SKB_CACHE_BULK	16
>  #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
>  
> +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> + * can imply such condition checking the double word and MAX_HEADER size
> + */
> +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> +
> +#define NAPI_HAS_SMALL_PAGE_FRAG 1
> +
> +/* specializzed page frag allocator using a single order 0 page
> + * and slicing it into 1K sized fragment. Constrained to system
> + * with:
> + * - a very limited amount of 1K fragments fitting a single
> + *   page - to avoid excessive truesize underestimation
> + * - reasonably high truesize value for napi_get_frags()
> + *   allocation - to avoid memory usage increased compared
> + *   to kalloc, see __napi_alloc_skb()
> + *
> + */
> +struct page_frag_1k {
> +	void *va;
> +	u16 offset;
> +	bool pfmemalloc;
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> +{
> +	struct page *page;
> +	int offset;
> +
> +	if (likely(nc->va)) {
> +		offset = nc->offset - SZ_1K;
> +		if (likely(offset >= 0))
> +			goto out;
> +
> +		put_page(virt_to_page(nc->va));
> +	}
> +
> +	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +	if (!page) {
> +		nc->va = NULL;
> +		return NULL;
> +	}
> +
> +	nc->va = page_address(page);
> +	nc->pfmemalloc = page_is_pfmemalloc(page);
> +	page_ref_add(page, PAGE_SIZE / SZ_1K);
> +	offset = PAGE_SIZE - SZ_1K;
> +
> +out:
> +	nc->offset = offset;
> +	return nc->va + offset;

So you might be better off organizing this around the offset rather
than the virtual address. As long as offset is 0 you know the page
isn't there and has to be replaced.

	offset = nc->offset - SZ_1K;
	if (offset >= 0)
		goto out;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);
out:
	nc->offset = offset;
	return nc->va + offset;

That will save you from having to call put_page and cleans it up so you
only have to perform 1 conditional check instead of 2 in the fast path.

> +}
> +#else
> +#define NAPI_HAS_SMALL_PAGE_FRAG 0
> +
> +struct page_frag_1k {
> +};
> +
> +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
> +{
> +	return NULL;
> +}
> +
> +#endif
> +

Rather than have this return NULL why not just point it at the
page_frag_alloc?

>  struct napi_alloc_cache {
>  	struct page_frag_cache page;
> +	struct page_frag_1k page_small;
>  	unsigned int skb_count;
>  	void *skb_cache[NAPI_SKB_CACHE_SIZE];
>  };
> @@ -143,6 +208,23 @@ struct napi_alloc_cache {
>  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
>  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
>  
> +/* Double check that napi_get_frags() allocates skbs with
> + * skb->head being backed by slab, not a page fragment.
> + * This is to make sure bug fixed in 3226b158e67c
> + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> + * does not accidentally come back.
> + */
> +void napi_get_frags_check(struct napi_struct *napi)
> +{
> +	struct sk_buff *skb;
> +
> +	local_bh_disable();
> +	skb = napi_get_frags(napi);
> +	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> +	napi_free_frags(napi);
> +	local_bh_enable();
> +}
> +
>  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
>  {
>  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> @@ -561,15 +643,39 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  {
>  	struct napi_alloc_cache *nc;
>  	struct sk_buff *skb;
> +	bool pfmemalloc;
>  	void *data;
>  
>  	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
>  	len += NET_SKB_PAD + NET_IP_ALIGN;
>  
> +	/* When the small frag allocator is available, prefer it over kmalloc
> +	 * for small fragments
> +	 */
> +	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> +		nc = this_cpu_ptr(&napi_alloc_cache);
> +
> +		if (sk_memalloc_socks())
> +			gfp_mask |= __GFP_MEMALLOC;
> +
> +		/* we are artificially inflating the allocation size, but
> +		 * that is not as bad as it may look like, as:
> +		 * - 'len' less then GRO_MAX_HEAD makes little sense
> +		 * - larger 'len' values lead to fragment size above 512 bytes
> +		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> +		 * - kmalloc would use the kmalloc-1k slab for such values
> +		 */
> +		len = SZ_1K;
> +
> +		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> +		pfmemalloc = nc->page_small.pfmemalloc;
> +		goto check_data;
> +	}
> +

It might be better to place this code further down as a branch rather
than having to duplicate things up here such as the __GFP_MEMALLOC
setting.

You could essentially just put the lines getting the napi_alloc_cache
and adding the shared info after the sk_memalloc_socks() check. Then it
could just be an if/else block either calling page_frag_alloc or your
page_frag_alloc_1k.
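
A minimal sketch of that layout (a hypothetical rearrangement of the quoted
patch, names unchanged, with the existing kmalloc()/gfp_mask fallback check
left in place above it):

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* inflated to 1K, as per the comment in the patch */
		len = SZ_1K;
		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = nc->page_small.pfmemalloc;
	} else {
		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		len = SKB_DATA_ALIGN(len);
		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}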

>  	/* If requested length is either too small or too big,
>  	 * we use kmalloc() for skb->head allocation.
>  	 */
> -	if (len <= SKB_WITH_OVERHEAD(1024) ||
> +	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
>  	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
>  	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
>  		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> @@ -587,6 +693,9 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  		gfp_mask |= __GFP_MEMALLOC;
>  
>  	data = page_frag_alloc(&nc->page, len, gfp_mask);
> +	pfmemalloc = nc->page.pfmemalloc;
> +
> +check_data:
>  	if (unlikely(!data))
>  		return NULL;
>  
> @@ -596,8 +705,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
>  		return NULL;
>  	}
>  
> -	if (nc->page.pfmemalloc)
> -		skb->pfmemalloc = 1;
> +	if (pfmemalloc)
> +		skb->pfmemalloc = pfmemalloc;
>  	skb->head_frag = 1;
>  
>  skb_success:

Regarding the pfmemalloc bits, I wonder if it wouldn't be better to have
both caches use struct page_frag_cache and keep a pointer to the one in
use, so skb->pfmemalloc can be populated from frag_cache->pfmemalloc at
the end?
Paolo Abeni Sept. 21, 2022, 7:33 p.m. UTC | #4
On Wed, 2022-09-21 at 11:11 -0700, Alexander H Duyck wrote:
> On Wed, 2022-09-21 at 18:41 +0200, Paolo Abeni wrote:
> > After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> > for tiny skbs") we are observing 10-20% regressions in performance
> > tests with small packets. The perf trace points to high pressure on
> > the slab allocator.
> > 
> > This change tries to improve the allocation schema for small packets
> > using an idea originally suggested by Eric: a new per CPU page frag is
> > introduced and used in __napi_alloc_skb to cope with small allocation
> > requests.
> > 
> > To ensure that the above does not lead to excessive truesize
> > underestimation, the frag size for small allocation is inflated to 1K
> > and all the above is restricted to build with 4K page size.
> > 
> > Note that we need to update accordingly the run-time check introduced
> > with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
> > 
> > Alex suggested a smart page refcount schema to reduce the number
> > of atomic operations and deal properly with pfmemalloc pages.
> > 
> > Under small packet UDP flood, I measure a 15% peak tput increases.
> > 
> > Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> > Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > @Eric, @Alex please let me know if you are comfortable with the
> > attribution
> > ---
> >  include/linux/netdevice.h |   1 +
> >  net/core/dev.c            |  17 ------
> >  net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 113 insertions(+), 20 deletions(-)
> > 
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 9f42fc871c3b..a1938560192a 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
> >  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
> >  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
> >  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> > +void napi_get_frags_check(struct napi_struct *napi);
> >  gro_result_t napi_gro_frags(struct napi_struct *napi);
> >  struct packet_offload *gro_find_receive_by_type(__be16 type);
> >  struct packet_offload *gro_find_complete_by_type(__be16 type);
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index d66c73c1c734..fa53830d0683 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
> >  }
> >  EXPORT_SYMBOL(dev_set_threaded);
> >  
> > -/* Double check that napi_get_frags() allocates skbs with
> > - * skb->head being backed by slab, not a page fragment.
> > - * This is to make sure bug fixed in 3226b158e67c
> > - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > - * does not accidentally come back.
> > - */
> > -static void napi_get_frags_check(struct napi_struct *napi)
> > -{
> > -	struct sk_buff *skb;
> > -
> > -	local_bh_disable();
> > -	skb = napi_get_frags(napi);
> > -	WARN_ON_ONCE(skb && skb->head_frag);
> > -	napi_free_frags(napi);
> > -	local_bh_enable();
> > -}
> > -
> >  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
> >  			   int (*poll)(struct napi_struct *, int), int weight)
> >  {
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index f1b8b20fc20b..2be11b487df1 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -134,8 +134,73 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
> >  #define NAPI_SKB_CACHE_BULK	16
> >  #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
> >  
> > +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> > + * can imply such condition checking the double word and MAX_HEADER size
> > + */
> > +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> > +
> > +#define NAPI_HAS_SMALL_PAGE_FRAG 1
> > +
> > +/* specializzed page frag allocator using a single order 0 page
> > + * and slicing it into 1K sized fragment. Constrained to system
> > + * with:
> > + * - a very limited amount of 1K fragments fitting a single
> > + *   page - to avoid excessive truesize underestimation
> > + * - reasonably high truesize value for napi_get_frags()
> > + *   allocation - to avoid memory usage increased compared
> > + *   to kalloc, see __napi_alloc_skb()
> > + *
> > + */
> > +struct page_frag_1k {
> > +	void *va;
> > +	u16 offset;
> > +	bool pfmemalloc;
> > +};
> > +
> > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> > +{
> > +	struct page *page;
> > +	int offset;
> > +
> > +	if (likely(nc->va)) {
> > +		offset = nc->offset - SZ_1K;
> > +		if (likely(offset >= 0))
> > +			goto out;
> > +
> > +		put_page(virt_to_page(nc->va));
> > +	}
> > +
> > +	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > +	if (!page) {
> > +		nc->va = NULL;
> > +		return NULL;
> > +	}
> > +
> > +	nc->va = page_address(page);
> > +	nc->pfmemalloc = page_is_pfmemalloc(page);
> > +	page_ref_add(page, PAGE_SIZE / SZ_1K);
> > +	offset = PAGE_SIZE - SZ_1K;
> > +
> > +out:
> > +	nc->offset = offset;
> > +	return nc->va + offset;
> 
> So you might be better off organizing this around the offset rather
> than the virtual address. As long as offset is 0 you know the page
> isn't there and has to be replaced.
> 
> 	offset = nc->offset - SZ_1K;
> 	if (offset >= 0)
> 		goto out;
> 
> 	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> 	if (!page)
> 		return NULL;
> 
> 	nc->va = page_address(page);
> 	nc->pfmemalloc = page_is_pfmemalloc(page);
> 	offset = PAGE_SIZE - SZ_1K;
> 	page_ref_add(page, offset / SZ_1K);
> out:
> 	nc->offset = offset;
> 	return nc->va + offset;
> 
> That will save you from having to call put_page and cleans it up so you
> only have to perform 1 conditional check instead of 2 in the fast path.

Nice! I'll use that in v2, with page_ref_add(page, offset / SZ_1K - 1);
or we will leak the page.

> > +}
> > +#else
> > +#define NAPI_HAS_SMALL_PAGE_FRAG 0
> > +
> > +struct page_frag_1k {
> > +};
> > +
> > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
> > +{
> > +	return NULL;
> > +}
> > +
> > +#endif
> > +
> 
> Rather than have this return NULL why not just point it at the
> page_frag_alloc?

When NAPI_HAS_SMALL_PAGE_FRAG is 0, page_frag_alloc_1k() is never used.
The definition is there just to please the compiler. I preferred this
style to avoid more #ifdefs in __napi_alloc_skb().
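
Since NAPI_HAS_SMALL_PAGE_FRAG is a compile-time constant, the only call site
sits behind a condition the compiler folds away, so the stub is dead code
(sketch, condition copied from the patch):

	/* with NAPI_HAS_SMALL_PAGE_FRAG == 0 this whole branch is
	 * compile-time dead and the compiler can drop it entirely
	 */
	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		...
		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		...
	}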

> >  struct napi_alloc_cache {
> >  	struct page_frag_cache page;
> > +	struct page_frag_1k page_small;
> >  	unsigned int skb_count;
> >  	void *skb_cache[NAPI_SKB_CACHE_SIZE];
> >  };
> > @@ -143,6 +208,23 @@ struct napi_alloc_cache {
> >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
> >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
> >  
> > +/* Double check that napi_get_frags() allocates skbs with
> > + * skb->head being backed by slab, not a page fragment.
> > + * This is to make sure bug fixed in 3226b158e67c
> > + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > + * does not accidentally come back.
> > + */
> > +void napi_get_frags_check(struct napi_struct *napi)
> > +{
> > +	struct sk_buff *skb;
> > +
> > +	local_bh_disable();
> > +	skb = napi_get_frags(napi);
> > +	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> > +	napi_free_frags(napi);
> > +	local_bh_enable();
> > +}
> > +
> >  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
> >  {
> >  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> > @@ -561,15 +643,39 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >  {
> >  	struct napi_alloc_cache *nc;
> >  	struct sk_buff *skb;
> > +	bool pfmemalloc;
> >  	void *data;
> >  
> >  	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> >  	len += NET_SKB_PAD + NET_IP_ALIGN;
> >  
> > +	/* When the small frag allocator is available, prefer it over kmalloc
> > +	 * for small fragments
> > +	 */
> > +	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> > +		nc = this_cpu_ptr(&napi_alloc_cache);
> > +
> > +		if (sk_memalloc_socks())
> > +			gfp_mask |= __GFP_MEMALLOC;
> > +
> > +		/* we are artificially inflating the allocation size, but
> > +		 * that is not as bad as it may look like, as:
> > +		 * - 'len' less then GRO_MAX_HEAD makes little sense
> > +		 * - larger 'len' values lead to fragment size above 512 bytes
> > +		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > +		 * - kmalloc would use the kmalloc-1k slab for such values
> > +		 */
> > +		len = SZ_1K;
> > +
> > +		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > +		pfmemalloc = nc->page_small.pfmemalloc;
> > +		goto check_data;
> > +	}
> > +
> 
> It might be better to place this code further down as a branch rather
> than having to duplicate things up here such as the __GFP_MEMALLOC
> setting.
> 
> You could essentially just put the lines getting the napi_alloc_cache
> and adding the shared info after the sk_memalloc_socks() check. Then it
> could just be an if/else block either calling page_frag_alloc or your
> page_frag_alloc_1k.

I thought about that option, but I did not like it much because it adds
a conditional in the fast path for small-size allocations, and the
duplicated code is very little.

I can change the code that way if you have a strong opinion on that.

> >  	/* If requested length is either too small or too big,
> >  	 * we use kmalloc() for skb->head allocation.
> >  	 */
> > -	if (len <= SKB_WITH_OVERHEAD(1024) ||
> > +	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
> >  	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
> >  	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
> >  		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> > @@ -587,6 +693,9 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >  		gfp_mask |= __GFP_MEMALLOC;
> >  
> >  	data = page_frag_alloc(&nc->page, len, gfp_mask);
> > +	pfmemalloc = nc->page.pfmemalloc;
> > +
> > +check_data:
> >  	if (unlikely(!data))
> >  		return NULL;
> >  
> > @@ -596,8 +705,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> >  		return NULL;
> >  	}
> >  
> > -	if (nc->page.pfmemalloc)
> > -		skb->pfmemalloc = 1;
> > +	if (pfmemalloc)
> > +		skb->pfmemalloc = pfmemalloc;
> >  	skb->head_frag = 1;
> >  
> >  skb_success:
> 
> In regards to the pfmemalloc bits I wonder if it wouldn't be better to
> just have them both using the page_frag_cache and just use a pointer to
> that to populate the skb->pfmemalloc based on frag_cache->pfmemalloc at
> the end?

Why? In the end we will still use an ancillary variable, and the
napi_alloc_cache struct will be bigger (probably not very relevant, but
for no gain at all).

Thanks!

Paolo
Paolo Abeni Sept. 21, 2022, 8:21 p.m. UTC | #5
On Wed, 2022-09-21 at 21:33 +0200, Paolo Abeni wrote:
> On Wed, 2022-09-21 at 11:11 -0700, Alexander H Duyck wrote:
[...]
> 
> > >  {
> > >  	struct napi_alloc_cache *nc;
> > >  	struct sk_buff *skb;
> > > +	bool pfmemalloc;
> > >  	void *data;
> > >  
> > >  	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> > >  	len += NET_SKB_PAD + NET_IP_ALIGN;
> > >  
> > > +	/* When the small frag allocator is available, prefer it over kmalloc
> > > +	 * for small fragments
> > > +	 */
> > > +	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> > > +		nc = this_cpu_ptr(&napi_alloc_cache);
> > > +
> > > +		if (sk_memalloc_socks())
> > > +			gfp_mask |= __GFP_MEMALLOC;
> > > +
> > > +		/* we are artificially inflating the allocation size, but
> > > +		 * that is not as bad as it may look like, as:
> > > +		 * - 'len' less then GRO_MAX_HEAD makes little sense
> > > +		 * - larger 'len' values lead to fragment size above 512 bytes
> > > +		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > > +		 * - kmalloc would use the kmalloc-1k slab for such values
> > > +		 */
> > > +		len = SZ_1K;
> > > +
> > > +		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > > +		pfmemalloc = nc->page_small.pfmemalloc;
> > > +		goto check_data;
> > > +	}
> > > +
> > 
> > It might be better to place this code further down as a branch rather
> > than having to duplicate things up here such as the __GFP_MEMALLOC
> > setting.
> > 
> > You could essentially just put the lines getting the napi_alloc_cache
> > and adding the shared info after the sk_memalloc_socks() check. Then it
> > could just be an if/else block either calling page_frag_alloc or your
> > page_frag_alloc_1k.
> 
> I thought about that option, but I did not like it much because adds a
> conditional in the fast-path for small-size allocation, and the
> duplicate code is very little.
> 
> I can change the code that way, if you have strong opinion in that
> regards.

Thinking again about the above, I now believe that what you suggest is
the right thing to do: my patch ignores the requested
__GFP_DIRECT_RECLAIM and GFP_DMA flags for small allocations - we always
need to fall back to kmalloc() when the caller asks for them.

TL;DR: I'll move the page_frag_alloc_1k() call below in v2.
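
Concretely, the fallback check already present in the patch covers those
flags, so it only needs to run before the small-frag branch (condition copied
from the quoted diff; the arrangement itself is a sketch for v2):

	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		/* kmalloc()-backed head via __alloc_skb() */
		...
	}

	/* only then pick between the 1K frag and the generic frag cache */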

Thanks!

Paolo
Alexander Duyck Sept. 21, 2022, 8:23 p.m. UTC | #6
On Wed, 2022-09-21 at 21:33 +0200, Paolo Abeni wrote:
> On Wed, 2022-09-21 at 11:11 -0700, Alexander H Duyck wrote:
> > On Wed, 2022-09-21 at 18:41 +0200, Paolo Abeni wrote:
> > > After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation
> > > for tiny skbs") we are observing 10-20% regressions in performance
> > > tests with small packets. The perf trace points to high pressure on
> > > the slab allocator.
> > > 
> > > This change tries to improve the allocation schema for small packets
> > > using an idea originally suggested by Eric: a new per CPU page frag is
> > > introduced and used in __napi_alloc_skb to cope with small allocation
> > > requests.
> > > 
> > > To ensure that the above does not lead to excessive truesize
> > > underestimation, the frag size for small allocation is inflated to 1K
> > > and all the above is restricted to build with 4K page size.
> > > 
> > > Note that we need to update accordingly the run-time check introduced
> > > with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper").
> > > 
> > > Alex suggested a smart page refcount schema to reduce the number
> > > of atomic operations and deal properly with pfmemalloc pages.
> > > 
> > > Under small packet UDP flood, I measure a 15% peak tput increases.
> > > 
> > > Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> > > Suggested-by: Alexander H Duyck <alexander.duyck@gmail.com>
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > > @Eric, @Alex please let me know if you are comfortable with the
> > > attribution
> > > ---
> > >  include/linux/netdevice.h |   1 +
> > >  net/core/dev.c            |  17 ------
> > >  net/core/skbuff.c         | 115 +++++++++++++++++++++++++++++++++++++-
> > >  3 files changed, 113 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > index 9f42fc871c3b..a1938560192a 100644
> > > --- a/include/linux/netdevice.h
> > > +++ b/include/linux/netdevice.h
> > > @@ -3822,6 +3822,7 @@ void netif_receive_skb_list(struct list_head *head);
> > >  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
> > >  void napi_gro_flush(struct napi_struct *napi, bool flush_old);
> > >  struct sk_buff *napi_get_frags(struct napi_struct *napi);
> > > +void napi_get_frags_check(struct napi_struct *napi);
> > >  gro_result_t napi_gro_frags(struct napi_struct *napi);
> > >  struct packet_offload *gro_find_receive_by_type(__be16 type);
> > >  struct packet_offload *gro_find_complete_by_type(__be16 type);
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index d66c73c1c734..fa53830d0683 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -6358,23 +6358,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
> > >  }
> > >  EXPORT_SYMBOL(dev_set_threaded);
> > >  
> > > -/* Double check that napi_get_frags() allocates skbs with
> > > - * skb->head being backed by slab, not a page fragment.
> > > - * This is to make sure bug fixed in 3226b158e67c
> > > - * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > > - * does not accidentally come back.
> > > - */
> > > -static void napi_get_frags_check(struct napi_struct *napi)
> > > -{
> > > -	struct sk_buff *skb;
> > > -
> > > -	local_bh_disable();
> > > -	skb = napi_get_frags(napi);
> > > -	WARN_ON_ONCE(skb && skb->head_frag);
> > > -	napi_free_frags(napi);
> > > -	local_bh_enable();
> > > -}
> > > -
> > >  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
> > >  			   int (*poll)(struct napi_struct *, int), int weight)
> > >  {
> > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > > index f1b8b20fc20b..2be11b487df1 100644
> > > --- a/net/core/skbuff.c
> > > +++ b/net/core/skbuff.c
> > > @@ -134,8 +134,73 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
> > >  #define NAPI_SKB_CACHE_BULK	16
> > >  #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
> > >  
> > > +/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
> > > + * can imply such condition checking the double word and MAX_HEADER size
> > > + */
> > > +#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
> > > +
> > > +#define NAPI_HAS_SMALL_PAGE_FRAG 1
> > > +
> > > +/* specializzed page frag allocator using a single order 0 page
> > > + * and slicing it into 1K sized fragment. Constrained to system
> > > + * with:
> > > + * - a very limited amount of 1K fragments fitting a single
> > > + *   page - to avoid excessive truesize underestimation
> > > + * - reasonably high truesize value for napi_get_frags()
> > > + *   allocation - to avoid memory usage increased compared
> > > + *   to kalloc, see __napi_alloc_skb()
> > > + *
> > > + */
> > > +struct page_frag_1k {
> > > +	void *va;
> > > +	u16 offset;
> > > +	bool pfmemalloc;
> > > +};
> > > +
> > > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
> > > +{
> > > +	struct page *page;
> > > +	int offset;
> > > +
> > > +	if (likely(nc->va)) {
> > > +		offset = nc->offset - SZ_1K;
> > > +		if (likely(offset >= 0))
> > > +			goto out;
> > > +
> > > +		put_page(virt_to_page(nc->va));
> > > +	}
> > > +
> > > +	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > > +	if (!page) {
> > > +		nc->va = NULL;
> > > +		return NULL;
> > > +	}
> > > +
> > > +	nc->va = page_address(page);
> > > +	nc->pfmemalloc = page_is_pfmemalloc(page);
> > > +	page_ref_add(page, PAGE_SIZE / SZ_1K);
> > > +	offset = PAGE_SIZE - SZ_1K;
> > > +
> > > +out:
> > > +	nc->offset = offset;
> > > +	return nc->va + offset;
> > 
> > So you might be better off organizing this around the offset rather
> > than the virtual address. As long as offset is 0 you know the page
> > isn't there and has to be replaced.
> > 
> > 	offset = nc->offset - SZ_1K;
> > 	if (offset >= 0)
> > 		goto out;
> > 
> > 	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> > 	if (!page)
> > 		return NULL;
> > 
> > 	nc->va = page_address(page);
> > 	nc->pfmemalloc = page_is_pfmemalloc(page);
> > 	offset = PAGE_SIZE - SZ_1K;
> > 	page_ref_add(page, offset / SZ_1K);
> > out:
> > 	nc->offset = offset;
> > 	return nc->va + offset;
> > 
> > That will save you from having to call put_page and cleans it up so you
> > only have to perform 1 conditional check instead of 2 in the fast path.
> 
> Nice! I'll use that in v2, with page_ref_add(page, offset / SZ_1K - 1);
> or we will leak the page.

No, the offset already takes care of the -1 via the "- SZ_1K". What we
are adding is references for the unused offset.
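
In numbers (4K page, 1K chunks):

	offset = PAGE_SIZE - SZ_1K              = 3072
	page_ref_add(page, offset / SZ_1K)      -> +3 references
	reference from alloc_pages_node()       -> +1 reference
	                                        =  4 references, one per fragment

i.e. the "- 1" is already built into using 'offset' instead of PAGE_SIZE.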

> > > +}
> > > +#else
> > > +#define NAPI_HAS_SMALL_PAGE_FRAG 0
> > > +
> > > +struct page_frag_1k {
> > > +};
> > > +
> > > +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
> > > +{
> > > +	return NULL;
> > > +}
> > > +
> > > +#endif
> > > +
> > 
> > Rather than have this return NULL why not just point it at the
> > page_frag_alloc?
> 
> When NAPI_HAS_SMALL_PAGE_FRAG is 0, page_frag_alloc_1k() is never used.
> the definition is there just to please the compiler. I preferred this
> style to avoid more #ifdef in __napi_alloc_skb().
> 

Okay, makes sense I guess.

> > >  struct napi_alloc_cache {
> > >  	struct page_frag_cache page;
> > > +	struct page_frag_1k page_small;
> > >  	unsigned int skb_count;
> > >  	void *skb_cache[NAPI_SKB_CACHE_SIZE];
> > >  };
> > > @@ -143,6 +208,23 @@ struct napi_alloc_cache {
> > >  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
> > >  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
> > >  
> > > +/* Double check that napi_get_frags() allocates skbs with
> > > + * skb->head being backed by slab, not a page fragment.
> > > + * This is to make sure bug fixed in 3226b158e67c
> > > + * ("net: avoid 32 x truesize under-estimation for tiny skbs")
> > > + * does not accidentally come back.
> > > + */
> > > +void napi_get_frags_check(struct napi_struct *napi)
> > > +{
> > > +	struct sk_buff *skb;
> > > +
> > > +	local_bh_disable();
> > > +	skb = napi_get_frags(napi);
> > > +	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
> > > +	napi_free_frags(napi);
> > > +	local_bh_enable();
> > > +}
> > > +
> > >  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
> > >  {
> > >  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> > > @@ -561,15 +643,39 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >  {
> > >  	struct napi_alloc_cache *nc;
> > >  	struct sk_buff *skb;
> > > +	bool pfmemalloc;
> > >  	void *data;
> > >  
> > >  	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
> > >  	len += NET_SKB_PAD + NET_IP_ALIGN;
> > >  
> > > +	/* When the small frag allocator is available, prefer it over kmalloc
> > > +	 * for small fragments
> > > +	 */
> > > +	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
> > > +		nc = this_cpu_ptr(&napi_alloc_cache);
> > > +
> > > +		if (sk_memalloc_socks())
> > > +			gfp_mask |= __GFP_MEMALLOC;
> > > +
> > > +		/* we are artificially inflating the allocation size, but
> > > +		 * that is not as bad as it may look like, as:
> > > +		 * - 'len' less then GRO_MAX_HEAD makes little sense
> > > +		 * - larger 'len' values lead to fragment size above 512 bytes
> > > +		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
> > > +		 * - kmalloc would use the kmalloc-1k slab for such values
> > > +		 */
> > > +		len = SZ_1K;
> > > +
> > > +		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
> > > +		pfmemalloc = nc->page_small.pfmemalloc;
> > > +		goto check_data;
> > > +	}
> > > +
> > 
> > It might be better to place this code further down as a branch rather
> > than having to duplicate things up here such as the __GFP_MEMALLOC
> > setting.
> > 
> > You could essentially just put the lines getting the napi_alloc_cache
> > and adding the shared info after the sk_memalloc_socks() check. Then it
> > could just be an if/else block either calling page_frag_alloc or your
> > page_frag_alloc_1k.
> 
> I thought about that option, but I did not like it much because adds a
> conditional in the fast-path for small-size allocation, and the
> duplicate code is very little.
> 
> I can change the code that way, if you have strong opinion in that
> regards.
> > 

I see, so you are trying to optimize for the smaller packet size.

It occurs to me that with your change you are missing the check on the
gfp_mask reclaim and DMA flags. I think we will need to perform that
check before we can do the direct page allocation based on size.

> > >  	/* If requested length is either too small or too big,
> > >  	 * we use kmalloc() for skb->head allocation.
> > >  	 */
> > > -	if (len <= SKB_WITH_OVERHEAD(1024) ||
> > > +	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
> > >  	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
> > >  	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
> > >  		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
> > > @@ -587,6 +693,9 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >  		gfp_mask |= __GFP_MEMALLOC;
> > >  
> > >  	data = page_frag_alloc(&nc->page, len, gfp_mask);
> > > +	pfmemalloc = nc->page.pfmemalloc;
> > > +
> > > +check_data:
> > >  	if (unlikely(!data))
> > >  		return NULL;
> > >  
> > > @@ -596,8 +705,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
> > >  		return NULL;
> > >  	}
> > >  
> > > -	if (nc->page.pfmemalloc)
> > > -		skb->pfmemalloc = 1;
> > > +	if (pfmemalloc)
> > > +		skb->pfmemalloc = pfmemalloc;
> > >  	skb->head_frag = 1;
> > >  
> > >  skb_success:
> > 
> > In regards to the pfmemalloc bits I wonder if it wouldn't be better to
> > just have them both using the page_frag_cache and just use a pointer to
> > that to populate the skb->pfmemalloc based on frag_cache->pfmemalloc at
> > the end?
> 
> Why? in the end we will still use an ancillary variable and the
> napi_alloc_cache struct will be bigger (probaly not very relevant, but
> for no gain at all).

It was mostly just about reducing instructions. The thought is we could
get rid of the storage of the napi cache entirely since the only thing
used is the page member, so if we just passed that around instead it
would save us the trouble and not really be another variable. Basically
we would be passing a frag cache pointer instead of a napi_alloc_cache.
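
A rough sketch of that idea (hypothetical: it assumes page_small is also
backed by a struct page_frag_cache, and whether it actually shrinks the
generated code is what the rest of the thread measures):

	struct page_frag_cache *frag_cache;
	...
	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		frag_cache = &nc->page_small;
		len = SZ_1K;
		data = page_frag_alloc_1k(frag_cache, gfp_mask);
	} else {
		frag_cache = &nc->page;
		data = page_frag_alloc(frag_cache, len, gfp_mask);
	}
	...
	skb->pfmemalloc = frag_cache->pfmemalloc;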
Paolo Abeni Sept. 21, 2022, 8:51 p.m. UTC | #7
On Wed, 2022-09-21 at 13:23 -0700, Alexander H Duyck wrote:
> On Wed, 2022-09-21 at 21:33 +0200, Paolo Abeni wrote:
> > Nice! I'll use that in v2, with page_ref_add(page, offset / SZ_1K - 1);
> > or we will leak the page.
> 
> No, the offset already takes care of the -1 via the "- SZ_1K". What we
> are adding is references for the unused offset.

You are right. For some reason I keep reading PAGE_SIZE instead of
'offset'.

> > > 
> It occurs to me that I think you are missing the check for the gfp_mask
> and the reclaim and DMA flags values as a result with your change. I
> think we will need to perform that check before we can do the direct
> page allocation based on size.

Yes, the gfp_mask checks are required (it just struck me a few moments
ago ;). I will move the code as you originally suggested.

> > > 
> > Why? in the end we will still use an ancillary variable and the
> > napi_alloc_cache struct will be bigger (probaly not very relevant, but
> > for no gain at all).
> 
> It was mostly just about reducing instructions. The thought is we could
> get rid of the storage of the napi cache entirely since the only thing
> used is the page member, so if we just passed that around instead it
> would save us the trouble and not really be another variable. Basically
> we would be passing a frag cache pointer instead of a napi_alloc_cache.

In that case we will still duplicate a bit of code  -
this_cpu_ptr(&napi_alloc_cache) on both branches. gcc 11.3.1 here says
that the generated code is smaller without this change.

Cheers,

Paolo
Alexander Duyck Sept. 21, 2022, 9:44 p.m. UTC | #8
On Wed, Sep 21, 2022 at 1:52 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Wed, 2022-09-21 at 13:23 -0700, Alexander H Duyck wrote:
> > On Wed, 2022-09-21 at 21:33 +0200, Paolo Abeni wrote:
> > > Nice! I'll use that in v2, with page_ref_add(page, offset / SZ_1K - 1);
> > > or we will leak the page.
> >
> > No, the offset already takes care of the -1 via the "- SZ_1K". What we
> > are adding is references for the unused offset.
>
> You are right. For some reasons I keep reading PAGE_SIZE instead of
> 'offset'.
>
> > > >
> > It occurs to me that I think you are missing the check for the gfp_mask
> > and the reclaim and DMA flags values as a result with your change. I
> > think we will need to perform that check before we can do the direct
> > page allocation based on size.
>
> Yes, the gtp_mask checks are required (it just stuck me a few moments
> ago ;). I will move the code as you originally suggested.
>
> > > >
> > > Why? in the end we will still use an ancillary variable and the
> > > napi_alloc_cache struct will be bigger (probaly not very relevant, but
> > > for no gain at all).
> >
> > It was mostly just about reducing instructions. The thought is we could
> > get rid of the storage of the napi cache entirely since the only thing
> > used is the page member, so if we just passed that around instead it
> > would save us the trouble and not really be another variable. Basically
> > we would be passing a frag cache pointer instead of a napi_alloc_cache.
>
> In that case we will still duplicate a bit of code  -
> this_cpu_ptr(&napi_alloc_cache) on both branches. gcc 11.3.1 here says
> that the generated code is smaller without this change.

Why do you need to duplicate it? I thought you would either be going
with nc->page or nc->page_small depending on the size so either way
you are accessing nc. Once you know you aren't going to be using the
slab cache you could basically fetch that and do the setting of
__GFP_MEMALLOC before you would even need to look at branching based
on the length. The branch on size would then assign the
page_frag_cache pointer, update the length, fetch the page frag, and
then resume the normal path.
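
A rough sketch of the flow described here, reusing the names from the posted
patch (nc, data, pfmemalloc, the skb_fail/skb_success labels); this is only
an illustration of the suggestion, not the actual v2 code:

	/* callers that cannot be served from a page fragment at all keep
	 * using the kmalloc-backed path
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	/* single per-CPU cache fetch and __GFP_MEMALLOC handling, then
	 * branch on the requested length
	 */
	nc = this_cpu_ptr(&napi_alloc_cache);
	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		len = SZ_1K;
		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		/* the empty !NAPI_HAS_SMALL_PAGE_FRAG struct has no
		 * pfmemalloc member, so a small helper or macro would be
		 * needed here to keep that configuration building
		 */
		pfmemalloc = nc->page_small.pfmemalloc;
	} else {
		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		len = SKB_DATA_ALIGN(len);
		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

The remainder of __napi_alloc_skb() (the !data check, __napi_build_skb() and
the pfmemalloc/head_frag marking) would then be shared by both branches.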
Paolo Abeni Sept. 22, 2022, 4:29 p.m. UTC | #9
On Wed, 2022-09-21 at 14:44 -0700, Alexander Duyck wrote:
> On Wed, Sep 21, 2022 at 1:52 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > In that case we will still duplicate a bit of code  -
> > this_cpu_ptr(&napi_alloc_cache) on both branches. gcc 11.3.1 here says
> > that the generated code is smaller without this change.
> 
> Why do you need to duplicate it? 

The goal was to use a single local variable to track the napi cache and
the memory info. I thought ("was sure") that keeping two separate
variables ('nc' and 'page_frag' instead of 'nc' and 'pfmemalloc') would
produce the same amount of code. gcc says I'm wrong and you are right
;)

I'll use that in v2, thanks!

Paolo
Paolo Abeni Sept. 22, 2022, 8:21 p.m. UTC | #10
On Thu, 2022-09-22 at 18:29 +0200, Paolo Abeni wrote:
> On Wed, 2022-09-21 at 14:44 -0700, Alexander Duyck wrote:
> > On Wed, Sep 21, 2022 at 1:52 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > > In that case we will still duplicate a bit of code  -
> > > this_cpu_ptr(&napi_alloc_cache) on both branches. gcc 11.3.1 here says
> > > that the generated code is smaller without this change.
> > 
> > Why do you need to duplicate it? 
> 
> The goal was to use a single local variable to track the napi cache and
> the memory info. I thought ("was sure") that keeping two separate
> variables ('nc' and 'page_frag' instead of 'nc' and 'pfmemalloc') would
> produce the same amount of code. gcc says I'm wrong and you are right
> ;)
> 
> I'll use that in v2, thanks!

I'm sorry for being so noisy lately. I have to take back the above.
Earlier I measured the code size on debug builds; with a non-debug
build, the above schema does not reduce the instruction count.

I'll share the code after some more testing.

Cheers,

Paolo
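
For context before the patch itself, a hypothetical driver RX handler showing
the allocation path this change targets: a small napi_alloc_skb() request
from softirq context which, with the patch applied on a 4K-page system, is
served from the per-CPU 1K fragment cache instead of kmalloc. All the names
below (example_rx, frame, frame_len) are illustrative:

	static void example_rx(struct napi_struct *napi, const void *frame,
			       unsigned int frame_len)
	{
		struct sk_buff *skb;

		/* a small frame_len (e.g. a 64-128 byte UDP packet) hits the
		 * len <= SKB_WITH_OVERHEAD(1024) branch of __napi_alloc_skb()
		 */
		skb = napi_alloc_skb(napi, frame_len);
		if (!skb)
			return;

		skb_put_data(skb, frame, frame_len);
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
	}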
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9f42fc871c3b..a1938560192a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3822,6 +3822,7 @@  void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
diff --git a/net/core/dev.c b/net/core/dev.c
index d66c73c1c734..fa53830d0683 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6358,23 +6358,6 @@  int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-static void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f1b8b20fc20b..2be11b487df1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -134,8 +134,73 @@  static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
+/* the compiler doesn't like 'SKB_TRUESIZE(GRO_MAX_HEAD) > 512', but we
+ * can imply such a condition by checking the double word and MAX_HEADER size
+ */
+#if PAGE_SIZE == SZ_4K && (defined(CONFIG_64BIT) || MAX_HEADER > 64)
+
+#define NAPI_HAS_SMALL_PAGE_FRAG 1
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragments. Constrained to systems
+ * with:
+ * - a very limited amount of 1K fragments fitting a single
+ *   page - to avoid excessive truesize underestimation
+ * - reasonably high truesize values for napi_get_frags()
+ *   allocations - to avoid increased memory usage compared
+ *   to kmalloc, see __napi_alloc_skb()
+ *
+ */
+struct page_frag_1k {
+	void *va;
+	u16 offset;
+	bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+	struct page *page;
+	int offset;
+
+	if (likely(nc->va)) {
+		offset = nc->offset - SZ_1K;
+		if (likely(offset >= 0))
+			goto out;
+
+		put_page(virt_to_page(nc->va));
+	}
+
+	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (!page) {
+		nc->va = NULL;
+		return NULL;
+	}
+
+	nc->va = page_address(page);
+	nc->pfmemalloc = page_is_pfmemalloc(page);
+	page_ref_add(page, PAGE_SIZE / SZ_1K);
+	offset = PAGE_SIZE - SZ_1K;
+
+out:
+	nc->offset = offset;
+	return nc->va + offset;
+}
+#else
+#define NAPI_HAS_SMALL_PAGE_FRAG 0
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
+#endif
+
 struct napi_alloc_cache {
 	struct page_frag_cache page;
+	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -143,6 +208,23 @@  struct napi_alloc_cache {
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -561,15 +643,39 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 {
 	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
+	bool pfmemalloc;
 	void *data;
 
 	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
 	len += NET_SKB_PAD + NET_IP_ALIGN;
 
+	/* When the small frag allocator is available, prefer it over kmalloc
+	 * for small fragments
+	 */
+	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+		nc = this_cpu_ptr(&napi_alloc_cache);
+
+		if (sk_memalloc_socks())
+			gfp_mask |= __GFP_MEMALLOC;
+
+		/* we are artificially inflating the allocation size, but
+		 * that is not as bad as it may look, as:
+		 * - 'len' less than GRO_MAX_HEAD makes little sense
+		 * - larger 'len' values lead to fragment size above 512 bytes
+		 *   as per NAPI_HAS_SMALL_PAGE_FRAG definition
+		 * - kmalloc would use the kmalloc-1k slab for such values
+		 */
+		len = SZ_1K;
+
+		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+		pfmemalloc = nc->page_small.pfmemalloc;
+		goto check_data;
+	}
+
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
 	 */
-	if (len <= SKB_WITH_OVERHEAD(1024) ||
+	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -587,6 +693,9 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 		gfp_mask |= __GFP_MEMALLOC;
 
 	data = page_frag_alloc(&nc->page, len, gfp_mask);
+	pfmemalloc = nc->page.pfmemalloc;
+
+check_data:
 	if (unlikely(!data))
 		return NULL;
 
@@ -596,8 +705,8 @@  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 		return NULL;
 	}
 
-	if (nc->page.pfmemalloc)
-		skb->pfmemalloc = 1;
+	if (pfmemalloc)
+		skb->pfmemalloc = pfmemalloc;
 	skb->head_frag = 1;
 
 skb_success: