@@ -162,6 +162,20 @@ attribute-sets:
type: uint
doc: |
Amount of memory held by inflight pages.
+ -
+ name: item-mem-resident
+ type: uint
+ doc: |
+ Amount of memory actually allocated to track inflight pages,
+ including item blocks that are only partially used. See the
+ fragmentation note under item-mem-used.
+ -
+ name: item-mem-used
+ type: uint
+ doc: |
+ Amount of memory currently used to track inflight pages. The
+ memory fragmentation ratio for item memory can be calculated
+ using item-mem-resident / item-mem-used.
-
name: detach-time
type: uint
@@ -602,6 +616,8 @@ operations:
- detach-time
- dmabuf
- io-uring
+ - item-mem-resident
+ - item-mem-used
dump:
reply: *pp-reply
config-cond: page-pool
@@ -103,6 +103,7 @@ struct page_pool_params {
* @waive: pages obtained from the ptr ring that cannot be added to
* the cache due to a NUMA mismatch
* @item_fast_empty: pre-allocated item cache is empty
+ * @item_slow_failed: failed to allocate memory for a slow-path item block
*/
struct page_pool_alloc_stats {
u64 fast;
@@ -112,6 +113,7 @@ struct page_pool_alloc_stats {
u64 refill;
u64 waive;
u64 item_fast_empty;
+ u64 item_slow_failed;
};
/**
@@ -159,9 +161,16 @@ struct page_pool_item {
struct page_pool_item_block {
struct page_pool *pp;
struct list_head list;
+ unsigned int flags;
+ refcount_t ref;
struct page_pool_item items[];
};
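+/* Cursor into the current slow-path item block: @block points at the
+ * block being carved up and @next_to_use is the index of the next
+ * never-handed-out item in it.
+ */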
+struct page_pool_slow_item {
+ struct page_pool_item_block *block;
+ unsigned int next_to_use;
+};
+
/* Ensure the offset of 'pp' field for both 'page_pool_item_block' and
* 'netmem_item_block' are the same.
*/
@@ -191,6 +200,7 @@ struct page_pool {
int cpuid;
u32 pages_state_hold_cnt;
struct llist_head hold_items;
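+ /* fallback source of items when hold_items runs empty */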
+ struct page_pool_slow_item slow_items;
bool has_init_callback:1; /* slow::init_callback is set */
bool dma_map:1; /* Perform DMA mapping */
@@ -103,6 +103,8 @@ enum {
NETDEV_A_PAGE_POOL_DETACH_TIME,
NETDEV_A_PAGE_POOL_DMABUF,
NETDEV_A_PAGE_POOL_IO_URING,
+ NETDEV_A_PAGE_POOL_ITEM_MEM_RESIDENT,
+ NETDEV_A_PAGE_POOL_ITEM_MEM_USED,
__NETDEV_A_PAGE_POOL_MAX,
NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
@@ -64,6 +64,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
"rx_pp_alloc_refill",
"rx_pp_alloc_waive",
"rx_pp_alloc_item_fast_empty",
+ "rx_pp_alloc_item_slow_failed",
"rx_pp_recycle_cached",
"rx_pp_recycle_cache_full",
"rx_pp_recycle_ring",
@@ -98,6 +99,7 @@ bool page_pool_get_stats(const struct page_pool *pool,
stats->alloc_stats.refill += pool->alloc_stats.refill;
stats->alloc_stats.waive += pool->alloc_stats.waive;
stats->alloc_stats.item_fast_empty += pool->alloc_stats.item_fast_empty;
+ stats->alloc_stats.item_slow_failed += pool->alloc_stats.item_slow_failed;
for_each_possible_cpu(cpu) {
const struct page_pool_recycle_stats *pcpu =
@@ -144,6 +146,7 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
*data++ = pool_stats->alloc_stats.refill;
*data++ = pool_stats->alloc_stats.waive;
*data++ = pool_stats->alloc_stats.item_fast_empty;
+ *data++ = pool_stats->alloc_stats.item_slow_failed;
*data++ = pool_stats->recycle_stats.cached;
*data++ = pool_stats->recycle_stats.cache_full;
*data++ = pool_stats->recycle_stats.ring;
@@ -431,6 +434,7 @@ static void page_pool_item_uninit(struct page_pool *pool)
struct page_pool_item_block,
list);
list_del(&block->list);
+ WARN_ON(refcount_read(&block->ref));
put_page(virt_to_page(block));
}
}
@@ -514,10 +518,42 @@ static struct page_pool_item *page_pool_fast_item_alloc(struct page_pool *pool)
return llist_entry(first, struct page_pool_item, lentry);
}
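+/* Fallback for when the pre-allocated item cache is empty: carve
+ * items out of dynamically allocated, page-sized blocks. Each block
+ * starts with one reference per item; references for items never
+ * handed out are dropped by page_pool_slow_item_drain(), the rest
+ * when the corresponding item is freed.
+ */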
+static struct page_pool_item *page_pool_slow_item_alloc(struct page_pool *pool)
+{
+ if (unlikely(!pool->slow_items.block ||
+ pool->slow_items.next_to_use >= ITEMS_PER_PAGE)) {
+ struct page_pool_item_block *block;
+ struct page *page;
+
+ page = alloc_pages_node(pool->p.nid, GFP_ATOMIC | __GFP_NOWARN |
+ __GFP_ZERO, 0);
+ if (!page) {
+ alloc_stat_inc(pool, item_slow_failed);
+ return NULL;
+ }
+
+ block = page_address(page);
+ block->pp = pool;
+ block->flags |= PAGE_POOL_SLOW_ITEM_BLOCK_BIT;
+ refcount_set(&block->ref, ITEMS_PER_PAGE);
+ pool->slow_items.block = block;
+ pool->slow_items.next_to_use = 0;
+
+ spin_lock_bh(&pool->item_lock);
+ list_add(&block->list, &pool->item_blocks);
+ spin_unlock_bh(&pool->item_lock);
+ }
+
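+ /* no per-item refcount_inc() needed: one reference per item was
+ * taken in bulk when the block was allocated
+ */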
+ return &pool->slow_items.block->items[pool->slow_items.next_to_use++];
+}
+
static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
{
struct page_pool_item *item = page_pool_fast_item_alloc(pool);
+ if (unlikely(!item))
+ item = page_pool_slow_item_alloc(pool);
+
if (likely(item)) {
item->pp_netmem = netmem;
page_pool_item_set_used(item);
@@ -527,6 +563,37 @@ static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
return !!item;
}
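+/* Unlink a slow-path block from pool->item_blocks and return its page
+ * to the page allocator once the last reference to it is gone.
+ */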
+static void __page_pool_slow_item_free(struct page_pool *pool,
+ struct page_pool_item_block *block)
+{
+ spin_lock_bh(&pool->item_lock);
+ list_del(&block->list);
+ spin_unlock_bh(&pool->item_lock);
+
+ put_page(virt_to_page(block));
+}
+
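+/* Called from page_pool_destroy(): drop the references held for the
+ * items in the current block that were never handed out.
+ */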
+static void page_pool_slow_item_drain(struct page_pool *pool)
+{
+ struct page_pool_item_block *block = pool->slow_items.block;
+
+ if (!block || pool->slow_items.next_to_use >= ITEMS_PER_PAGE)
+ return;
+
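+ /* drop the never-used items' references; free the block unless
+ * inflight items still hold theirs
+ */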
+ if (refcount_sub_and_test(ITEMS_PER_PAGE - pool->slow_items.next_to_use,
+ &block->ref))
+ __page_pool_slow_item_free(pool, block);
+}
+
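+/* Drop one item's reference, freeing the block when it was the last one. */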
+static void page_pool_slow_item_free(struct page_pool *pool,
+ struct page_pool_item_block *block)
+{
+ if (likely(!refcount_dec_and_test(&block->ref)))
+ return;
+
+ __page_pool_slow_item_free(pool, block);
+}
+
static void page_pool_fast_item_free(struct page_pool *pool,
struct page_pool_item *item)
{
@@ -536,13 +603,22 @@ static void page_pool_fast_item_free(struct page_pool *pool,
static void page_pool_clear_item_info(struct page_pool *pool, netmem_ref netmem)
{
struct page_pool_item *item = netmem_get_pp_item(netmem);
+ struct page_pool_item_block *block;
DEBUG_NET_WARN_ON_ONCE(item->pp_netmem != netmem);
DEBUG_NET_WARN_ON_ONCE(page_pool_item_is_mapped(item));
DEBUG_NET_WARN_ON_ONCE(!page_pool_item_is_used(item));
page_pool_item_clear_used(item);
netmem_set_pp_item(netmem, NULL);
- page_pool_fast_item_free(pool, item);
+
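+ /* fast-path items are returned to the pre-allocated cache and
+ * their blocks live until pool teardown; slow-path blocks are
+ * refcounted and freed once all their items are released
+ */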
+ block = page_pool_item_to_block(item);
+ if (likely(!(block->flags & PAGE_POOL_SLOW_ITEM_BLOCK_BIT))) {
+ DEBUG_NET_WARN_ON_ONCE(refcount_read(&block->ref));
+ page_pool_fast_item_free(pool, item);
+ return;
+ }
+
+ page_pool_slow_item_free(pool, block);
}
/**
@@ -1390,6 +1466,7 @@ void page_pool_destroy(struct page_pool *pool)
page_pool_disable_direct_recycling(pool);
page_pool_free_frag(pool);
+ page_pool_slow_item_drain(pool);
if (!page_pool_release(pool))
return;
@@ -7,6 +7,8 @@
#include "netmem_priv.h"
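+/* Set in page_pool_item_block::flags for blocks allocated by the slow
+ * path; the lifetime of such blocks is controlled by block->ref.
+ */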
+#define PAGE_POOL_SLOW_ITEM_BLOCK_BIT BIT(0)
+
extern struct mutex page_pools_lock;
s32 page_pool_inflight(const struct page_pool *pool, bool strict);
@@ -33,7 +33,7 @@ DEFINE_MUTEX(page_pools_lock);
* - user.list: unhashed, netdev: unknown
*/
-typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
+typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, struct page_pool *pool,
const struct genl_info *info);
static int
@@ -111,7 +111,7 @@ netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
static int
-page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
+page_pool_nl_stats_fill(struct sk_buff *rsp, struct page_pool *pool,
const struct genl_info *info)
{
#ifdef CONFIG_PAGE_POOL_STATS
@@ -212,8 +212,36 @@ int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
}
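+/* Walk all item blocks to report how much memory is allocated for
+ * tracking inflight pages (resident) and how much of it is in use
+ * (used), letting userspace estimate item memory fragmentation.
+ */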
+static int page_pool_nl_fill_item_mem_info(struct page_pool *pool,
+ struct sk_buff *rsp)
+{
+ struct page_pool_item_block *block;
+ size_t resident = 0, used = 0;
+ int err;
+
+ spin_lock_bh(&pool->item_lock);
+
+ list_for_each_entry(block, &pool->item_blocks, list) {
+ resident += PAGE_SIZE;
+
+ if (block->flags & PAGE_POOL_SLOW_ITEM_BLOCK_BIT)
+ used += (PAGE_SIZE - sizeof(struct page_pool_item) *
+ refcount_read(&block->ref));
+ else
+ used += PAGE_SIZE;
+ }
+
+ spin_unlock_bh(&pool->item_lock);
+
+ err = nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ITEM_MEM_RESIDENT, resident);
+ if (err)
+ return err;
+
+ return nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ITEM_MEM_USED, used);
+}
+
static int
-page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
+page_pool_nl_fill(struct sk_buff *rsp, struct page_pool *pool,
const struct genl_info *info)
{
size_t inflight, refsz;
@@ -251,6 +279,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
goto err_cancel;
+ if (page_pool_nl_fill_item_mem_info(pool, rsp))
+ goto err_cancel;
+
genlmsg_end(rsp, hdr);
return 0;
@@ -259,7 +290,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
return -EMSGSIZE;
}
-static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
+static void netdev_nl_page_pool_event(struct page_pool *pool, u32 cmd)
{
struct genl_info info;
struct sk_buff *ntf;
@@ -16,6 +16,7 @@ struct stat {
struct {
unsigned int cnt;
size_t refs, bytes;
+ size_t item_mem_resident, item_mem_used;
} live[2];
size_t alloc_slow, alloc_fast, recycle_ring, recycle_cache;
@@ -52,6 +53,12 @@ static void count(struct stat *s, unsigned int l,
s->live[l].refs += pp->inflight;
if (pp->_present.inflight_mem)
s->live[l].bytes += pp->inflight_mem;
+
+ if (pp->_present.item_mem_resident)
+ s->live[l].item_mem_resident += pp->item_mem_resident;
+
+ if (pp->_present.item_mem_used)
+ s->live[l].item_mem_used += pp->item_mem_used;
}
int main(int argc, char **argv)
@@ -127,6 +134,10 @@ int main(int argc, char **argv)
s->live[1].refs, s->live[1].bytes,
s->live[0].refs, s->live[0].bytes);
+ printf("\t\titem_mem_resident: %zu item_mem_used: %zu (item_mem_resident: %zu item_mem_used: %zu)\n",
+ s->live[1].item_mem_resident, s->live[1].item_mem_used,
+ s->live[0].item_mem_resident, s->live[0].item_mem_used);
+
/* We don't know how many pages are sitting in cache and ring
* so we will under-count the recycling rate a bit.
*/
Currently a fixed size of pre-allocated memory is used to keep track of the inflight pages, in order to use the DMA API correctly. As mentioned [1], the number of inflight pages can be up to 73203 depending on the use cases. Allocate memory dynamically to keep track of the inflight pages when pre-allocated memory runs out. The overhead of using dynamic memory allocation is about 10ns~ 20ns, which causes 5%~10% performance degradation for the test case of time_bench_page_pool03_slow() in [2]. 1. https://lore.kernel.org/all/b8b7818a-e44b-45f5-91c2-d5eceaa5dd5b@kernel.org/ 2. https://github.com/netoptimizer/prototype-kernel CC: Robin Murphy <robin.murphy@arm.com> CC: Alexander Duyck <alexander.duyck@gmail.com> CC: IOMMU <iommu@lists.linux.dev> Fixes: f71fec47c2df ("page_pool: make sure struct device is stable") Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> --- Documentation/netlink/specs/netdev.yaml | 16 +++++ include/net/page_pool/types.h | 10 ++++ include/uapi/linux/netdev.h | 2 + net/core/page_pool.c | 79 ++++++++++++++++++++++++- net/core/page_pool_priv.h | 2 + net/core/page_pool_user.c | 39 ++++++++++-- tools/net/ynl/samples/page-pool.c | 11 ++++ 7 files changed, 154 insertions(+), 5 deletions(-)