diff mbox series

[net-next,v1,08/16] memory-provider: dmabuf devmem memory provider

Message ID 20231208005250.2910004-9-almasrymina@google.com (mailing list archive)
State New, archived
Headers show
Series Device Memory TCP | expand

Commit Message

Mina Almasry Dec. 8, 2023, 12:52 a.m. UTC
Implement a memory provider that allocates dmabuf devmem page_pool_iovs.

The provider receives a reference to the struct netdev_dmabuf_binding
via the pool->mp_priv pointer. The driver needs to set this pointer for
the provider in the page_pool_params.

The provider obtains a reference on the netdev_dmabuf_binding which
guarantees the binding and the underlying mapping remains alive until
the provider is destroyed.

Usage of PP_FLAG_DMA_MAP is required for this memory provide such that
the page_pool can provide the driver with the dma-addrs of the devmem.

Support for PP_FLAG_DMA_SYNC_DEV is omitted for simplicity.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>

---

v1:
- static_branch check in page_is_page_pool_iov() (Willem & Paolo).
- PP_DEVMEM -> PP_IOV (David).
- Require PP_FLAG_DMA_MAP (Jakub).

---
 include/net/page_pool/helpers.h | 47 +++++++++++++++++
 include/net/page_pool/types.h   |  9 ++++
 net/core/page_pool.c            | 89 ++++++++++++++++++++++++++++++++-
 3 files changed, 144 insertions(+), 1 deletion(-)

Comments

Pavel Begunkov Dec. 8, 2023, 10:48 p.m. UTC | #1
On 12/8/23 00:52, Mina Almasry wrote:
> Implement a memory provider that allocates dmabuf devmem page_pool_iovs.
> 
> The provider receives a reference to the struct netdev_dmabuf_binding
> via the pool->mp_priv pointer. The driver needs to set this pointer for
> the provider in the page_pool_params.
> 
> The provider obtains a reference on the netdev_dmabuf_binding which
> guarantees the binding and the underlying mapping remains alive until
> the provider is destroyed.
> 
> Usage of PP_FLAG_DMA_MAP is required for this memory provide such that
> the page_pool can provide the driver with the dma-addrs of the devmem.
> 
> Support for PP_FLAG_DMA_SYNC_DEV is omitted for simplicity.
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> 
> ---
> 
> v1:
> - static_branch check in page_is_page_pool_iov() (Willem & Paolo).
> - PP_DEVMEM -> PP_IOV (David).
> - Require PP_FLAG_DMA_MAP (Jakub).
> 
> ---
>   include/net/page_pool/helpers.h | 47 +++++++++++++++++
>   include/net/page_pool/types.h   |  9 ++++
>   net/core/page_pool.c            | 89 ++++++++++++++++++++++++++++++++-
>   3 files changed, 144 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
> index 8bfc2d43efd4..00197f14aa87 100644
> --- a/include/net/page_pool/helpers.h
> +++ b/include/net/page_pool/helpers.h
> @@ -53,6 +53,8 @@
>   #define _NET_PAGE_POOL_HELPERS_H
>   
>   #include <net/page_pool/types.h>
> +#include <net/net_debug.h>
> +#include <net/devmem.h>
>   
>   #ifdef CONFIG_PAGE_POOL_STATS
>   /* Deprecated driver-facing API, use netlink instead */
> @@ -92,6 +94,11 @@ static inline unsigned int page_pool_iov_idx(const struct page_pool_iov *ppiov)
>   	return ppiov - page_pool_iov_owner(ppiov)->ppiovs;
>   }
>   
> +static inline u32 page_pool_iov_binding_id(const struct page_pool_iov *ppiov)
> +{
> +	return page_pool_iov_owner(ppiov)->binding->id;
> +}
> +
>   static inline dma_addr_t
>   page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
>   {
> @@ -107,6 +114,46 @@ page_pool_iov_binding(const struct page_pool_iov *ppiov)
>   	return page_pool_iov_owner(ppiov)->binding;
>   }
>   
> +static inline int page_pool_iov_refcount(const struct page_pool_iov *ppiov)
> +{
> +	return refcount_read(&ppiov->refcount);
> +}
> +
> +static inline void page_pool_iov_get_many(struct page_pool_iov *ppiov,
> +					  unsigned int count)
> +{
> +	refcount_add(count, &ppiov->refcount);
> +}
> +
> +void __page_pool_iov_free(struct page_pool_iov *ppiov);
> +
> +static inline void page_pool_iov_put_many(struct page_pool_iov *ppiov,
> +					  unsigned int count)
> +{
> +	if (!refcount_sub_and_test(count, &ppiov->refcount))
> +		return;
> +
> +	__page_pool_iov_free(ppiov);
> +}
> +
> +/* page pool mm helpers */
> +
> +DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
> +static inline bool page_is_page_pool_iov(const struct page *page)
> +{
> +	return static_branch_unlikely(&page_pool_mem_providers) &&
> +	       (unsigned long)page & PP_IOV;
> +}
> +
> +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> +{
> +	if (page_is_page_pool_iov(page))
> +		return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> +
> +	DEBUG_NET_WARN_ON_ONCE(true);
> +	return NULL;
> +}
> +
>   /**
>    * page_pool_dev_alloc_pages() - allocate a page.
>    * @pool:	pool from which to allocate
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index 44faee7a7b02..136930a238de 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -134,8 +134,15 @@ struct memory_provider_ops {
>   	bool (*release_page)(struct page_pool *pool, struct page *page);
>   };
>   
> +extern const struct memory_provider_ops dmabuf_devmem_ops;
> +
>   /* page_pool_iov support */
>   
> +/*  We overload the LSB of the struct page pointer to indicate whether it's
> + *  a page or page_pool_iov.
> + */
> +#define PP_IOV 0x01UL
> +
>   /* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
>    * entry from the dmabuf is inserted into the genpool as a chunk, and needs
>    * this owner struct to keep track of some metadata necessary to create
> @@ -159,6 +166,8 @@ struct page_pool_iov {
>   	struct dmabuf_genpool_chunk_owner *owner;
>   
>   	refcount_t refcount;
> +
> +	struct page_pool *pp;
>   };
>   
>   struct page_pool {
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index f5c84d2a4510..423c88564a00 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -12,6 +12,7 @@
>   
>   #include <net/page_pool/helpers.h>
>   #include <net/xdp.h>
> +#include <net/netdev_rx_queue.h>
>   
>   #include <linux/dma-direction.h>
>   #include <linux/dma-mapping.h>
> @@ -20,12 +21,15 @@
>   #include <linux/poison.h>
>   #include <linux/ethtool.h>
>   #include <linux/netdevice.h>
> +#include <linux/genalloc.h>
> +#include <net/devmem.h>
>   
>   #include <trace/events/page_pool.h>
>   
>   #include "page_pool_priv.h"
>   
> -static DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
> +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
> +EXPORT_SYMBOL(page_pool_mem_providers);
>   
>   #define DEFER_TIME (msecs_to_jiffies(1000))
>   #define DEFER_WARN_INTERVAL (60 * HZ)
> @@ -175,6 +179,7 @@ static void page_pool_producer_unlock(struct page_pool *pool,
>   static int page_pool_init(struct page_pool *pool,
>   			  const struct page_pool_params *params)
>   {
> +	struct netdev_dmabuf_binding *binding = NULL;
>   	unsigned int ring_qsize = 1024; /* Default */
>   	int err;
>   
> @@ -237,6 +242,14 @@ static int page_pool_init(struct page_pool *pool,
>   	/* Driver calling page_pool_create() also call page_pool_destroy() */
>   	refcount_set(&pool->user_cnt, 1);
>   
> +	if (pool->p.queue)
> +		binding = READ_ONCE(pool->p.queue->binding);
> +
> +	if (binding) {
> +		pool->mp_ops = &dmabuf_devmem_ops;
> +		pool->mp_priv = binding;
> +	}

Hmm, I don't understand why would we replace a nice transparent
api with page pool relying on a queue having devmem specific
pointer? It seemed more flexible and cleaner in the last RFC.

> +
>   	if (pool->mp_ops) {
>   		err = pool->mp_ops->init(pool);
>   		if (err) {
> @@ -1020,3 +1033,77 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
>   	}
>   }
>   EXPORT_SYMBOL(page_pool_update_nid);
> +
> +void __page_pool_iov_free(struct page_pool_iov *ppiov)
> +{
> +	if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
> +		return;
> +
> +	netdev_free_dmabuf(ppiov);
> +}
> +EXPORT_SYMBOL_GPL(__page_pool_iov_free);

I didn't look too deep but I don't think I immediately follow
the pp refcounting. It increments pages_state_hold_cnt on
allocation, but IIUC doesn't mark skbs for recycle? Then, they all
will be put down via page_pool_iov_put_many() bypassing
page_pool_return_page() and friends. That will call
netdev_free_dmabuf(), which doesn't bump pages_state_release_cnt.

At least I couldn't make it work with io_uring, and for my purposes,
I forced all puts to go through page_pool_return_page(), which calls
the ->release_page callback. The callback will put the reference and
ask its page pool to account release_cnt. It also gets rid of
__page_pool_iov_free(), as we'd need to add a hook there for
customization otherwise.

I didn't care about overhead because the hot path for me is getting
buffers from a ring, which is somewhat analogous to sock_devmem_dontneed(),
but done on pp allocations under napi, and it's done separately.

Completely untested with TCP devmem:

https://github.com/isilence/linux/commit/14bd56605183dc80b540999e8058c79ac92ae2d8

> +
> +/*** "Dmabuf devmem memory provider" ***/
> +
> +static int mp_dmabuf_devmem_init(struct page_pool *pool)
> +{
> +	struct netdev_dmabuf_binding *binding = pool->mp_priv;
> +
> +	if (!binding)
> +		return -EINVAL;
> +
> +	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
> +		return -EOPNOTSUPP;
> +
> +	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
> +		return -EOPNOTSUPP;
> +
> +	netdev_dmabuf_binding_get(binding);
> +	return 0;
> +}
> +
> +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
> +						 gfp_t gfp)
> +{
> +	struct netdev_dmabuf_binding *binding = pool->mp_priv;
> +	struct page_pool_iov *ppiov;
> +
> +	ppiov = netdev_alloc_dmabuf(binding);
> +	if (!ppiov)
> +		return NULL;
> +
> +	ppiov->pp = pool;
> +	pool->pages_state_hold_cnt++;
> +	trace_page_pool_state_hold(pool, (struct page *)ppiov,
> +				   pool->pages_state_hold_cnt);
> +	return (struct page *)((unsigned long)ppiov | PP_IOV);
> +}
> +
> +static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
> +{
> +	struct netdev_dmabuf_binding *binding = pool->mp_priv;
> +
> +	netdev_dmabuf_binding_put(binding);
> +}
> +
> +static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
> +					  struct page *page)
> +{
> +	struct page_pool_iov *ppiov;
> +
> +	if (WARN_ON_ONCE(!page_is_page_pool_iov(page)))
> +		return false;
> +
> +	ppiov = page_to_page_pool_iov(page);
> +	page_pool_iov_put_many(ppiov, 1);
> +	/* We don't want the page pool put_page()ing our page_pool_iovs. */
> +	return false;
> +}
> +
> +const struct memory_provider_ops dmabuf_devmem_ops = {
> +	.init			= mp_dmabuf_devmem_init,
> +	.destroy		= mp_dmabuf_devmem_destroy,
> +	.alloc_pages		= mp_dmabuf_devmem_alloc_pages,
> +	.release_page		= mp_dmabuf_devmem_release_page,
> +};
> +EXPORT_SYMBOL(dmabuf_devmem_ops);
Pavel Begunkov Dec. 8, 2023, 11:05 p.m. UTC | #2
On 12/8/23 00:52, Mina Almasry wrote:
> Implement a memory provider that allocates dmabuf devmem page_pool_iovs.
> 
> The provider receives a reference to the struct netdev_dmabuf_binding
> via the pool->mp_priv pointer. The driver needs to set this pointer for
> the provider in the page_pool_params.
> 
> The provider obtains a reference on the netdev_dmabuf_binding which
> guarantees the binding and the underlying mapping remains alive until
> the provider is destroyed.
> 
> Usage of PP_FLAG_DMA_MAP is required for this memory provide such that
> the page_pool can provide the driver with the dma-addrs of the devmem.
> 
> Support for PP_FLAG_DMA_SYNC_DEV is omitted for simplicity.
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
[...]
> +void __page_pool_iov_free(struct page_pool_iov *ppiov);
> +
> +static inline void page_pool_iov_put_many(struct page_pool_iov *ppiov,
> +					  unsigned int count)
> +{
> +	if (!refcount_sub_and_test(count, &ppiov->refcount))
> +		return;
> +
> +	__page_pool_iov_free(ppiov);
> +}
> +
> +/* page pool mm helpers */
> +
> +DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
> +static inline bool page_is_page_pool_iov(const struct page *page)
> +{
> +	return static_branch_unlikely(&page_pool_mem_providers) &&
> +	       (unsigned long)page & PP_IOV;

Are there any recommendations of not using static keys in widely
used inline functions? I'm not familiar with static key code
generation, but I think the compiler will bloat users with fat chunks
of code in unlikely paths. And I'd assume it creates an array of all
uses, which it'll be walked on enabling/disabling the branch.

> +}
> +
> +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> +{
> +	if (page_is_page_pool_iov(page))
> +		return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> +
> +	DEBUG_NET_WARN_ON_ONCE(true);
> +	return NULL;
> +}
> +
>   /**
>    * page_pool_dev_alloc_pages() - allocate a page.
>    * @pool:	pool from which to allocate
Mina Almasry Dec. 8, 2023, 11:25 p.m. UTC | #3
On Fri, Dec 8, 2023 at 2:56 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 12/8/23 00:52, Mina Almasry wrote:
...
> > +     if (pool->p.queue)
> > +             binding = READ_ONCE(pool->p.queue->binding);
> > +
> > +     if (binding) {
> > +             pool->mp_ops = &dmabuf_devmem_ops;
> > +             pool->mp_priv = binding;
> > +     }
>
> Hmm, I don't understand why would we replace a nice transparent
> api with page pool relying on a queue having devmem specific
> pointer? It seemed more flexible and cleaner in the last RFC.
>

Jakub requested this change and may chime in, but I suspect it's to
further abstract the devmem changes from driver. In this iteration,
the driver grabs the netdev_rx_queue and passes it to the page_pool,
and any future configurations between the net stack and page_pool can
be passed this way with the driver unbothered.

> > +
> >       if (pool->mp_ops) {
> >               err = pool->mp_ops->init(pool);
> >               if (err) {
> > @@ -1020,3 +1033,77 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
> >       }
> >   }
> >   EXPORT_SYMBOL(page_pool_update_nid);
> > +
> > +void __page_pool_iov_free(struct page_pool_iov *ppiov)
> > +{
> > +     if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
> > +             return;
> > +
> > +     netdev_free_dmabuf(ppiov);
> > +}
> > +EXPORT_SYMBOL_GPL(__page_pool_iov_free);
>
> I didn't look too deep but I don't think I immediately follow
> the pp refcounting. It increments pages_state_hold_cnt on
> allocation, but IIUC doesn't mark skbs for recycle? Then, they all
> will be put down via page_pool_iov_put_many() bypassing
> page_pool_return_page() and friends. That will call
> netdev_free_dmabuf(), which doesn't bump pages_state_release_cnt.
>
> At least I couldn't make it work with io_uring, and for my purposes,
> I forced all puts to go through page_pool_return_page(), which calls
> the ->release_page callback. The callback will put the reference and
> ask its page pool to account release_cnt. It also gets rid of
> __page_pool_iov_free(), as we'd need to add a hook there for
> customization otherwise.
>
> I didn't care about overhead because the hot path for me is getting
> buffers from a ring, which is somewhat analogous to sock_devmem_dontneed(),
> but done on pp allocations under napi, and it's done separately.
>
> Completely untested with TCP devmem:
>
> https://github.com/isilence/linux/commit/14bd56605183dc80b540999e8058c79ac92ae2d8
>

This was a mistake in the last RFC, which should be fixed in v1. In
the RFC I was not marking the skbs as skb_mark_for_recycle(), so the
unreffing path wasn't as expected.

In this iteration, that should be completely fixed. I suspect since I
just posted this you're actually referring to the issue tested on the
last RFC? Correct me if wrong.

In this iteration, the reffing story:

- memory provider allocs ppiov and returns it to the page pool with
ppiov->refcount == 1.
- The page_pool gives the page to the driver. The driver may
obtain/release references with page_pool_page_[get|put]_many(), but
the driver is likely not doing that unless it's doing its own page
recycling.
- The net stack obtains references via skb_frag_ref() ->
page_pool_page_get_many()
- The net stack drops references via skb_frag_unref() ->
napi_pp_put_page() -> page_pool_return_page() and friends.

Thus, the issue where the unref path was skipping
page_pool_return_page() and friends should be resolved in this
iteration, let me know if you think otherwise, but I think this was an
issue limited to the last RFC.

> > +
> > +/*** "Dmabuf devmem memory provider" ***/
> > +
> > +static int mp_dmabuf_devmem_init(struct page_pool *pool)
> > +{
> > +     struct netdev_dmabuf_binding *binding = pool->mp_priv;
> > +
> > +     if (!binding)
> > +             return -EINVAL;
> > +
> > +     if (!(pool->p.flags & PP_FLAG_DMA_MAP))
> > +             return -EOPNOTSUPP;
> > +
> > +     if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
> > +             return -EOPNOTSUPP;
> > +
> > +     netdev_dmabuf_binding_get(binding);
> > +     return 0;
> > +}
> > +
> > +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
> > +                                              gfp_t gfp)
> > +{
> > +     struct netdev_dmabuf_binding *binding = pool->mp_priv;
> > +     struct page_pool_iov *ppiov;
> > +
> > +     ppiov = netdev_alloc_dmabuf(binding);
> > +     if (!ppiov)
> > +             return NULL;
> > +
> > +     ppiov->pp = pool;
> > +     pool->pages_state_hold_cnt++;
> > +     trace_page_pool_state_hold(pool, (struct page *)ppiov,
> > +                                pool->pages_state_hold_cnt);
> > +     return (struct page *)((unsigned long)ppiov | PP_IOV);
> > +}
> > +
> > +static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
> > +{
> > +     struct netdev_dmabuf_binding *binding = pool->mp_priv;
> > +
> > +     netdev_dmabuf_binding_put(binding);
> > +}
> > +
> > +static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
> > +                                       struct page *page)
> > +{
> > +     struct page_pool_iov *ppiov;
> > +
> > +     if (WARN_ON_ONCE(!page_is_page_pool_iov(page)))
> > +             return false;
> > +
> > +     ppiov = page_to_page_pool_iov(page);
> > +     page_pool_iov_put_many(ppiov, 1);
> > +     /* We don't want the page pool put_page()ing our page_pool_iovs. */
> > +     return false;
> > +}
> > +
> > +const struct memory_provider_ops dmabuf_devmem_ops = {
> > +     .init                   = mp_dmabuf_devmem_init,
> > +     .destroy                = mp_dmabuf_devmem_destroy,
> > +     .alloc_pages            = mp_dmabuf_devmem_alloc_pages,
> > +     .release_page           = mp_dmabuf_devmem_release_page,
> > +};
> > +EXPORT_SYMBOL(dmabuf_devmem_ops);
>
> --
> Pavel Begunkov
Pavel Begunkov Dec. 10, 2023, 3:03 a.m. UTC | #4
On 12/8/23 23:25, Mina Almasry wrote:
> On Fri, Dec 8, 2023 at 2:56 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 12/8/23 00:52, Mina Almasry wrote:
> ...
>>> +     if (pool->p.queue)
>>> +             binding = READ_ONCE(pool->p.queue->binding);
>>> +
>>> +     if (binding) {
>>> +             pool->mp_ops = &dmabuf_devmem_ops;
>>> +             pool->mp_priv = binding;
>>> +     }
>>
>> Hmm, I don't understand why would we replace a nice transparent
>> api with page pool relying on a queue having devmem specific
>> pointer? It seemed more flexible and cleaner in the last RFC.
>>
> 
> Jakub requested this change and may chime in, but I suspect it's to
> further abstract the devmem changes from driver. In this iteration,
> the driver grabs the netdev_rx_queue and passes it to the page_pool,
> and any future configurations between the net stack and page_pool can
> be passed this way with the driver unbothered.

Ok, that makes sense, but even if passed via an rx queue I'd
at least hope it keeping abstract provider parameters, e.g.
ops, but not hard coded with devmem specific code.

It might even be better done with a helper like
create_page_pool_from_queue(), unless there is some deeper
interaction b/w pp and rx queues is predicted.

>>> +
>>>        if (pool->mp_ops) {
>>>                err = pool->mp_ops->init(pool);
>>>                if (err) {
>>> @@ -1020,3 +1033,77 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
>>>        }
>>>    }
>>>    EXPORT_SYMBOL(page_pool_update_nid);
>>> +
>>> +void __page_pool_iov_free(struct page_pool_iov *ppiov)
>>> +{
>>> +     if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
>>> +             return;
>>> +
>>> +     netdev_free_dmabuf(ppiov);
>>> +}
>>> +EXPORT_SYMBOL_GPL(__page_pool_iov_free);
>>
>> I didn't look too deep but I don't think I immediately follow
>> the pp refcounting. It increments pages_state_hold_cnt on
>> allocation, but IIUC doesn't mark skbs for recycle? Then, they all
>> will be put down via page_pool_iov_put_many() bypassing
>> page_pool_return_page() and friends. That will call
>> netdev_free_dmabuf(), which doesn't bump pages_state_release_cnt.
>>
>> At least I couldn't make it work with io_uring, and for my purposes,
>> I forced all puts to go through page_pool_return_page(), which calls
>> the ->release_page callback. The callback will put the reference and
>> ask its page pool to account release_cnt. It also gets rid of
>> __page_pool_iov_free(), as we'd need to add a hook there for
>> customization otherwise.
>>
>> I didn't care about overhead because the hot path for me is getting
>> buffers from a ring, which is somewhat analogous to sock_devmem_dontneed(),
>> but done on pp allocations under napi, and it's done separately.
>>
>> Completely untested with TCP devmem:
>>
>> https://github.com/isilence/linux/commit/14bd56605183dc80b540999e8058c79ac92ae2d8
>>
> 
> This was a mistake in the last RFC, which should be fixed in v1. In
> the RFC I was not marking the skbs as skb_mark_for_recycle(), so the
> unreffing path wasn't as expected.
> 
> In this iteration, that should be completely fixed. I suspect since I
> just posted this you're actually referring to the issue tested on the
> last RFC? Correct me if wrong.

Right, it was with RFCv3

> In this iteration, the reffing story:
> 
> - memory provider allocs ppiov and returns it to the page pool with
> ppiov->refcount == 1.
> - The page_pool gives the page to the driver. The driver may
> obtain/release references with page_pool_page_[get|put]_many(), but
> the driver is likely not doing that unless it's doing its own page
> recycling.
> - The net stack obtains references via skb_frag_ref() ->
> page_pool_page_get_many()
> - The net stack drops references via skb_frag_unref() ->
> napi_pp_put_page() -> page_pool_return_page() and friends.
> 
> Thus, the issue where the unref path was skipping
> page_pool_return_page() and friends should be resolved in this
> iteration, let me know if you think otherwise, but I think this was an
> issue limited to the last RFC.

Then page_pool_iov_put_many() should and supposedly would never be
called by non devmap code because all puts must circle back into
->release_page. Why adding it to into page_pool_page_put_many()?

@@ -731,6 +731,29 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
+	if (page_is_page_pool_iov(page)) {
...
+		page_pool_page_put_many(page, 1);
+		return NULL;
+	}

Well, I'm looking at this new branch from Patch 10, it can put
the buffer, but what if we race at it's actually the final put?
Looks like nobody is going to to bump up pages_state_release_cnt

If you remove the branch, let it fall into ->release and rely
on refcounting there, then the callback could also fix up
release_cnt or ask pp to do it, like in the patch I linked above
Mina Almasry Dec. 11, 2023, 2:30 a.m. UTC | #5
On Sat, Dec 9, 2023 at 7:05 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 12/8/23 23:25, Mina Almasry wrote:
> > On Fri, Dec 8, 2023 at 2:56 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 12/8/23 00:52, Mina Almasry wrote:
> > ...
> >>> +     if (pool->p.queue)
> >>> +             binding = READ_ONCE(pool->p.queue->binding);
> >>> +
> >>> +     if (binding) {
> >>> +             pool->mp_ops = &dmabuf_devmem_ops;
> >>> +             pool->mp_priv = binding;
> >>> +     }
> >>
> >> Hmm, I don't understand why would we replace a nice transparent
> >> api with page pool relying on a queue having devmem specific
> >> pointer? It seemed more flexible and cleaner in the last RFC.
> >>
> >
> > Jakub requested this change and may chime in, but I suspect it's to
> > further abstract the devmem changes from driver. In this iteration,
> > the driver grabs the netdev_rx_queue and passes it to the page_pool,
> > and any future configurations between the net stack and page_pool can
> > be passed this way with the driver unbothered.
>
> Ok, that makes sense, but even if passed via an rx queue I'd
> at least hope it keeping abstract provider parameters, e.g.
> ops, but not hard coded with devmem specific code.
>
> It might even be better done with a helper like
> create_page_pool_from_queue(), unless there is some deeper
> interaction b/w pp and rx queues is predicted.
>

Off hand I don't see the need for a new create_page_pool_from_queue().
page_pool_create() already takes in a param arg that lets us pass in
the queue as well as any other params.

> >>> +
> >>>        if (pool->mp_ops) {
> >>>                err = pool->mp_ops->init(pool);
> >>>                if (err) {
> >>> @@ -1020,3 +1033,77 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
> >>>        }
> >>>    }
> >>>    EXPORT_SYMBOL(page_pool_update_nid);
> >>> +
> >>> +void __page_pool_iov_free(struct page_pool_iov *ppiov)
> >>> +{
> >>> +     if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
> >>> +             return;
> >>> +
> >>> +     netdev_free_dmabuf(ppiov);
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(__page_pool_iov_free);
> >>
> >> I didn't look too deep but I don't think I immediately follow
> >> the pp refcounting. It increments pages_state_hold_cnt on
> >> allocation, but IIUC doesn't mark skbs for recycle? Then, they all
> >> will be put down via page_pool_iov_put_many() bypassing
> >> page_pool_return_page() and friends. That will call
> >> netdev_free_dmabuf(), which doesn't bump pages_state_release_cnt.
> >>
> >> At least I couldn't make it work with io_uring, and for my purposes,
> >> I forced all puts to go through page_pool_return_page(), which calls
> >> the ->release_page callback. The callback will put the reference and
> >> ask its page pool to account release_cnt. It also gets rid of
> >> __page_pool_iov_free(), as we'd need to add a hook there for
> >> customization otherwise.
> >>
> >> I didn't care about overhead because the hot path for me is getting
> >> buffers from a ring, which is somewhat analogous to sock_devmem_dontneed(),
> >> but done on pp allocations under napi, and it's done separately.
> >>
> >> Completely untested with TCP devmem:
> >>
> >> https://github.com/isilence/linux/commit/14bd56605183dc80b540999e8058c79ac92ae2d8
> >>
> >
> > This was a mistake in the last RFC, which should be fixed in v1. In
> > the RFC I was not marking the skbs as skb_mark_for_recycle(), so the
> > unreffing path wasn't as expected.
> >
> > In this iteration, that should be completely fixed. I suspect since I
> > just posted this you're actually referring to the issue tested on the
> > last RFC? Correct me if wrong.
>
> Right, it was with RFCv3
>
> > In this iteration, the reffing story:
> >
> > - memory provider allocs ppiov and returns it to the page pool with
> > ppiov->refcount == 1.
> > - The page_pool gives the page to the driver. The driver may
> > obtain/release references with page_pool_page_[get|put]_many(), but
> > the driver is likely not doing that unless it's doing its own page
> > recycling.
> > - The net stack obtains references via skb_frag_ref() ->
> > page_pool_page_get_many()
> > - The net stack drops references via skb_frag_unref() ->
> > napi_pp_put_page() -> page_pool_return_page() and friends.
> >
> > Thus, the issue where the unref path was skipping
> > page_pool_return_page() and friends should be resolved in this
> > iteration, let me know if you think otherwise, but I think this was an
> > issue limited to the last RFC.
>
> Then page_pool_iov_put_many() should and supposedly would never be
> called by non devmap code because all puts must circle back into
> ->release_page. Why adding it to into page_pool_page_put_many()?
>
> @@ -731,6 +731,29 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
> +       if (page_is_page_pool_iov(page)) {
> ...
> +               page_pool_page_put_many(page, 1);
> +               return NULL;
> +       }
>
> Well, I'm looking at this new branch from Patch 10, it can put
> the buffer, but what if we race at it's actually the final put?
> Looks like nobody is going to to bump up pages_state_release_cnt
>

Good catch, I think indeed the release_cnt would be incorrect in this
case. I think the race is benign in the sense that the ppiov will be
freed correctly and available for allocation when the page_pool next
needs it; the issue is with the stats AFAICT.

> If you remove the branch, let it fall into ->release and rely
> on refcounting there, then the callback could also fix up
> release_cnt or ask pp to do it, like in the patch I linked above
>

Sadly I don't think this is possible due to the reasons I mention in
the commit message of that patch. Prematurely releasing ppiov and not
having them be candidates for recycling shows me a 4-5x degradation in
performance.

What I could do here is detect that the refcount was dropped to 0 and
fix up the stats in that case.
Pavel Begunkov Dec. 11, 2023, 8:35 p.m. UTC | #6
On 12/11/23 02:30, Mina Almasry wrote:
> On Sat, Dec 9, 2023 at 7:05 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 12/8/23 23:25, Mina Almasry wrote:
>>> On Fri, Dec 8, 2023 at 2:56 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>
>>>> On 12/8/23 00:52, Mina Almasry wrote:
>>> ...
>>>>> +     if (pool->p.queue)
>>>>> +             binding = READ_ONCE(pool->p.queue->binding);
>>>>> +
>>>>> +     if (binding) {
>>>>> +             pool->mp_ops = &dmabuf_devmem_ops;
>>>>> +             pool->mp_priv = binding;
>>>>> +     }
>>>>
>>>> Hmm, I don't understand why would we replace a nice transparent
>>>> api with page pool relying on a queue having devmem specific
>>>> pointer? It seemed more flexible and cleaner in the last RFC.
>>>>
>>>
>>> Jakub requested this change and may chime in, but I suspect it's to
>>> further abstract the devmem changes from driver. In this iteration,
>>> the driver grabs the netdev_rx_queue and passes it to the page_pool,
>>> and any future configurations between the net stack and page_pool can
>>> be passed this way with the driver unbothered.
>>
>> Ok, that makes sense, but even if passed via an rx queue I'd
>> at least hope it keeping abstract provider parameters, e.g.
>> ops, but not hard coded with devmem specific code.
>>
>> It might even be better done with a helper like
>> create_page_pool_from_queue(), unless there is some deeper
>> interaction b/w pp and rx queues is predicted.
>>
> 
> Off hand I don't see the need for a new create_page_pool_from_queue().
> page_pool_create() already takes in a param arg that lets us pass in
> the queue as well as any other params.
> 
>>>>> +
>>>>>         if (pool->mp_ops) {
>>>>>                 err = pool->mp_ops->init(pool);
>>>>>                 if (err) {
>>>>> @@ -1020,3 +1033,77 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
>>>>>         }
>>>>>     }
>>>>>     EXPORT_SYMBOL(page_pool_update_nid);
>>>>> +
>>>>> +void __page_pool_iov_free(struct page_pool_iov *ppiov)
>>>>> +{
>>>>> +     if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
>>>>> +             return;
>>>>> +
>>>>> +     netdev_free_dmabuf(ppiov);
>>>>> +}
>>>>> +EXPORT_SYMBOL_GPL(__page_pool_iov_free);
>>>>
>>>> I didn't look too deep but I don't think I immediately follow
>>>> the pp refcounting. It increments pages_state_hold_cnt on
>>>> allocation, but IIUC doesn't mark skbs for recycle? Then, they all
>>>> will be put down via page_pool_iov_put_many() bypassing
>>>> page_pool_return_page() and friends. That will call
>>>> netdev_free_dmabuf(), which doesn't bump pages_state_release_cnt.
>>>>
>>>> At least I couldn't make it work with io_uring, and for my purposes,
>>>> I forced all puts to go through page_pool_return_page(), which calls
>>>> the ->release_page callback. The callback will put the reference and
>>>> ask its page pool to account release_cnt. It also gets rid of
>>>> __page_pool_iov_free(), as we'd need to add a hook there for
>>>> customization otherwise.
>>>>
>>>> I didn't care about overhead because the hot path for me is getting
>>>> buffers from a ring, which is somewhat analogous to sock_devmem_dontneed(),
>>>> but done on pp allocations under napi, and it's done separately.
>>>>
>>>> Completely untested with TCP devmem:
>>>>
>>>> https://github.com/isilence/linux/commit/14bd56605183dc80b540999e8058c79ac92ae2d8
>>>>
>>>
>>> This was a mistake in the last RFC, which should be fixed in v1. In
>>> the RFC I was not marking the skbs as skb_mark_for_recycle(), so the
>>> unreffing path wasn't as expected.
>>>
>>> In this iteration, that should be completely fixed. I suspect since I
>>> just posted this you're actually referring to the issue tested on the
>>> last RFC? Correct me if wrong.
>>
>> Right, it was with RFCv3
>>
>>> In this iteration, the reffing story:
>>>
>>> - memory provider allocs ppiov and returns it to the page pool with
>>> ppiov->refcount == 1.
>>> - The page_pool gives the page to the driver. The driver may
>>> obtain/release references with page_pool_page_[get|put]_many(), but
>>> the driver is likely not doing that unless it's doing its own page
>>> recycling.
>>> - The net stack obtains references via skb_frag_ref() ->
>>> page_pool_page_get_many()
>>> - The net stack drops references via skb_frag_unref() ->
>>> napi_pp_put_page() -> page_pool_return_page() and friends.
>>>
>>> Thus, the issue where the unref path was skipping
>>> page_pool_return_page() and friends should be resolved in this
>>> iteration, let me know if you think otherwise, but I think this was an
>>> issue limited to the last RFC.
>>
>> Then page_pool_iov_put_many() should and supposedly would never be
>> called by non devmap code because all puts must circle back into
>> ->release_page. Why adding it to into page_pool_page_put_many()?
>>
>> @@ -731,6 +731,29 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>> +       if (page_is_page_pool_iov(page)) {
>> ...
>> +               page_pool_page_put_many(page, 1);
>> +               return NULL;
>> +       }
>>
>> Well, I'm looking at this new branch from Patch 10, it can put
>> the buffer, but what if we race at it's actually the final put?
>> Looks like nobody is going to to bump up pages_state_release_cnt
>>
> 
> Good catch, I think indeed the release_cnt would be incorrect in this
> case. I think the race is benign in the sense that the ppiov will be
> freed correctly and available for allocation when the page_pool next
> needs it; the issue is with the stats AFAICT.

hold_cnt + release_cnt serves is used for refcounting. In this case
it'll leak the pool when you try to destroy it.


>> If you remove the branch, let it fall into ->release and rely
>> on refcounting there, then the callback could also fix up
>> release_cnt or ask pp to do it, like in the patch I linked above
>>
> 
> Sadly I don't think this is possible due to the reasons I mention in
> the commit message of that patch. Prematurely releasing ppiov and not
> having them be candidates for recycling shows me a 4-5x degradation in
> performance.

I don't think I follow. The concept is to only recycle a buffer (i.e.
make it available for allocation) when its refs drop to zero, which is
IMHO the only way it can work, and IIUC what this patchset is doing.

That's also I suggest to do, but through a slightly different path.
Let's say at some moment there are 2 refs (e.g. 1 for an skb and
1 for userspace/xarray).

Say it first puts the skb:

napi_pp_put_page()
   -> page_pool_return_page()
     -> mp_ops->release_page()
        -> need_to_free = put_buf()
           // not last ref, need_to_free==false,
           // don't recycle, don't increase release_cnt

Then you put the last ref:

page_pool_iov_put_many()
   -> page_pool_return_page()
     -> mp_ops->release_page()
        -> need_to_free = put_buf()
           // last ref, need_to_free==true,
           // recycle and release_cnt++

And that last put can even be recycled right into the
pp / ptr_ring, in which case it doesn't need to touch
release_cnt. Does it make sense? I don't see where
4-5x degradation would come from


> What I could do here is detect that the refcount was dropped to 0 and
> fix up the stats in that case.
Jason Gunthorpe Dec. 12, 2023, 12:25 p.m. UTC | #7
On Thu, Dec 07, 2023 at 04:52:39PM -0800, Mina Almasry wrote:

> +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> +{
> +	if (page_is_page_pool_iov(page))
> +		return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> +
> +	DEBUG_NET_WARN_ON_ONCE(true);
> +	return NULL;
> +}

We already asked not to do this, please do not allocate weird things
can call them 'struct page' when they are not. It undermines the
maintainability of the mm to have things mis-typed like
this. Introduce a new type for your thing so the compiler can check it
properly.

Jason
Christoph Hellwig Dec. 12, 2023, 1:07 p.m. UTC | #8
On Tue, Dec 12, 2023 at 08:25:35AM -0400, Jason Gunthorpe wrote:
> > +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> > +{
> > +	if (page_is_page_pool_iov(page))
> > +		return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> > +
> > +	DEBUG_NET_WARN_ON_ONCE(true);
> > +	return NULL;
> > +}
> 
> We already asked not to do this, please do not allocate weird things
> can call them 'struct page' when they are not. It undermines the
> maintainability of the mm to have things mis-typed like
> this. Introduce a new type for your thing so the compiler can check it
> properly.

Yes.  Or even better avoid this mess entirely..
Mina Almasry Dec. 12, 2023, 2:26 p.m. UTC | #9
On Tue, Dec 12, 2023 at 4:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Dec 07, 2023 at 04:52:39PM -0800, Mina Almasry wrote:
>
> > +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> > +{
> > +     if (page_is_page_pool_iov(page))
> > +             return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> > +
> > +     DEBUG_NET_WARN_ON_ONCE(true);
> > +     return NULL;
> > +}
>
> We already asked not to do this, please do not allocate weird things
> can call them 'struct page' when they are not. It undermines the
> maintainability of the mm to have things mis-typed like
> this. Introduce a new type for your thing so the compiler can check it
> properly.
>

There is a new type introduced, it's the page_pool_iov. We set the LSB
on page_pool_iov* and cast it to page* only to avoid the churn of
renaming page* to page_pool_iov* in the page_pool and all the net
drivers using it. Is that not a reasonable compromise in your opinion?
Since the LSB is set on the resulting page pointers, they are not
actually usuable as pages, and are never passed to mm APIs per your
requirement.
Jason Gunthorpe Dec. 12, 2023, 2:39 p.m. UTC | #10
On Tue, Dec 12, 2023 at 06:26:51AM -0800, Mina Almasry wrote:
> On Tue, Dec 12, 2023 at 4:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Dec 07, 2023 at 04:52:39PM -0800, Mina Almasry wrote:
> >
> > > +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> > > +{
> > > +     if (page_is_page_pool_iov(page))
> > > +             return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> > > +
> > > +     DEBUG_NET_WARN_ON_ONCE(true);
> > > +     return NULL;
> > > +}
> >
> > We already asked not to do this, please do not allocate weird things
> > can call them 'struct page' when they are not. It undermines the
> > maintainability of the mm to have things mis-typed like
> > this. Introduce a new type for your thing so the compiler can check it
> > properly.
> >
> 
> There is a new type introduced, it's the page_pool_iov. We set the LSB
> on page_pool_iov* and cast it to page* only to avoid the churn of
> renaming page* to page_pool_iov* in the page_pool and all the net
> drivers using it. Is that not a reasonable compromise in your opinion?
> Since the LSB is set on the resulting page pointers, they are not
> actually usuable as pages, and are never passed to mm APIs per your
> requirement.

There were two asks, the one you did was to never pass this non-struct
page memory to the mm, which is great.

The other was to not mistype things, and don't type something as
struct page when it is, in fact, not.

I fear what you've done is make it so only one driver calls these
special functions and left the other drivers passing the struct page
directly to the mm and sort of obfuscating why it is OK based on this
netdev knowledge of not enabling/using the static branch in the other
cases.

Perhaps you can simply avoid this by arranging for this driver to also
exclusively use some special type to indicate the dual nature of the
pointer and leave the other drivers as using the struct page version.

Jason
Mina Almasry Dec. 12, 2023, 2:58 p.m. UTC | #11
On Tue, Dec 12, 2023 at 6:39 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Tue, Dec 12, 2023 at 06:26:51AM -0800, Mina Almasry wrote:
> > On Tue, Dec 12, 2023 at 4:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Thu, Dec 07, 2023 at 04:52:39PM -0800, Mina Almasry wrote:
> > >
> > > > +static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
> > > > +{
> > > > +     if (page_is_page_pool_iov(page))
> > > > +             return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
> > > > +
> > > > +     DEBUG_NET_WARN_ON_ONCE(true);
> > > > +     return NULL;
> > > > +}
> > >
> > > We already asked not to do this, please do not allocate weird things
> > > can call them 'struct page' when they are not. It undermines the
> > > maintainability of the mm to have things mis-typed like
> > > this. Introduce a new type for your thing so the compiler can check it
> > > properly.
> > >
> >
> > There is a new type introduced, it's the page_pool_iov. We set the LSB
> > on page_pool_iov* and cast it to page* only to avoid the churn of
> > renaming page* to page_pool_iov* in the page_pool and all the net
> > drivers using it. Is that not a reasonable compromise in your opinion?
> > Since the LSB is set on the resulting page pointers, they are not
> > actually usuable as pages, and are never passed to mm APIs per your
> > requirement.
>
> There were two asks, the one you did was to never pass this non-struct
> page memory to the mm, which is great.
>
> The other was to not mistype things, and don't type something as
> struct page when it is, in fact, not.
>
> I fear what you've done is make it so only one driver calls these
> special functions and left the other drivers passing the struct page
> directly to the mm and sort of obfuscating why it is OK based on this
> netdev knowledge of not enabling/using the static branch in the other
> cases.
>

Jason, we set the LSB on page_pool_iov pointers before casting it to
struct page pointers. The resulting pointers are not useable as page
pointers at all.

In order to use the resulting pointers, the driver _must_ use the
special functions that first clear the LSB. It is impossible for the
driver to 'accidentally' use the resulting page pointers with the LSB
set - the kernel would just crash trying to dereference such a
pointer.

The way it works currently is that drivers that support devmem TCP
will declare that support to the net stack, and use the special
functions that clear the LSB and cast the struct back to
page_pool_iov. The drivers that don't support devmem TCP will not
declare support and will get pages allocated from the mm stack from
the page_pool and use them as pages normally.

> Perhaps you can simply avoid this by arranging for this driver to also
> exclusively use some special type to indicate the dual nature of the
> pointer and leave the other drivers as using the struct page version.
>

This is certainly possible, but it requires us to rename all the page
pointers in the page_pool to the new type, and requires the driver
adding devmem TCP support to rename all the page* pointer instances to
the new type. It's possible but it introduces lots of code churn. Is
the LSB + cast not a reasonable compromise here? I feel like the trick
of setting the least significant bit on a pointer to indicate it's
something else has a fair amount of precedent in the kernel.

--
Thanks,
Mina
Jason Gunthorpe Dec. 12, 2023, 3:08 p.m. UTC | #12
On Tue, Dec 12, 2023 at 06:58:17AM -0800, Mina Almasry wrote:

> Jason, we set the LSB on page_pool_iov pointers before casting it to
> struct page pointers. The resulting pointers are not useable as page
> pointers at all.

I understand that, the second ask is about maintainability of the mm
by using correct types.

> > Perhaps you can simply avoid this by arranging for this driver to also
> > exclusively use some special type to indicate the dual nature of the
> > pointer and leave the other drivers as using the struct page version.
> 
> This is certainly possible, but it requires us to rename all the page
> pointers in the page_pool to the new type, and requires the driver
> adding devmem TCP support to rename all the page* pointer instances to
> the new type. It's possible but it introduces lots of code churn. Is
> the LSB + cast not a reasonable compromise here? I feel like the trick
> of setting the least significant bit on a pointer to indicate it's
> something else has a fair amount of precedent in the kernel.

Linus himself has complained about exactly this before, and written a cleanup:

https://lore.kernel.org/linux-mm/20221108194139.57604-1-torvalds@linux-foundation.org/

If you mangle a pointer *so it is no longer a pointer* then give it a
proper opaque type so the compiler can check everything statically and
require that the necessary converters are called in all cases.

You call it churn, I call it future maintainability. :(

No objection to using the LSB, just properly type a LSB mangled
pointer so everyone knows what is going on and don't call it MM's
struct page *.

I would say this is important here because it is a large driver facing
API surface.

Jason
Mina Almasry Dec. 13, 2023, 1:09 a.m. UTC | #13
On Tue, Dec 12, 2023 at 7:08 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Tue, Dec 12, 2023 at 06:58:17AM -0800, Mina Almasry wrote:
>
> > Jason, we set the LSB on page_pool_iov pointers before casting it to
> > struct page pointers. The resulting pointers are not useable as page
> > pointers at all.
>
> I understand that, the second ask is about maintainability of the mm
> by using correct types.
>
> > > Perhaps you can simply avoid this by arranging for this driver to also
> > > exclusively use some special type to indicate the dual nature of the
> > > pointer and leave the other drivers as using the struct page version.
> >
> > This is certainly possible, but it requires us to rename all the page
> > pointers in the page_pool to the new type, and requires the driver
> > adding devmem TCP support to rename all the page* pointer instances to
> > the new type. It's possible but it introduces lots of code churn. Is
> > the LSB + cast not a reasonable compromise here? I feel like the trick
> > of setting the least significant bit on a pointer to indicate it's
> > something else has a fair amount of precedent in the kernel.
>
> Linus himself has complained about exactly this before, and written a cleanup:
>
> https://lore.kernel.org/linux-mm/20221108194139.57604-1-torvalds@linux-foundation.org/
>
> If you mangle a pointer *so it is no longer a pointer* then give it a
> proper opaque type so the compiler can check everything statically and
> require that the necessary converters are called in all cases.
>
> You call it churn, I call it future maintainability. :(
>
> No objection to using the LSB, just properly type a LSB mangled
> pointer so everyone knows what is going on and don't call it MM's
> struct page *.
>
> I would say this is important here because it is a large driver facing
> API surface.
>

OK, I imagine this is not that hard to implement - it's really whether
the change is acceptable to reviewers.

I figure I can start by implementing a no-op abstraction to page*:

typedef struct page netmem_t

and replace the page* in the following places with netmem_t*:

1. page_pool API (not internals)
2. drivers using the page_pool.
3. skb_frag_t.

I think that change needs to be a separate series by itself. Then the
devmem patches would on top of that change netmem_t such that it can
be a union between struct page and page_pool_iov and add the special
handling of page_pool_iov. Does this sound reasonable?


--
Thanks,
Mina
David Ahern Dec. 13, 2023, 2:19 a.m. UTC | #14
On 12/12/23 6:09 PM, Mina Almasry wrote:
> OK, I imagine this is not that hard to implement - it's really whether
> the change is acceptable to reviewers.
> 
> I figure I can start by implementing a no-op abstraction to page*:
> 
> typedef struct page netmem_t
> 
> and replace the page* in the following places with netmem_t*:
> 
> 1. page_pool API (not internals)
> 2. drivers using the page_pool.
> 3. skb_frag_t.
> 

accessors to skb_frag_t field are now consolidated to
include/linux/skbuff.h (the one IB driver was fixed in Sept by
4ececeb83986), so changing skb_frag_t from bio_vec to something like:

typedef struct skb_frag {
	void *addr;
	unsigned int length;
	unsigned int offset;
};

is trivial. From there, addr can default to `struct page *`. If LSB is
set, strip it and return `struct page_pool_iov *` or `struct buffer_pool *`
Yinjun Zhang Dec. 13, 2023, 7:49 a.m. UTC | #15
On Thu,  7 Dec 2023 16:52:39 -0800, Mina Almasry wrote:
<...>
> +static int mp_dmabuf_devmem_init(struct page_pool *pool)
> +{
> +	struct netdev_dmabuf_binding *binding = pool->mp_priv;
> +
> +	if (!binding)
> +		return -EINVAL;
> +
> +	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
> +		return -EOPNOTSUPP;
> +
> +	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
> +		return -EOPNOTSUPP;
> +
> +	netdev_dmabuf_binding_get(binding);
> +	return 0;
> +}
> +
> +static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
> +						 gfp_t gfp)
> +{
> +	struct netdev_dmabuf_binding *binding = pool->mp_priv;
> +	struct page_pool_iov *ppiov;
> +
> +	ppiov = netdev_alloc_dmabuf(binding);

Since it only supports one-page allocation, we'd better add a check in
`ops->init()` that `pool->p.order` must be 0.

> +	if (!ppiov)
> +		return NULL;
> +
> +	ppiov->pp = pool;
> +	pool->pages_state_hold_cnt++;
> +	trace_page_pool_state_hold(pool, (struct page *)ppiov,
> +				   pool->pages_state_hold_cnt);
> +	return (struct page *)((unsigned long)ppiov | PP_IOV);
> +}
<...>
Mina Almasry Dec. 14, 2023, 8:03 p.m. UTC | #16
On Mon, Dec 11, 2023 at 12:37 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
> >> If you remove the branch, let it fall into ->release and rely
> >> on refcounting there, then the callback could also fix up
> >> release_cnt or ask pp to do it, like in the patch I linked above
> >>
> >
> > Sadly I don't think this is possible due to the reasons I mention in
> > the commit message of that patch. Prematurely releasing ppiov and not
> > having them be candidates for recycling shows me a 4-5x degradation in
> > performance.
>
> I don't think I follow. The concept is to only recycle a buffer (i.e.
> make it available for allocation) when its refs drop to zero, which is
> IMHO the only way it can work, and IIUC what this patchset is doing.
>
> That's also I suggest to do, but through a slightly different path.
> Let's say at some moment there are 2 refs (e.g. 1 for an skb and
> 1 for userspace/xarray).
>
> Say it first puts the skb:
>
> napi_pp_put_page()
>    -> page_pool_return_page()
>      -> mp_ops->release_page()
>         -> need_to_free = put_buf()
>            // not last ref, need_to_free==false,
>            // don't recycle, don't increase release_cnt
>
> Then you put the last ref:
>
> page_pool_iov_put_many()
>    -> page_pool_return_page()
>      -> mp_ops->release_page()
>         -> need_to_free = put_buf()
>            // last ref, need_to_free==true,
>            // recycle and release_cnt++
>
> And that last put can even be recycled right into the
> pp / ptr_ring, in which case it doesn't need to touch
> release_cnt. Does it make sense? I don't see where
> 4-5x degradation would come from
>
>

Sorry for the late reply, I have been working on this locally.

What you're saying makes sense, and I'm no longer sure why I was
seeing a perf degradation without '[net-next v1 10/16] page_pool:
don't release iov on elevanted refcount'. However, even though what
you're saying is technically correct, AFAIU it's actually semantically
wrong. When a page is released by the page_pool, we should call
page_pool_clear_pp_info() and completely disconnect the page from the
pool. If we call release_page() on a page and then the page pool sees
it again in page_pool_return_page(), I think that is considered a bug.
In fact I think what you're proposing is as a result of a bug because
we don't call a page_pool_clear_pp_info() equivalent on releasing
ppiov.

However, I'm reasonably confident I figured out the right thing to do
here. The page_pool uses page->pp_frag_count for its refcounting.
pp_frag_count is a misnomer, it's being renamed to pp_ref_count in
Liang's series[1]). In this series I used a get_page/put_page
equivalent for refcounting. Once I transitioned to using
pp_[frag|ref]_count for refcounting inside the page_pool, the issue
went away, and I no longer need the patch 'page_pool: don't release
iov on elevanted refcount'.

There is an additional upside, since pages and ppiovs are both being
refcounted using pp_[frag|ref]_count, we get some unified handling for
ppiov and we reduce the checks around ppiov. This should be fixed
properly in the next series.

I still need to do some work (~1 week) before I upload the next
version as there is a new requirement from MM that we transition to a
new type and not re-use page*, but I uploaded my changes github with
the refcounting issues resolved in case they're useful to you. Sorry
for the churn:

https://github.com/mina/linux/commits/tcpdevmem-v1.5/

[1] https://patchwork.kernel.org/project/netdevbpf/list/?series=809049&state=*
Pavel Begunkov Dec. 19, 2023, 11:55 p.m. UTC | #17
On 12/14/23 20:03, Mina Almasry wrote:
> On Mon, Dec 11, 2023 at 12:37 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> ...
>>>> If you remove the branch, let it fall into ->release and rely
>>>> on refcounting there, then the callback could also fix up
>>>> release_cnt or ask pp to do it, like in the patch I linked above
>>>>
>>>
>>> Sadly I don't think this is possible due to the reasons I mention in
>>> the commit message of that patch. Prematurely releasing ppiov and not
>>> having them be candidates for recycling shows me a 4-5x degradation in
>>> performance.
>>
>> I don't think I follow. The concept is to only recycle a buffer (i.e.
>> make it available for allocation) when its refs drop to zero, which is
>> IMHO the only way it can work, and IIUC what this patchset is doing.
>>
>> That's also I suggest to do, but through a slightly different path.
>> Let's say at some moment there are 2 refs (e.g. 1 for an skb and
>> 1 for userspace/xarray).
>>
>> Say it first puts the skb:
>>
>> napi_pp_put_page()
>>     -> page_pool_return_page()
>>       -> mp_ops->release_page()
>>          -> need_to_free = put_buf()
>>             // not last ref, need_to_free==false,
>>             // don't recycle, don't increase release_cnt
>>
>> Then you put the last ref:
>>
>> page_pool_iov_put_many()
>>     -> page_pool_return_page()
>>       -> mp_ops->release_page()
>>          -> need_to_free = put_buf()
>>             // last ref, need_to_free==true,
>>             // recycle and release_cnt++
>>
>> And that last put can even be recycled right into the
>> pp / ptr_ring, in which case it doesn't need to touch
>> release_cnt. Does it make sense? I don't see where
>> 4-5x degradation would come from
>>
>>
> 
> Sorry for the late reply, I have been working on this locally.
> 
> What you're saying makes sense, and I'm no longer sure why I was
> seeing a perf degradation without '[net-next v1 10/16] page_pool:
> don't release iov on elevanted refcount'. However, even though what
> you're saying is technically correct, AFAIU it's actually semantically
> wrong. When a page is released by the page_pool, we should call
> page_pool_clear_pp_info() and completely disconnect the page from the
> pool. If we call release_page() on a page and then the page pool sees
> it again in page_pool_return_page(), I think that is considered a bug.

You're adding a new feature the semantics of which is already
different from what is in there, you can extend it any way as long
as it makes sense and agreed on. IMHO, it does. But well, if
there is a better solution I'm all for it.

> In fact I think what you're proposing is as a result of a bug because
> we don't call a page_pool_clear_pp_info() equivalent on releasing
> ppiov.

I don't get it, what bug? page_pool_clear_pp_info() is not called
for ppiov because it doesn't make sense to call it for ppiov,
there is no reason to clear ppiov->pp, nor there is any pp_magic.


> However, I'm reasonably confident I figured out the right thing to do
> here. The page_pool uses page->pp_frag_count for its refcounting.
> pp_frag_count is a misnomer, it's being renamed to pp_ref_count in
> Liang's series[1]). In this series I used a get_page/put_page
> equivalent for refcounting. Once I transitioned to using
> pp_[frag|ref]_count for refcounting inside the page_pool, the issue
> went away, and I no longer need the patch 'page_pool: don't release
> iov on elevanted refcount'.

Lovely, I'll take a look later! (also assuming it's in v5)


> There is an additional upside, since pages and ppiovs are both being
> refcounted using pp_[frag|ref]_count, we get some unified handling for
> ppiov and we reduce the checks around ppiov. This should be fixed
> properly in the next series.
> 
> I still need to do some work (~1 week) before I upload the next
> version as there is a new requirement from MM that we transition to a
> new type and not re-use page*, but I uploaded my changes github with
> the refcounting issues resolved in case they're useful to you. Sorry
> for the churn:
> 
> https://github.com/mina/linux/commits/tcpdevmem-v1.5/
> 
> [1] https://patchwork.kernel.org/project/netdevbpf/list/?series=809049&state=*
>
diff mbox series

Patch

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 8bfc2d43efd4..00197f14aa87 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -53,6 +53,8 @@ 
 #define _NET_PAGE_POOL_HELPERS_H
 
 #include <net/page_pool/types.h>
+#include <net/net_debug.h>
+#include <net/devmem.h>
 
 #ifdef CONFIG_PAGE_POOL_STATS
 /* Deprecated driver-facing API, use netlink instead */
@@ -92,6 +94,11 @@  static inline unsigned int page_pool_iov_idx(const struct page_pool_iov *ppiov)
 	return ppiov - page_pool_iov_owner(ppiov)->ppiovs;
 }
 
+static inline u32 page_pool_iov_binding_id(const struct page_pool_iov *ppiov)
+{
+	return page_pool_iov_owner(ppiov)->binding->id;
+}
+
 static inline dma_addr_t
 page_pool_iov_dma_addr(const struct page_pool_iov *ppiov)
 {
@@ -107,6 +114,46 @@  page_pool_iov_binding(const struct page_pool_iov *ppiov)
 	return page_pool_iov_owner(ppiov)->binding;
 }
 
+static inline int page_pool_iov_refcount(const struct page_pool_iov *ppiov)
+{
+	return refcount_read(&ppiov->refcount);
+}
+
+static inline void page_pool_iov_get_many(struct page_pool_iov *ppiov,
+					  unsigned int count)
+{
+	refcount_add(count, &ppiov->refcount);
+}
+
+void __page_pool_iov_free(struct page_pool_iov *ppiov);
+
+static inline void page_pool_iov_put_many(struct page_pool_iov *ppiov,
+					  unsigned int count)
+{
+	if (!refcount_sub_and_test(count, &ppiov->refcount))
+		return;
+
+	__page_pool_iov_free(ppiov);
+}
+
+/* page pool mm helpers */
+
+DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
+static inline bool page_is_page_pool_iov(const struct page *page)
+{
+	return static_branch_unlikely(&page_pool_mem_providers) &&
+	       (unsigned long)page & PP_IOV;
+}
+
+static inline struct page_pool_iov *page_to_page_pool_iov(struct page *page)
+{
+	if (page_is_page_pool_iov(page))
+		return (struct page_pool_iov *)((unsigned long)page & ~PP_IOV);
+
+	DEBUG_NET_WARN_ON_ONCE(true);
+	return NULL;
+}
+
 /**
  * page_pool_dev_alloc_pages() - allocate a page.
  * @pool:	pool from which to allocate
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 44faee7a7b02..136930a238de 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -134,8 +134,15 @@  struct memory_provider_ops {
 	bool (*release_page)(struct page_pool *pool, struct page *page);
 };
 
+extern const struct memory_provider_ops dmabuf_devmem_ops;
+
 /* page_pool_iov support */
 
+/*  We overload the LSB of the struct page pointer to indicate whether it's
+ *  a page or page_pool_iov.
+ */
+#define PP_IOV 0x01UL
+
 /* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
  * entry from the dmabuf is inserted into the genpool as a chunk, and needs
  * this owner struct to keep track of some metadata necessary to create
@@ -159,6 +166,8 @@  struct page_pool_iov {
 	struct dmabuf_genpool_chunk_owner *owner;
 
 	refcount_t refcount;
+
+	struct page_pool *pp;
 };
 
 struct page_pool {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f5c84d2a4510..423c88564a00 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -12,6 +12,7 @@ 
 
 #include <net/page_pool/helpers.h>
 #include <net/xdp.h>
+#include <net/netdev_rx_queue.h>
 
 #include <linux/dma-direction.h>
 #include <linux/dma-mapping.h>
@@ -20,12 +21,15 @@ 
 #include <linux/poison.h>
 #include <linux/ethtool.h>
 #include <linux/netdevice.h>
+#include <linux/genalloc.h>
+#include <net/devmem.h>
 
 #include <trace/events/page_pool.h>
 
 #include "page_pool_priv.h"
 
-static DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+EXPORT_SYMBOL(page_pool_mem_providers);
 
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
@@ -175,6 +179,7 @@  static void page_pool_producer_unlock(struct page_pool *pool,
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
+	struct netdev_dmabuf_binding *binding = NULL;
 	unsigned int ring_qsize = 1024; /* Default */
 	int err;
 
@@ -237,6 +242,14 @@  static int page_pool_init(struct page_pool *pool,
 	/* Driver calling page_pool_create() also call page_pool_destroy() */
 	refcount_set(&pool->user_cnt, 1);
 
+	if (pool->p.queue)
+		binding = READ_ONCE(pool->p.queue->binding);
+
+	if (binding) {
+		pool->mp_ops = &dmabuf_devmem_ops;
+		pool->mp_priv = binding;
+	}
+
 	if (pool->mp_ops) {
 		err = pool->mp_ops->init(pool);
 		if (err) {
@@ -1020,3 +1033,77 @@  void page_pool_update_nid(struct page_pool *pool, int new_nid)
 	}
 }
 EXPORT_SYMBOL(page_pool_update_nid);
+
+void __page_pool_iov_free(struct page_pool_iov *ppiov)
+{
+	if (WARN_ON(ppiov->pp->mp_ops != &dmabuf_devmem_ops))
+		return;
+
+	netdev_free_dmabuf(ppiov);
+}
+EXPORT_SYMBOL_GPL(__page_pool_iov_free);
+
+/*** "Dmabuf devmem memory provider" ***/
+
+static int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+	struct netdev_dmabuf_binding *binding = pool->mp_priv;
+
+	if (!binding)
+		return -EINVAL;
+
+	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		return -EOPNOTSUPP;
+
+	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		return -EOPNOTSUPP;
+
+	netdev_dmabuf_binding_get(binding);
+	return 0;
+}
+
+static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
+						 gfp_t gfp)
+{
+	struct netdev_dmabuf_binding *binding = pool->mp_priv;
+	struct page_pool_iov *ppiov;
+
+	ppiov = netdev_alloc_dmabuf(binding);
+	if (!ppiov)
+		return NULL;
+
+	ppiov->pp = pool;
+	pool->pages_state_hold_cnt++;
+	trace_page_pool_state_hold(pool, (struct page *)ppiov,
+				   pool->pages_state_hold_cnt);
+	return (struct page *)((unsigned long)ppiov | PP_IOV);
+}
+
+static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+	struct netdev_dmabuf_binding *binding = pool->mp_priv;
+
+	netdev_dmabuf_binding_put(binding);
+}
+
+static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+	struct page_pool_iov *ppiov;
+
+	if (WARN_ON_ONCE(!page_is_page_pool_iov(page)))
+		return false;
+
+	ppiov = page_to_page_pool_iov(page);
+	page_pool_iov_put_many(ppiov, 1);
+	/* We don't want the page pool put_page()ing our page_pool_iovs. */
+	return false;
+}
+
+const struct memory_provider_ops dmabuf_devmem_ops = {
+	.init			= mp_dmabuf_devmem_init,
+	.destroy		= mp_dmabuf_devmem_destroy,
+	.alloc_pages		= mp_dmabuf_devmem_alloc_pages,
+	.release_page		= mp_dmabuf_devmem_release_page,
+};
+EXPORT_SYMBOL(dmabuf_devmem_ops);