
[net-next,v4,6/7] page_pool: check for DMA sync shortcut earlier

Message ID 20240423135832.2271696-7-aleksander.lobakin@intel.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series dma: skip calling no-op sync ops when possible

Checks

Context                 Check    Description
netdev/tree_selection   success  Clearly marked for net-next
netdev/apply            fail     Patch does not apply to net-next-1

Commit Message

Alexander Lobakin April 23, 2024, 1:58 p.m. UTC
We can save a couple more function calls in the Page Pool code if we
check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
Move both these checks into an inline wrapper and call the PP wrapper
over the generic DMA sync function only when both are true.
You can't cache the result of dma_need_sync() in &page_pool, as it may
change anytime if an SWIOTLB buffer is allocated or mapped.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 net/core/page_pool.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)
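For context, the shortcut this patch hooks into is the per-device check added by the dma-mapping patches earlier in the series. A minimal sketch of its shape (the dma_skip_sync field and the debug override shown here are assumptions based on the rest of the series, not something stated in this patch):

/* Sketch only -- see the dma-mapping patches of this series for the real
 * definition.  The property page_pool relies on: the state is re-read from
 * the device on every call, because mapping an SWIOTLB bounce buffer can
 * turn syncing back on at any moment, so the result can't be cached in
 * &page_pool.
 */
static inline bool dma_dev_need_sync(const struct device *dev)
{
	/* always sync when DMA debugging is enabled (assumed) */
	return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
}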

Comments

Alexander Lobakin April 24, 2024, 8:52 a.m. UTC | #1
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 23 Apr 2024 15:58:31 +0200

> We can save a couple more function calls in the Page Pool code if we
> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
> Move both these checks into an inline wrapper and call the PP wrapper
> over the generic DMA sync function only when both are true.
> You can't cache the result of dma_need_sync() in &page_pool, as it may
> change anytime if an SWIOTLB buffer is allocated or mapped.
> 
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  net/core/page_pool.c | 31 +++++++++++++++++--------------
>  1 file changed, 17 insertions(+), 14 deletions(-)
> 
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 6cf26a68fa91..87319c6365e0 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>  	return page;
>  }
>  
> -static void page_pool_dma_sync_for_device(const struct page_pool *pool,
> -					  const struct page *page,
> -					  unsigned int dma_sync_size)
> +static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
> +					    const struct page *page,
> +					    u32 dma_sync_size)
>  {
>  	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>  
>  	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> -	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
> -					 pool->p.offset, dma_sync_size,
> -					 pool->p.dma_dir);
> +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
> +				     dma_sync_size, pool->p.dma_dir);

Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
isn't defined there, and my CI didn't catch it in time... I'll ifdef
this in the next spin after some reviews for this one.
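A rough sketch of one way that ifdef could look (an assumption about the respin, not the actual next version; it relies on dma_dev_need_sync() being compile-time false on !DMA_NEED_SYNC configs, so the guarded body is never reached at run time there):

static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
					    const struct page *page,
					    u32 dma_sync_size)
{
#ifdef CONFIG_DMA_NEED_SYNC
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     dma_sync_size, pool->p.dma_dir);
#else
	/* __dma_sync_single_for_device() doesn't exist here; the wrapper
	 * quoted below never calls this function on such configs anyway.
	 */
#endif
}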

> +}
> +
> +static __always_inline void
> +page_pool_dma_sync_for_device(const struct page_pool *pool,
> +			      const struct page *page,
> +			      u32 dma_sync_size)
> +{
> +	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
> +		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
>  }
>  
>  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
> @@ -429,8 +437,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
>  	if (page_pool_set_dma_addr(page, dma))
>  		goto unmap_failed;
>  
> -	if (pool->dma_sync)
> -		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
> +	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
>  
>  	return true;
>  
> @@ -699,9 +706,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  	if (likely(__page_pool_page_can_be_recycled(page))) {
>  		/* Read barrier done in page_ref_count / READ_ONCE */
>  
> -		if (pool->dma_sync)
> -			page_pool_dma_sync_for_device(pool, page,
> -						      dma_sync_size);
> +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
>  
>  		if (allow_direct && page_pool_recycle_in_cache(page, pool))
>  			return NULL;
> @@ -840,9 +845,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
>  		return NULL;
>  
>  	if (__page_pool_page_can_be_recycled(page)) {
> -		if (pool->dma_sync)
> -			page_pool_dma_sync_for_device(pool, page, -1);
> -
> +		page_pool_dma_sync_for_device(pool, page, -1);
>  		return page;
>  	}

Thanks,
Olek
Christoph Hellwig April 26, 2024, 5:02 a.m. UTC | #2
On Wed, Apr 24, 2024 at 10:52:49AM +0200, Alexander Lobakin wrote:
> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
> isn't defined there, and my CI didn't catch it in time... I'll ifdef
> this in the next spin after some reviews for this one.

I looked over the dma-mapping parts of the series and it looks fine
to me.  So please resend the fixed-up version as soon as you find
time for it.
Christoph Hellwig May 2, 2024, 5:58 a.m. UTC | #3
Hi Alexander,

do you have time to resend the series?  I'd love to merge it for
6.10 if I can get the updated version ASAP.
Robin Murphy May 2, 2024, 3:49 p.m. UTC | #4
On 24/04/2024 9:52 am, Alexander Lobakin wrote:
> From: Alexander Lobakin <aleksander.lobakin@intel.com>
> Date: Tue, 23 Apr 2024 15:58:31 +0200
> 
>> We can save a couple more function calls in the Page Pool code if we
>> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
>> Move both these checks into an inline wrapper and call the PP wrapper
>> over the generic DMA sync function only when both are true.
>> You can't cache the result of dma_need_sync() in &page_pool, as it may
>> change anytime if an SWIOTLB buffer is allocated or mapped.
>>
>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
>> ---
>>   net/core/page_pool.c | 31 +++++++++++++++++--------------
>>   1 file changed, 17 insertions(+), 14 deletions(-)
>>
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index 6cf26a68fa91..87319c6365e0 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>>   	return page;
>>   }
>>   
>> -static void page_pool_dma_sync_for_device(const struct page_pool *pool,
>> -					  const struct page *page,
>> -					  unsigned int dma_sync_size)
>> +static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
>> +					    const struct page *page,
>> +					    u32 dma_sync_size)
>>   {
>>   	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>>   
>>   	dma_sync_size = min(dma_sync_size, pool->p.max_len);
>> -	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
>> -					 pool->p.offset, dma_sync_size,
>> -					 pool->p.dma_dir);
>> +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
>> +				     dma_sync_size, pool->p.dma_dir);
> 
> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
> isn't defined there, and my CI didn't catch it in time... I'll ifdef
> this in the next spin after some reviews for this one.

Hmm, the way things have ended up, do we even need this change? (Other
than factoring out the pool->dma_sync check, which is clearly nice)

Since __page_pool_dma_sync_for_device() is never called directly, the
flow we always get is:

page_pool_dma_sync_for_device()
     dma_dev_need_sync()
         __page_pool_dma_sync_for_device()
             ... // a handful of trivial arithmetic
             __dma_sync_single_for_device()

i.e. still effectively the same order of
"if (dma_dev_need_sync()) __dma_sync()" invocations as if we'd just used
the standard dma_sync(), so referencing the unwrapped internals only
spreads it out a bit more for no real benefit. As an experiment I tried
the diff below on top, effectively undoing this problematic part, and it
doesn't seem to make any appreciable difference in a before-and-after
comparison of the object code, at least for my arm64 build.

Thanks,
Robin.

----->8-----
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 27f3b6db800e..b8ab7d4ca229 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -398,24 +398,20 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
  	return page;
  }
  
-static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
-					    const struct page *page,
-					    u32 dma_sync_size)
-{
-	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
-
-	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
-				     dma_sync_size, pool->p.dma_dir);
-}
-
  static __always_inline void
  page_pool_dma_sync_for_device(const struct page_pool *pool,
  			      const struct page *page,
  			      u32 dma_sync_size)
  {
-	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
-		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
+	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
+	if (!pool->dma_sync)
+		return;
+
+	dma_sync_size = min(dma_sync_size, pool->p.max_len);
+	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
+					 pool->p.offset, dma_sync_size,
+					 pool->p.dma_dir);
  }
  
  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
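The object-code equivalence falls out of how the generic helpers look once the earlier patches of the series are applied: dma_sync_single_range_for_device() is itself a thin inline that performs the same dma_dev_need_sync() test before calling into the arch code. Roughly, as a sketch rather than the verbatim dma-mapping code:

static inline void dma_sync_single_for_device(struct device *dev,
		dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
	if (dma_dev_need_sync(dev))
		__dma_sync_single_for_device(dev, addr, size, dir);
}

static inline void dma_sync_single_range_for_device(struct device *dev,
		dma_addr_t addr, unsigned long offset, size_t size,
		enum dma_data_direction dir)
{
	/* same check, same call -- only the offset arithmetic differs */
	dma_sync_single_for_device(dev, addr + offset, size, dir);
}

Either spelling in page_pool therefore compiles down to the same need-sync test followed by the same out-of-line call.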
Alexander Lobakin May 6, 2024, 9:38 a.m. UTC | #5
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 2 May 2024 07:58:55 +0200

> Hi Alexander,
> 
> do you have time to resend the series?  I'd love to merge it for
> 6.10 if I can get the updated version ASAP.

Hi!

Sorry, I was on a small vacation.
Do I still have a chance to get this into 6.10?

Thanks,
Olek
Alexander Lobakin May 6, 2024, 9:40 a.m. UTC | #6
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 2 May 2024 16:49:48 +0100

> On 24/04/2024 9:52 am, Alexander Lobakin wrote:
>> From: Alexander Lobakin <aleksander.lobakin@intel.com>
>> Date: Tue, 23 Apr 2024 15:58:31 +0200
>>
>>> We can save a couple more function calls in the Page Pool code if we
>>> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
>>> Move both these checks into an inline wrapper and call the PP wrapper
>>> over the generic DMA sync function only when both are true.
>>> You can't cache the result of dma_need_sync() in &page_pool, as it may
>>> change anytime if an SWIOTLB buffer is allocated or mapped.
>>>
>>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
>>> ---
>>>   net/core/page_pool.c | 31 +++++++++++++++++--------------
>>>   1 file changed, 17 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>>> index 6cf26a68fa91..87319c6365e0 100644
>>> --- a/net/core/page_pool.c
>>> +++ b/net/core/page_pool.c
>>> @@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>>>  	return page;
>>>  }
>>>  
>>> -static void page_pool_dma_sync_for_device(const struct page_pool *pool,
>>> -					  const struct page *page,
>>> -					  unsigned int dma_sync_size)
>>> +static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
>>> +					    const struct page *page,
>>> +					    u32 dma_sync_size)
>>>  {
>>>  	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>>>  
>>>  	dma_sync_size = min(dma_sync_size, pool->p.max_len);
>>> -	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
>>> -					 pool->p.offset, dma_sync_size,
>>> -					 pool->p.dma_dir);
>>> +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
>>> +				     dma_sync_size, pool->p.dma_dir);
>>
>> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
>> isn't defined there, and my CI didn't catch it in time... I'll ifdef
>> this in the next spin after some reviews for this one.
> 
> Hmm, the way things have ended up, do we even need this change? (Other
> than factoring out the pool->dma_sync check, which is clearly nice)
> 
> Since __page_pool_dma_sync_for_device() is never called directly, the
> flow we always get is:
> 
> page_pool_dma_sync_for_device()
>     dma_dev_need_sync()
>         __page_pool_dma_sync_for_device()
>             ... // a handful of trivial arithmetic
>             __dma_sync_single_for_device()
> 
> i.e. still effectively the same order of
> "if (dma_dev_need_sync()) __dma_sync()" invocations as if we'd just used
> the standard dma_sync(), so referencing the unwrapped internals only
> spreads it out a bit more for no real benefit. As an experiment I tried
> the diff below on top, effectively undoing this problematic part, and it
> doesn't seem to make any appreciable difference in a before-and-after
> comparison of the object code, at least for my arm64 build.
> 
> Thanks,
> Robin.
> 
> ----->8-----
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 27f3b6db800e..b8ab7d4ca229 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -398,24 +398,20 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>  	return page;
>  }
>  
> -static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
> -					    const struct page *page,
> -					    u32 dma_sync_size)
> -{
> -	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> -
> -	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> -	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
> -				     dma_sync_size, pool->p.dma_dir);
> -}
> -
>  static __always_inline void

So this would unconditionally inline the whole sync code into the call
sites, while I wanted to give the compiler a chance to uninline
__page_pool_dma_sync_for_device().

>  page_pool_dma_sync_for_device(const struct page_pool *pool,
>  			      const struct page *page,
>  			      u32 dma_sync_size)
>  {
> -	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
> -		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
> +	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> +
> +	if (!pool->dma_sync)
> +		return;
> +
> +	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> +	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
> +					 pool->p.offset, dma_sync_size,
> +					 pool->p.dma_dir);
>  }
>  
>  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

Thanks,
Olek
Christoph Hellwig May 6, 2024, 9:43 a.m. UTC | #7
On Mon, May 06, 2024 at 11:38:17AM +0200, Alexander Lobakin wrote:
> From: Christoph Hellwig <hch@lst.de>
> Date: Thu, 2 May 2024 07:58:55 +0200
> 
> > Hi Alexander,
> > 
> > do you have time to resend the series?  I'd love to merge it for
> > 6.10 if I can get the updated version ASAP.
> 
> Hi!
> 
> Sorry, I was on a small vacation.
> Do I still have a chance to get this into 6.10?

If I can get it merged by tomorrow, yes.

Patch

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6cf26a68fa91..87319c6365e0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -398,16 +398,24 @@  static struct page *__page_pool_get_cached(struct page_pool *pool)
 	return page;
 }
 
-static void page_pool_dma_sync_for_device(const struct page_pool *pool,
-					  const struct page *page,
-					  unsigned int dma_sync_size)
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+					    const struct page *page,
+					    u32 dma_sync_size)
 {
 	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
 
 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
-					 pool->p.offset, dma_sync_size,
-					 pool->p.dma_dir);
+	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
+				     dma_sync_size, pool->p.dma_dir);
+}
+
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+			      const struct page *page,
+			      u32 dma_sync_size)
+{
+	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
+		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
 }
 
 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
@@ -429,8 +437,7 @@  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (page_pool_set_dma_addr(page, dma))
 		goto unmap_failed;
 
-	if (pool->dma_sync)
-		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 	return true;
 
@@ -699,9 +706,7 @@  __page_pool_put_page(struct page_pool *pool, struct page *page,
 	if (likely(__page_pool_page_can_be_recycled(page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
-		if (pool->dma_sync)
-			page_pool_dma_sync_for_device(pool, page,
-						      dma_sync_size);
+		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
 
 		if (allow_direct && page_pool_recycle_in_cache(page, pool))
 			return NULL;
@@ -840,9 +845,7 @@  static struct page *page_pool_drain_frag(struct page_pool *pool,
 		return NULL;
 
 	if (__page_pool_page_can_be_recycled(page)) {
-		if (pool->dma_sync)
-			page_pool_dma_sync_for_device(pool, page, -1);
-
+		page_pool_dma_sync_for_device(pool, page, -1);
 		return page;
 	}