diff mbox series

[net-next,v3,1/4] net: stmmac: Switch to zero-copy in non-XDP RX path

Message ID bd7aabf4d9b6696885922ed4bef8fc95142d3004.1736910454.git.0x1207@gmail.com (mailing list archive)
State New, archived
Headers show
Series net: stmmac: RX performance improvement | expand

Commit Message

Furong Xu Jan. 15, 2025, 3:27 a.m. UTC
Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
be recycled in the upper network stack.

This patch brings ~11.5% driver performance improvement in a TCP RX
throughput test with iPerf tool on a single isolated Cortex-A65 CPU
core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.

Signed-off-by: Furong Xu <0x1207@gmail.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  1 +
 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 26 ++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

Comments

Larysa Zaremba Jan. 15, 2025, 4:58 p.m. UTC | #1
On Wed, Jan 15, 2025 at 11:27:02AM +0800, Furong Xu wrote:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
> 
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
> 
> Signed-off-by: Furong Xu <0x1207@gmail.com>
> Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Reviewed-by: Larysa Zaremba <larysa.zaremba@intel.com>

> ---
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  1 +
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c | 26 ++++++++++++-------
>  2 files changed, 18 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index e8dbce20129c..f05cae103d83 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -126,6 +126,7 @@ struct stmmac_rx_queue {
>  	unsigned int cur_rx;
>  	unsigned int dirty_rx;
>  	unsigned int buf_alloc_num;
> +	unsigned int napi_skb_frag_size;
>  	dma_addr_t dma_rx_phy;
>  	u32 rx_tail_addr;
>  	unsigned int state_saved;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index acd6994c1764..1d98a5e8c98c 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -1341,7 +1341,7 @@ static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
>  	if (stmmac_xdp_is_enabled(priv))
>  		return XDP_PACKET_HEADROOM;
>  
> -	return 0;
> +	return NET_SKB_PAD;
>  }
>  
>  static int stmmac_set_bfsize(int mtu, int bufsize)
> @@ -2040,17 +2040,21 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
>  	struct stmmac_channel *ch = &priv->channel[queue];
>  	bool xdp_prog = stmmac_xdp_is_enabled(priv);
>  	struct page_pool_params pp_params = { 0 };
> -	unsigned int num_pages;
> +	unsigned int dma_buf_sz_pad, num_pages;
>  	unsigned int napi_id;
>  	int ret;
>  
> +	dma_buf_sz_pad = stmmac_rx_offset(priv) + dma_conf->dma_buf_sz +
> +			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	num_pages = DIV_ROUND_UP(dma_buf_sz_pad, PAGE_SIZE);
> +
>  	rx_q->queue_index = queue;
>  	rx_q->priv_data = priv;
> +	rx_q->napi_skb_frag_size = num_pages * PAGE_SIZE;
>  
>  	pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>  	pp_params.pool_size = dma_conf->dma_rx_size;
> -	num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE);
> -	pp_params.order = ilog2(num_pages);
> +	pp_params.order = order_base_2(num_pages);
>  	pp_params.nid = dev_to_node(priv->device);
>  	pp_params.dev = priv->device;
>  	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> @@ -5582,22 +5586,26 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
>  		}
>  
>  		if (!skb) {
> +			unsigned int head_pad_len;
> +
>  			/* XDP program may expand or reduce tail */
>  			buf1_len = ctx.xdp.data_end - ctx.xdp.data;
>  
> -			skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
> +			skb = napi_build_skb(page_address(buf->page),
> +					     rx_q->napi_skb_frag_size);
>  			if (!skb) {
> +				page_pool_recycle_direct(rx_q->page_pool,
> +							 buf->page);
>  				rx_dropped++;
>  				count++;
>  				goto drain_data;
>  			}
>  
>  			/* XDP program may adjust header */
> -			skb_copy_to_linear_data(skb, ctx.xdp.data, buf1_len);
> +			head_pad_len = ctx.xdp.data - ctx.xdp.data_hard_start;
> +			skb_reserve(skb, head_pad_len);
>  			skb_put(skb, buf1_len);
> -
> -			/* Data payload copied into SKB, page ready for recycle */
> -			page_pool_recycle_direct(rx_q->page_pool, buf->page);
> +			skb_mark_for_recycle(skb);
>  			buf->page = NULL;
>  		} else if (buf1_len) {
>  			dma_sync_single_for_cpu(priv->device, buf->addr,
> -- 
> 2.34.1
> 
>
Yanteng Si Jan. 16, 2025, 2:05 a.m. UTC | #2
在 2025/1/15 11:27, Furong Xu 写道:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
> 
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
> 
> Signed-off-by: Furong Xu <0x1207@gmail.com>
> Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Yanteng Si <si.yanteng@linux.dev>

Thanks,
Yanteng
> ---
>   drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  1 +
>   .../net/ethernet/stmicro/stmmac/stmmac_main.c | 26 ++++++++++++-------
>   2 files changed, 18 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index e8dbce20129c..f05cae103d83 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -126,6 +126,7 @@ struct stmmac_rx_queue {
>   	unsigned int cur_rx;
>   	unsigned int dirty_rx;
>   	unsigned int buf_alloc_num;
> +	unsigned int napi_skb_frag_size;
>   	dma_addr_t dma_rx_phy;
>   	u32 rx_tail_addr;
>   	unsigned int state_saved;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index acd6994c1764..1d98a5e8c98c 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -1341,7 +1341,7 @@ static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
>   	if (stmmac_xdp_is_enabled(priv))
>   		return XDP_PACKET_HEADROOM;
>   
> -	return 0;
> +	return NET_SKB_PAD;
>   }
>   
>   static int stmmac_set_bfsize(int mtu, int bufsize)
> @@ -2040,17 +2040,21 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
>   	struct stmmac_channel *ch = &priv->channel[queue];
>   	bool xdp_prog = stmmac_xdp_is_enabled(priv);
>   	struct page_pool_params pp_params = { 0 };
> -	unsigned int num_pages;
> +	unsigned int dma_buf_sz_pad, num_pages;
>   	unsigned int napi_id;
>   	int ret;
>   
> +	dma_buf_sz_pad = stmmac_rx_offset(priv) + dma_conf->dma_buf_sz +
> +			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	num_pages = DIV_ROUND_UP(dma_buf_sz_pad, PAGE_SIZE);
> +
>   	rx_q->queue_index = queue;
>   	rx_q->priv_data = priv;
> +	rx_q->napi_skb_frag_size = num_pages * PAGE_SIZE;
>   
>   	pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>   	pp_params.pool_size = dma_conf->dma_rx_size;
> -	num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE);
> -	pp_params.order = ilog2(num_pages);
> +	pp_params.order = order_base_2(num_pages);
>   	pp_params.nid = dev_to_node(priv->device);
>   	pp_params.dev = priv->device;
>   	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> @@ -5582,22 +5586,26 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
>   		}
>   
>   		if (!skb) {
> +			unsigned int head_pad_len;
> +
>   			/* XDP program may expand or reduce tail */
>   			buf1_len = ctx.xdp.data_end - ctx.xdp.data;
>   
> -			skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
> +			skb = napi_build_skb(page_address(buf->page),
> +					     rx_q->napi_skb_frag_size);
>   			if (!skb) {
> +				page_pool_recycle_direct(rx_q->page_pool,
> +							 buf->page);
>   				rx_dropped++;
>   				count++;
>   				goto drain_data;
>   			}
>   
>   			/* XDP program may adjust header */
> -			skb_copy_to_linear_data(skb, ctx.xdp.data, buf1_len);
> +			head_pad_len = ctx.xdp.data - ctx.xdp.data_hard_start;
> +			skb_reserve(skb, head_pad_len);
>   			skb_put(skb, buf1_len);
> -
> -			/* Data payload copied into SKB, page ready for recycle */
> -			page_pool_recycle_direct(rx_q->page_pool, buf->page);
> +			skb_mark_for_recycle(skb);
>   			buf->page = NULL;
>   		} else if (buf1_len) {
>   			dma_sync_single_for_cpu(priv->device, buf->addr,
Jon Hunter Jan. 23, 2025, 2:06 p.m. UTC | #3
Hi Furong,

On 15/01/2025 03:27, Furong Xu wrote:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
> 
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
> 
> Signed-off-by: Furong Xu <0x1207@gmail.com>
> Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>   drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  1 +
>   .../net/ethernet/stmicro/stmmac/stmmac_main.c | 26 ++++++++++++-------
>   2 files changed, 18 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index e8dbce20129c..f05cae103d83 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -126,6 +126,7 @@ struct stmmac_rx_queue {
>   	unsigned int cur_rx;
>   	unsigned int dirty_rx;
>   	unsigned int buf_alloc_num;
> +	unsigned int napi_skb_frag_size;
>   	dma_addr_t dma_rx_phy;
>   	u32 rx_tail_addr;
>   	unsigned int state_saved;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index acd6994c1764..1d98a5e8c98c 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -1341,7 +1341,7 @@ static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
>   	if (stmmac_xdp_is_enabled(priv))
>   		return XDP_PACKET_HEADROOM;
>   
> -	return 0;
> +	return NET_SKB_PAD;
>   }
>   
>   static int stmmac_set_bfsize(int mtu, int bufsize)
> @@ -2040,17 +2040,21 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
>   	struct stmmac_channel *ch = &priv->channel[queue];
>   	bool xdp_prog = stmmac_xdp_is_enabled(priv);
>   	struct page_pool_params pp_params = { 0 };
> -	unsigned int num_pages;
> +	unsigned int dma_buf_sz_pad, num_pages;
>   	unsigned int napi_id;
>   	int ret;
>   
> +	dma_buf_sz_pad = stmmac_rx_offset(priv) + dma_conf->dma_buf_sz +
> +			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	num_pages = DIV_ROUND_UP(dma_buf_sz_pad, PAGE_SIZE);
> +
>   	rx_q->queue_index = queue;
>   	rx_q->priv_data = priv;
> +	rx_q->napi_skb_frag_size = num_pages * PAGE_SIZE;
>   
>   	pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>   	pp_params.pool_size = dma_conf->dma_rx_size;
> -	num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE);
> -	pp_params.order = ilog2(num_pages);
> +	pp_params.order = order_base_2(num_pages);
>   	pp_params.nid = dev_to_node(priv->device);
>   	pp_params.dev = priv->device;
>   	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> @@ -5582,22 +5586,26 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
>   		}
>   
>   		if (!skb) {
> +			unsigned int head_pad_len;
> +
>   			/* XDP program may expand or reduce tail */
>   			buf1_len = ctx.xdp.data_end - ctx.xdp.data;
>   
> -			skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
> +			skb = napi_build_skb(page_address(buf->page),
> +					     rx_q->napi_skb_frag_size);
>   			if (!skb) {
> +				page_pool_recycle_direct(rx_q->page_pool,
> +							 buf->page);
>   				rx_dropped++;
>   				count++;
>   				goto drain_data;
>   			}
>   
>   			/* XDP program may adjust header */
> -			skb_copy_to_linear_data(skb, ctx.xdp.data, buf1_len);
> +			head_pad_len = ctx.xdp.data - ctx.xdp.data_hard_start;
> +			skb_reserve(skb, head_pad_len);
>   			skb_put(skb, buf1_len);
> -
> -			/* Data payload copied into SKB, page ready for recycle */
> -			page_pool_recycle_direct(rx_q->page_pool, buf->page);
> +			skb_mark_for_recycle(skb);
>   			buf->page = NULL;
>   		} else if (buf1_len) {
>   			dma_sync_single_for_cpu(priv->device, buf->addr,


We have noticed a boot regression on -next when booting with NFS. Bisect 
is pointing to this commit and reverting this on top of -next does fix 
the problem.

I only see this on Tegra234 which uses the 
drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c driver. Tegra194 which 
uses the drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c driver 
appears to be fine.

I tried booting without NFS and the network interface seems to come up 
fine. It only seems to be a problem when booting with NFS. However, I 
guess I have also not stressed the network interface when booting 
without NFS. Have you tried booting with NFS?

Thanks
Jon
Furong Xu Jan. 23, 2025, 4:35 p.m. UTC | #4
Hi Jon,

On Thu, 23 Jan 2025 14:06:42 +0000, Jon Hunter wrote: 
> We have noticed a boot regression on -next when booting with NFS.
> Bisect is pointing to this commit and reverting this on top of -next
> does fix the problem.
> 
> I only see this on Tegra234 which uses the 
> drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c driver. Tegra194
> which uses the
> drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c driver
> appears to be fine.

What is the MTU of Tegra234 and NFS server? Are they both 1500?

Could you please try attached patch to confirm if this regression is
fixed?

If the attached patch fixes this regression, then it seems to be a
cache coherence issue specific to Tegra234: since this patch avoids
the memcpy, the page buffers may of course be modified by the upper
network stack, and then cache lines of the page buffers may become
dirty. But with this patch reverted, cache lines of the page buffers
never become dirty; this is the core difference.
Brad Griffis Jan. 23, 2025, 7:53 p.m. UTC | #5
On 1/23/25 08:35, Furong Xu wrote:
> What is the MTU of Tegra234 and NFS server? Are they both 1500?

I see the same issue.  Yes, both are 1500.

> Could you please try attached patch to confirm if this regression is
> fixed?

Patch fixes the issue.

> If the attached patch fixes this regression, and so it seems to be a
> cache coherence issue specific to Tegra234, since this patch avoid
> memcpy and the page buffers may be modified by upper network stack of
> course, then cache lines of page buffers may become dirty. But by
> reverting this patch, cache lines of page buffers never become dirty,
> this is the core difference.

Thanks for these insights. I don't have specific experience in this 
driver, but I see we have dma-coherent turned on for this driver in our 
downstream device tree files (i.e. dtbs that coincide with our 
out-of-tree implementation of this driver).  I went back to the original 
code and verified that the issue was there. I did a new test where I 
added dma-coherent to this ethernet node in the dtb and retested. It worked!

Just to clarify, the patch that you had us try was not intended as an 
actual fix, correct? It was only for diagnostic purposes, i.e. to see if 
there is some kind of cache coherence issue, which seems to be the case? 
  So perhaps the only fix needed is to add dma-coherent to our device tree?
Andrew Lunn Jan. 23, 2025, 9:48 p.m. UTC | #6
> Just to clarify, the patch that you had us try was not intended as an actual
> fix, correct? It was only for diagnostic purposes, i.e. to see if there is
> some kind of cache coherence issue, which seems to be the case?  So perhaps
> the only fix needed is to add dma-coherent to our device tree?

That sounds quite error prone. How many other DT blobs are missing the
property? If the memory should be coherent, i would expect the driver
to allocate coherent memory. Or the driver needs to handle
non-coherent memory and add the necessary flush/invalidates etc.

	Andrew
Furong Xu Jan. 24, 2025, 1:53 a.m. UTC | #7
On Thu, 23 Jan 2025 11:53:21 -0800, Brad Griffis <bgriffis@nvidia.com> wrote:

> On 1/23/25 08:35, Furong Xu wrote:
> > What is the MTU of Tegra234 and NFS server? Are they both 1500?  
> 
> I see the same issue.  Yes, both are 1500.
> 
> > Could you please try attached patch to confirm if this regression is
> > fixed?  
> 
> Patch fixes the issue.
> 
> > If the attached patch fixes this regression, and so it seems to be a
> > cache coherence issue specific to Tegra234, since this patch avoid
> > memcpy and the page buffers may be modified by upper network stack of
> > course, then cache lines of page buffers may become dirty. But by
> > reverting this patch, cache lines of page buffers never become dirty,
> > this is the core difference.  
> 
> Thanks for these insights. I don't have specific experience in this 
> driver, but I see we have dma-coherent turned on for this driver in our 
> downstream device tree files (i.e. dtbs that coincide with our 
> out-of-tree implementation of this driver).  I went back to the original 
> code and verified that the issue was there. I did a new test where I 
> added dma-coherent to this ethernet node in the dtb and retested. It worked!
> 
> Just to clarify, the patch that you had us try was not intended as an 
> actual fix, correct? It was only for diagnostic purposes, i.e. to see if 
> there is some kind of cache coherence issue, which seems to be the case? 

It is not an actual fix, it is only for diagnostic purposes.

>   So perhaps the only fix needed is to add dma-coherent to our device tree?

Yes, adding dma-coherent to the ethernet node is the correct fix.
Furong Xu Jan. 24, 2025, 2:42 a.m. UTC | #8
On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch> wrote:

> > Just to clarify, the patch that you had us try was not intended as an actual
> > fix, correct? It was only for diagnostic purposes, i.e. to see if there is
> > some kind of cache coherence issue, which seems to be the case?  So perhaps
> > the only fix needed is to add dma-coherent to our device tree?  
> 
> That sounds quite error prone. How many other DT blobs are missing the
> property? If the memory should be coherent, i would expect the driver
> to allocate coherent memory. Or the driver needs to handle
> non-coherent memory and add the necessary flush/invalidates etc.

stmmac driver does the necessary cache flush/invalidates to maintain cache lines
explicitly.

See dma_sync_single_for_cpu():
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/include/linux/dma-mapping.h#n297

dma_dev_need_sync() is supposed to return false for Tegra234, since the ethernet
controller on Tegra234 is dma coherent by SoC design as Brad said their
downstream device tree has dma-coherent turned on by default, and after add
dma-coherent to mainline ethernet node, stmmac driver works fine.
But dma-coherent property is missing in mainline Tegra234 ethernet device tree
node, dma_dev_need_sync() returns true and this is not the expected behavior.

The dma-coherent property in device tree node is SoC specific, so only the
vendors know if their stmmac ethernet controller is dma coherent and
whether their device tree are missing the critical dma-coherent property.
Thierry Reding Jan. 24, 2025, 1:15 p.m. UTC | #9
On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch> wrote:
> 
> > > Just to clarify, the patch that you had us try was not intended as an actual
> > > fix, correct? It was only for diagnostic purposes, i.e. to see if there is
> > > some kind of cache coherence issue, which seems to be the case?  So perhaps
> > > the only fix needed is to add dma-coherent to our device tree?  
> > 
> > That sounds quite error prone. How many other DT blobs are missing the
> > property? If the memory should be coherent, i would expect the driver
> > to allocate coherent memory. Or the driver needs to handle
> > non-coherent memory and add the necessary flush/invalidates etc.
> 
> stmmac driver does the necessary cache flush/invalidates to maintain cache lines
> explicitly.
> 
> See dma_sync_single_for_cpu():
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/include/linux/dma-mapping.h#n297
> 
> dma_dev_need_sync() is supposed to return false for Tegra234, since the ethernet
> controller on Tegra234 is dma coherent by SoC design as Brad said their
> downstream device tree has dma-coherent turned on by default, and after add
> dma-coherent to mainline ethernet node, stmmac driver works fine.
> But dma-coherent property is missing in mainline Tegra234 ethernet device tree
> node, dma_dev_need_sync() returns true and this is not the expected behavior.

My understanding is that the Ethernet controller itself is not DMA
coherent. Instead, it is by accessing memory through the IOMMU that the
accesses become coherent. It's technically possible for the IOMMU to be
disabled via command-line, or simply compile out the driver for it, and
the devices are supposed to keep working with it (though it's not a
recommended configuration), so exclusively relying on the presence of
an IOMMU node or even a dma-coherent property is bound to fail in these
corner cases. We don't currently have a good way of describing this in
device tree, but a discussion with the DT maintainers is probably
warranted.

> The dma-coherent property in device tree node is SoC specific, so only the
> vendors know if their stmmac ethernet controller is dma coherent and
> whether their device tree are missing the critical dma-coherent property.

What I fail to understand is how dma-coherent can make a difference in
this case. If it's not present, then the driver is supposed to maintain
caches explicitly. But it seems like explicit cache maintenance actually
causes things to break. So do we need to assume that DMA coherent
devices in generally won't work if the driver manages caches explicitly?

I always figured dma-coherent was more of an optimization, but this
seems to indicate it isn't.

Thierry
Andrew Lunn Jan. 24, 2025, 3:14 p.m. UTC | #10
> It is not an actual fix, it is only for diagnostic purposes.
> 
> >   So perhaps the only fix needed is to add dma-coherent to our device tree?
> 
> Yes, add dma-coherent to ethernet node is the correct fix.

What do you think about the comment i made?

     Andrew
Ido Schimmel Jan. 25, 2025, 10:20 a.m. UTC | #11
On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch> wrote:
> 
> > > Just to clarify, the patch that you had us try was not intended as an actual
> > > fix, correct? It was only for diagnostic purposes, i.e. to see if there is
> > > some kind of cache coherence issue, which seems to be the case?  So perhaps
> > > the only fix needed is to add dma-coherent to our device tree?  
> > 
> > That sounds quite error prone. How many other DT blobs are missing the
> > property? If the memory should be coherent, i would expect the driver
> > to allocate coherent memory. Or the driver needs to handle
> > non-coherent memory and add the necessary flush/invalidates etc.
> 
> stmmac driver does the necessary cache flush/invalidates to maintain cache lines
> explicitly.

Given the problem happens when the kernel performs syncing, is it
possible that there is a problem with how the syncing is performed?

I am not familiar with this driver, but it seems to allocate multiple
buffers per packet when split header is enabled and these buffers are
allocated from the same page pool (see stmmac_init_rx_buffers()).
Despite that, the driver is creating the page pool with a non-zero
offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
headroom, which is only present in the head buffer.

I asked Thierry to test the following patch [1] and initial testing
seems OK. He also confirmed that "SPH feature enabled" shows up in the
kernel log.

BTW, the commit that added split header support (67afd6d1cfdf0) says
that it "reduces CPU usage because without the feature all the entire
packet is memcpy'ed, while that with the feature only the header is".
This is no longer correct after your patch, so is there still value in
the split header feature? With two large buffers being allocated from
the same page pool for each packet (headers + data), the end result is
an inflated skb->truesize, no?

[1]
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index edbf8994455d..42170ce2927e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2091,8 +2091,8 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
 	pp_params.nid = dev_to_node(priv->device);
 	pp_params.dev = priv->device;
 	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-	pp_params.offset = stmmac_rx_offset(priv);
-	pp_params.max_len = dma_conf->dma_buf_sz;
+	pp_params.offset = 0;
+	pp_params.max_len = num_pages * PAGE_SIZE;
 
 	rx_q->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rx_q->page_pool)) {
Furong Xu Jan. 25, 2025, 2:43 p.m. UTC | #12
Hi Ido

On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:

> On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> > On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
> > wrote: 
> > > > Just to clarify, the patch that you had us try was not intended
> > > > as an actual fix, correct? It was only for diagnostic purposes,
> > > > i.e. to see if there is some kind of cache coherence issue,
> > > > which seems to be the case?  So perhaps the only fix needed is
> > > > to add dma-coherent to our device tree?    
> > > 
> > > That sounds quite error prone. How many other DT blobs are
> > > missing the property? If the memory should be coherent, i would
> > > expect the driver to allocate coherent memory. Or the driver
> > > needs to handle non-coherent memory and add the necessary
> > > flush/invalidates etc.  
> > 
> > stmmac driver does the necessary cache flush/invalidates to
> > maintain cache lines explicitly.  
> 
> Given the problem happens when the kernel performs syncing, is it
> possible that there is a problem with how the syncing is performed?
> 
> I am not familiar with this driver, but it seems to allocate multiple
> buffers per packet when split header is enabled and these buffers are
> allocated from the same page pool (see stmmac_init_rx_buffers()).
> Despite that, the driver is creating the page pool with a non-zero
> offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
> headroom, which is only present in the head buffer.
> 
> I asked Thierry to test the following patch [1] and initial testing
> seems OK. He also confirmed that "SPH feature enabled" shows up in the
> kernel log.
> BTW, the commit that added split header support (67afd6d1cfdf0) says
> that it "reduces CPU usage because without the feature all the entire
> packet is memcpy'ed, while that with the feature only the header is".
> This is no longer correct after your patch, so is there still value in
> the split header feature? With two large buffers being allocated from

Thanks for these great insights!

Yes, when the "SPH feature enabled" message appears, the offset is not
correct after my patch: pp_params.offset should be updated to match the
offset of the split payload.

But I would like to keep pp_params.max_len at dma_conf->dma_buf_sz,
since the sizes of both the header and the payload are limited to
dma_conf->dma_buf_sz by the DMA engine; no more than
dma_conf->dma_buf_sz bytes will be written into a page buffer.
So my patch would look like [2]:

BTW, the split header feature is very useful in certain cases, so the
stmmac driver should continue to support it.

[2]
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index edbf8994455d..def0d893efbb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2091,7 +2091,7 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
        pp_params.nid = dev_to_node(priv->device);
        pp_params.dev = priv->device;
        pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-       pp_params.offset = stmmac_rx_offset(priv);
+       pp_params.offset = priv->sph ? 0 : stmmac_rx_offset(priv);
        pp_params.max_len = dma_conf->dma_buf_sz;

        rx_q->page_pool = page_pool_create(&pp_params);
Furong Xu Jan. 25, 2025, 3:03 p.m. UTC | #13
Hi Thierry

On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:

> On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> > On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
> > wrote: 
> > > > Just to clarify, the patch that you had us try was not intended
> > > > as an actual fix, correct? It was only for diagnostic purposes,
> > > > i.e. to see if there is some kind of cache coherence issue,
> > > > which seems to be the case?  So perhaps the only fix needed is
> > > > to add dma-coherent to our device tree?    
> > > 
> > > That sounds quite error prone. How many other DT blobs are
> > > missing the property? If the memory should be coherent, i would
> > > expect the driver to allocate coherent memory. Or the driver
> > > needs to handle non-coherent memory and add the necessary
> > > flush/invalidates etc.  
> > 
> > stmmac driver does the necessary cache flush/invalidates to
> > maintain cache lines explicitly.  
> 
> Given the problem happens when the kernel performs syncing, is it
> possible that there is a problem with how the syncing is performed?
> 
> I am not familiar with this driver, but it seems to allocate multiple
> buffers per packet when split header is enabled and these buffers are
> allocated from the same page pool (see stmmac_init_rx_buffers()).
> Despite that, the driver is creating the page pool with a non-zero
> offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
> headroom, which is only present in the head buffer.
> 
> I asked Thierry to test the following patch [1] and initial testing
> seems OK. He also confirmed that "SPH feature enabled" shows up in the
> kernel log.

It is recommended to disable the "SPH feature" by default unless some
certain cases depend on it. Like Ido said, two large buffers being
allocated from the same page pool for each packet, this is a huge waste
of memory, and brings performance drops for most of general cases.

Our downstream driver and two mainline drivers disable SPH by default:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471
Andrew Lunn Jan. 25, 2025, 7:08 p.m. UTC | #14
> It is recommended to disable the "SPH feature" by default unless some
> certain cases depend on it. Like Ido said, two large buffers being
> allocated from the same page pool for each packet, this is a huge waste
> of memory, and brings performance drops for most of general cases.

I don't know this driver, but it looks like SPH is required for
NETIF_F_GRO? Can you add this flag to hw_features, but not
wanted_features and leave SPH disabled until ethtool is used to enable
GRO?

Are there other use cases where SPH is needed?

	Andrew
Furong Xu Jan. 26, 2025, 2:39 a.m. UTC | #15
On Sat, 25 Jan 2025 20:08:12 +0100, Andrew Lunn <andrew@lunn.ch> wrote:

> > It is recommended to disable the "SPH feature" by default unless
> > some certain cases depend on it. Like Ido said, two large buffers
> > being allocated from the same page pool for each packet, this is a
> > huge waste of memory, and brings performance drops for most of
> > general cases.  
> 
> I don't know this driver, but it looks like SPH is required for
> NETIF_F_GRO? Can you add this flag to hw_features, but not
> wanted_features and leave SPH disabled until ethtool is used to enable
> GRO?

SPH has its own ethtool command, stmmac driver does not implement yet.
see:
https://patchwork.kernel.org/project/netdevbpf/cover/20250114142852.3364986-1-ap420073@gmail.com/

> Are there other use cases where SPH is needed?

https://patchwork.kernel.org/project/netdevbpf/cover/20240910171458.219195-1-almasrymina@google.com/
https://patchwork.kernel.org/project/netdevbpf/cover/20250116231704.2402455-1-dw@davidwei.uk/

The stmmac driver does not support either of them yet, but it will someday :)
Ido Schimmel Jan. 26, 2025, 8:41 a.m. UTC | #16
Hi,

On Sat, Jan 25, 2025 at 10:43:42PM +0800, Furong Xu wrote:
> Hi Ido
> 
> On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
> 
> > On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> > > On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
> > > wrote: 
> > > > > Just to clarify, the patch that you had us try was not intended
> > > > > as an actual fix, correct? It was only for diagnostic purposes,
> > > > > i.e. to see if there is some kind of cache coherence issue,
> > > > > which seems to be the case?  So perhaps the only fix needed is
> > > > > to add dma-coherent to our device tree?    
> > > > 
> > > > That sounds quite error prone. How many other DT blobs are
> > > > missing the property? If the memory should be coherent, i would
> > > > expect the driver to allocate coherent memory. Or the driver
> > > > needs to handle non-coherent memory and add the necessary
> > > > flush/invalidates etc.  
> > > 
> > > stmmac driver does the necessary cache flush/invalidates to
> > > maintain cache lines explicitly.  
> > 
> > Given the problem happens when the kernel performs syncing, is it
> > possible that there is a problem with how the syncing is performed?
> > 
> > I am not familiar with this driver, but it seems to allocate multiple
> > buffers per packet when split header is enabled and these buffers are
> > allocated from the same page pool (see stmmac_init_rx_buffers()).
> > Despite that, the driver is creating the page pool with a non-zero
> > offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
> > headroom, which is only present in the head buffer.
> > 
> > I asked Thierry to test the following patch [1] and initial testing
> > seems OK. He also confirmed that "SPH feature enabled" shows up in the
> > kernel log.
> > BTW, the commit that added split header support (67afd6d1cfdf0) says
> > that it "reduces CPU usage because without the feature all the entire
> > packet is memcpy'ed, while that with the feature only the header is".
> > This is no longer correct after your patch, so is there still value in
> > the split header feature? With two large buffers being allocated from
> 
> Thanks for these great insights!
> 
> Yes, when "SPH feature enabled", it is not correct after my patch,
> pp_params.offset should be updated to match the offset of split payload.
> 
> But I would like to let pp_params.max_len remains to
> dma_conf->dma_buf_sz since the sizes of both header and payload are
> limited to dma_conf->dma_buf_sz by DMA engine, no more than
> dma_conf->dma_buf_sz bytes will be written into a page buffer.
> So my patch would be like [2]:
> 
> BTW, the split header feature will be very useful on some certain
> cases, stmmac driver should support this feature always.
> 
> [2]
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index edbf8994455d..def0d893efbb 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -2091,7 +2091,7 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
>         pp_params.nid = dev_to_node(priv->device);
>         pp_params.dev = priv->device;
>         pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
> -       pp_params.offset = stmmac_rx_offset(priv);
> +       pp_params.offset = priv->sph ? 0 : stmmac_rx_offset(priv);

SPH is the only scenario in which the driver uses multiple buffers per
packet?

>         pp_params.max_len = dma_conf->dma_buf_sz;

Are you sure this is correct? Page pool documentation says that "For
pages recycled on the XDP xmit and skb paths the page pool will use the
max_len member of struct page_pool_params to decide how much of the page
needs to be synced (starting at offset)" [1].

While "no more than dma_conf->dma_buf_sz bytes will be written into a
page buffer", for the head buffer they will be written starting at a
non-zero offset unlike buffers used for the data, no?

[1] https://docs.kernel.org/networking/page_pool.html#dma-sync
Furong Xu Jan. 26, 2025, 10:37 a.m. UTC | #17
On Sun, 26 Jan 2025 10:41:23 +0200, Ido Schimmel wrote:
 
> SPH is the only scenario in which the driver uses multiple buffers per
> packet?

Yes.

Jumbo mode may use multiple buffers per packet too, but they are
high order pages, just like a single page in a page pool when using
a standard MTU.

> >         pp_params.max_len = dma_conf->dma_buf_sz;  
> 
> Are you sure this is correct? Page pool documentation says that "For
> pages recycled on the XDP xmit and skb paths the page pool will use
> the max_len member of struct page_pool_params to decide how much of
> the page needs to be synced (starting at offset)" [1].

Page pool must sync the area of the buffer that both the DMA engine and
the CPU may touch; the other areas are CPU-exclusive, so skipping the
sync for them seems better.

> While "no more than dma_conf->dma_buf_sz bytes will be written into a
> page buffer", for the head buffer they will be written starting at a
> non-zero offset unlike buffers used for the data, no?

Correct, they have different offsets.

The "SPH feature" splits header into buf->page (non-zero offset) and
splits payload into buf->sec_page (zero offset).

For buf->page, pp_params.max_len should be the size of L3/L4 header,
and with a offset of NET_SKB_PAD.

For buf->sec_page, pp_params.max_len should be dma_conf->dma_buf_sz,
and with a offset of 0.

This is always true:
sizeof(L3/L4 header) + NET_SKB_PAD < dma_conf->dma_buf_sz + 0

pp_params.max_len = dma_conf->dma_buf_sz;
makes things simpler :)
Ido Schimmel Jan. 26, 2025, 11:35 a.m. UTC | #18
On Sun, Jan 26, 2025 at 06:37:14PM +0800, Furong Xu wrote:
> The "SPH feature" splits header into buf->page (non-zero offset) and
> splits payload into buf->sec_page (zero offset).
> 
> For buf->page, pp_params.max_len should be the size of L3/L4 header,
> and with a offset of NET_SKB_PAD.
> 
> For buf->sec_page, pp_params.max_len should be dma_conf->dma_buf_sz,
> and with a offset of 0.
> 
> This is always true:
> sizeof(L3/L4 header) + NET_SKB_PAD < dma_conf->dma_buf_sz + 0

Thanks, understood, but are there situations where the device is unable
to split a packet? For example, a large L2 packet. I am trying to
understand if there are situations where the device will write more than
"dma_conf->dma_buf_sz - NET_SKB_PAD" to the head buffer.
Furong Xu Jan. 26, 2025, 12:56 p.m. UTC | #19
On Sun, 26 Jan 2025 13:35:45 +0200, Ido Schimmel wrote:

> On Sun, Jan 26, 2025 at 06:37:14PM +0800, Furong Xu wrote:
> > The "SPH feature" splits header into buf->page (non-zero offset) and
> > splits payload into buf->sec_page (zero offset).
> > 
> > For buf->page, pp_params.max_len should be the size of L3/L4 header,
> > and with a offset of NET_SKB_PAD.
> > 
> > For buf->sec_page, pp_params.max_len should be dma_conf->dma_buf_sz,
> > and with a offset of 0.
> > 
> > This is always true:
> > sizeof(L3/L4 header) + NET_SKB_PAD < dma_conf->dma_buf_sz + 0  
> 
> Thanks, understood, but are there situations where the device is
> unable to split a packet? For example, a large L2 packet. I am trying
> to understand if there are situations where the device will write
> more than "dma_conf->dma_buf_sz - NET_SKB_PAD" to the head buffer.

Nice catch!
When receiving a large L2/non-IP packet, more than "dma_conf->dma_buf_sz
- NET_SKB_PAD" bytes will be written.

So we should:
pp_params.max_len = dma_conf->dma_buf_sz + stmmac_rx_offset(priv);

Thanks a lot, Ido
Have a nice weekend :)
Thierry Reding Jan. 27, 2025, 1:28 p.m. UTC | #20
On Sat, Jan 25, 2025 at 11:03:47PM +0800, Furong Xu wrote:
> Hi Thierry
> 
> On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
> 
> > On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> > > On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
> > > wrote: 
> > > > > Just to clarify, the patch that you had us try was not intended
> > > > > as an actual fix, correct? It was only for diagnostic purposes,
> > > > > i.e. to see if there is some kind of cache coherence issue,
> > > > > which seems to be the case?  So perhaps the only fix needed is
> > > > > to add dma-coherent to our device tree?    
> > > > 
> > > > That sounds quite error prone. How many other DT blobs are
> > > > missing the property? If the memory should be coherent, i would
> > > > expect the driver to allocate coherent memory. Or the driver
> > > > needs to handle non-coherent memory and add the necessary
> > > > flush/invalidates etc.  
> > > 
> > > stmmac driver does the necessary cache flush/invalidates to
> > > maintain cache lines explicitly.  
> > 
> > Given the problem happens when the kernel performs syncing, is it
> > possible that there is a problem with how the syncing is performed?
> > 
> > I am not familiar with this driver, but it seems to allocate multiple
> > buffers per packet when split header is enabled and these buffers are
> > allocated from the same page pool (see stmmac_init_rx_buffers()).
> > Despite that, the driver is creating the page pool with a non-zero
> > offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
> > headroom, which is only present in the head buffer.
> > 
> > I asked Thierry to test the following patch [1] and initial testing
> > seems OK. He also confirmed that "SPH feature enabled" shows up in the
> > kernel log.
> 
> It is recommended to disable the "SPH feature" by default unless some
> certain cases depend on it. Like Ido said, two large buffers being
> allocated from the same page pool for each packet, this is a huge waste
> of memory, and brings performance drops for most of general cases.
> 
> Our downstream driver and two mainline drivers disable SPH by default:
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471

Okay, that's something we can look into changing. What would be an
example of a use-case depending on SPH? Also, isn't this something
that should be a policy that users can configure?

Irrespective of that we should fix the problems we are seeing with
SPH enabled.

Thierry
Lucas Stach Jan. 28, 2025, 8:04 p.m. UTC | #21
Am Freitag, dem 24.01.2025 um 14:15 +0100 schrieb Thierry Reding:
> 
[...]
> > The dma-coherent property in device tree node is SoC specific, so only the
> > vendors know if their stmmac ethernet controller is dma coherent and
> > whether their device tree are missing the critical dma-coherent property.
> 
> What I fail to understand is how dma-coherent can make a difference in
> this case. If it's not present, then the driver is supposed to maintain
> caches explicitly. But it seems like explicit cache maintenance actually
> causes things to break. So do we need to assume that DMA coherent
> devices in generally won't work if the driver manages caches explicitly?
> 
> I always figured dma-coherent was more of an optimization, but this
> seems to indicate it isn't.

There is a real failure scenario when the device is actually dma-
coherent, but the DT claims it isn't: If a location from e.g. a receive
buffer is already allocated in the cache, the write from the device
will update the cache line and not go out to memory. If you then do the
usual non-coherent cache maintenance, i.e. invalidate the RX buffer
area after the device indicated the reception of the buffer, you will
destroy the updated data in the cache and read stale data from memory.

Regards,
Lucas
Jon Hunter Jan. 29, 2025, 2:51 p.m. UTC | #22
Hi Furong,

On 27/01/2025 13:28, Thierry Reding wrote:
> On Sat, Jan 25, 2025 at 11:03:47PM +0800, Furong Xu wrote:
>> Hi Thierry
>>
>> On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
>>
>>> On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
>>>> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
>>>> wrote:
>>>>>> Just to clarify, the patch that you had us try was not intended
>>>>>> as an actual fix, correct? It was only for diagnostic purposes,
>>>>>> i.e. to see if there is some kind of cache coherence issue,
>>>>>> which seems to be the case?  So perhaps the only fix needed is
>>>>>> to add dma-coherent to our device tree?
>>>>>
>>>>> That sounds quite error prone. How many other DT blobs are
>>>>> missing the property? If the memory should be coherent, i would
>>>>> expect the driver to allocate coherent memory. Or the driver
>>>>> needs to handle non-coherent memory and add the necessary
>>>>> flush/invalidates etc.
>>>>
>>>> stmmac driver does the necessary cache flush/invalidates to
>>>> maintain cache lines explicitly.
>>>
>>> Given the problem happens when the kernel performs syncing, is it
>>> possible that there is a problem with how the syncing is performed?
>>>
>>> I am not familiar with this driver, but it seems to allocate multiple
>>> buffers per packet when split header is enabled and these buffers are
>>> allocated from the same page pool (see stmmac_init_rx_buffers()).
>>> Despite that, the driver is creating the page pool with a non-zero
>>> offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
>>> headroom, which is only present in the head buffer.
>>>
>>> I asked Thierry to test the following patch [1] and initial testing
>>> seems OK. He also confirmed that "SPH feature enabled" shows up in the
>>> kernel log.
>>
>> It is recommended to disable the "SPH feature" by default unless some
>> certain cases depend on it. Like Ido said, two large buffers being
>> allocated from the same page pool for each packet, this is a huge waste
>> of memory, and brings performance drops for most of general cases.
>>
>> Our downstream driver and two mainline drivers disable SPH by default:
>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471
> 
> Okay, that's something we can look into changing. What would be an
> example of a use-case depending on SPH? Also, isn't this something
> that should be a policy that users can configure?
> 
> Irrespective of that we should fix the problems we are seeing with
> SPH enabled.


Any update on this?

Thanks
Jon
Furong Xu Feb. 7, 2025, 9:07 a.m. UTC | #23
Hi Jon,

On Wed, 29 Jan 2025 14:51:35 +0000, Jon Hunter <jonathanh@nvidia.com> wrote:
> Hi Furong,
> 
> On 27/01/2025 13:28, Thierry Reding wrote:
> > On Sat, Jan 25, 2025 at 11:03:47PM +0800, Furong Xu wrote:  
> >> Hi Thierry
> >>
> >> On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
> >>  
> >>> On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:  
> >>>> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
> >>>> wrote:  
> >>>>>> Just to clarify, the patch that you had us try was not intended
> >>>>>> as an actual fix, correct? It was only for diagnostic purposes,
> >>>>>> i.e. to see if there is some kind of cache coherence issue,
> >>>>>> which seems to be the case?  So perhaps the only fix needed is
> >>>>>> to add dma-coherent to our device tree?  
> >>>>>
> >>>>> That sounds quite error prone. How many other DT blobs are
> >>>>> missing the property? If the memory should be coherent, i would
> >>>>> expect the driver to allocate coherent memory. Or the driver
> >>>>> needs to handle non-coherent memory and add the necessary
> >>>>> flush/invalidates etc.  
> >>>>
> >>>> stmmac driver does the necessary cache flush/invalidates to
> >>>> maintain cache lines explicitly.  
> >>>
> >>> Given the problem happens when the kernel performs syncing, is it
> >>> possible that there is a problem with how the syncing is performed?
> >>>
> >>> I am not familiar with this driver, but it seems to allocate multiple
> >>> buffers per packet when split header is enabled and these buffers are
> >>> allocated from the same page pool (see stmmac_init_rx_buffers()).
> >>> Despite that, the driver is creating the page pool with a non-zero
> >>> offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
> >>> headroom, which is only present in the head buffer.
> >>>
> >>> I asked Thierry to test the following patch [1] and initial testing
> >>> seems OK. He also confirmed that "SPH feature enabled" shows up in the
> >>> kernel log.  
> >>
> >> It is recommended to disable the "SPH feature" by default unless some
> >> certain cases depend on it. Like Ido said, two large buffers being
> >> allocated from the same page pool for each packet, this is a huge waste
> >> of memory, and brings performance drops for most of general cases.
> >>
> >> Our downstream driver and two mainline drivers disable SPH by default:
> >> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
> >> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471  
> > 
> > Okay, that's something we can look into changing. What would be an
> > example of a use-case depending on SPH? Also, isn't this something
> > that should be a policy that users can configure?
> > 
> > Irrespective of that we should fix the problems we are seeing with
> > SPH enabled.  
> 
> 
> Any update on this?

Sorry for my late response, I was on Chinese New Year holiday.

The fix is sent, and it will be so nice to have your Tested-by: tag there:
https://lore.kernel.org/all/20250207085639.13580-1-0x1207@gmail.com/
Jon Hunter Feb. 7, 2025, 1:42 p.m. UTC | #24
On 07/02/2025 09:07, Furong Xu wrote:
> Hi Jon,
> 
> On Wed, 29 Jan 2025 14:51:35 +0000, Jon Hunter <jonathanh@nvidia.com> wrote:
>> Hi Furong,
>>
>> On 27/01/2025 13:28, Thierry Reding wrote:
>>> On Sat, Jan 25, 2025 at 11:03:47PM +0800, Furong Xu wrote:
>>>> Hi Thierry
>>>>
>>>> On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
>>>>   
>>>>> On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
>>>>>> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch>
>>>>>> wrote:
>>>>>>>> Just to clarify, the patch that you had us try was not intended
>>>>>>>> as an actual fix, correct? It was only for diagnostic purposes,
>>>>>>>> i.e. to see if there is some kind of cache coherence issue,
>>>>>>>> which seems to be the case?  So perhaps the only fix needed is
>>>>>>>> to add dma-coherent to our device tree?
>>>>>>>
>>>>>>> That sounds quite error prone. How many other DT blobs are
>>>>>>> missing the property? If the memory should be coherent, i would
>>>>>>> expect the driver to allocate coherent memory. Or the driver
>>>>>>> needs to handle non-coherent memory and add the necessary
>>>>>>> flush/invalidates etc.
>>>>>>
>>>>>> stmmac driver does the necessary cache flush/invalidates to
>>>>>> maintain cache lines explicitly.
>>>>>
>>>>> Given the problem happens when the kernel performs syncing, is it
>>>>> possible that there is a problem with how the syncing is performed?
>>>>>
>>>>> I am not familiar with this driver, but it seems to allocate multiple
>>>>> buffers per packet when split header is enabled and these buffers are
>>>>> allocated from the same page pool (see stmmac_init_rx_buffers()).
>>>>> Despite that, the driver is creating the page pool with a non-zero
>>>>> offset (see __alloc_dma_rx_desc_resources()) to avoid syncing the
>>>>> headroom, which is only present in the head buffer.
>>>>>
>>>>> I asked Thierry to test the following patch [1] and initial testing
>>>>> seems OK. He also confirmed that "SPH feature enabled" shows up in the
>>>>> kernel log.
>>>>
>>>> It is recommended to disable the "SPH feature" by default unless some
>>>> certain cases depend on it. Like Ido said, two large buffers being
>>>> allocated from the same page pool for each packet, this is a huge waste
>>>> of memory, and brings performance drops for most of general cases.
>>>>
>>>> Our downstream driver and two mainline drivers disable SPH by default:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471
>>>
>>> Okay, that's something we can look into changing. What would be an
>>> example of a use-case depending on SPH? Also, isn't this something
>>> that should be a policy that users can configure?
>>>
>>> Irrespective of that we should fix the problems we are seeing with
>>> SPH enabled.
>>
>>
>> Any update on this?
> 
> Sorry for my late response, I was on Chinese New Year holiday.

No problem! Happy new year!

> The fix is sent, and it will be so nice to have your Tested-by: tag there:
> https://lore.kernel.org/all/20250207085639.13580-1-0x1207@gmail.com/

Thanks, I have tested and responded. All looking good!

Jon
diff mbox series

Patch

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index e8dbce20129c..f05cae103d83 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -126,6 +126,7 @@  struct stmmac_rx_queue {
 	unsigned int cur_rx;
 	unsigned int dirty_rx;
 	unsigned int buf_alloc_num;
+	unsigned int napi_skb_frag_size;
 	dma_addr_t dma_rx_phy;
 	u32 rx_tail_addr;
 	unsigned int state_saved;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index acd6994c1764..1d98a5e8c98c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1341,7 +1341,7 @@  static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
 	if (stmmac_xdp_is_enabled(priv))
 		return XDP_PACKET_HEADROOM;
 
-	return 0;
+	return NET_SKB_PAD;
 }
 
 static int stmmac_set_bfsize(int mtu, int bufsize)
@@ -2040,17 +2040,21 @@  static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
 	struct stmmac_channel *ch = &priv->channel[queue];
 	bool xdp_prog = stmmac_xdp_is_enabled(priv);
 	struct page_pool_params pp_params = { 0 };
-	unsigned int num_pages;
+	unsigned int dma_buf_sz_pad, num_pages;
 	unsigned int napi_id;
 	int ret;
 
+	dma_buf_sz_pad = stmmac_rx_offset(priv) + dma_conf->dma_buf_sz +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	num_pages = DIV_ROUND_UP(dma_buf_sz_pad, PAGE_SIZE);
+
 	rx_q->queue_index = queue;
 	rx_q->priv_data = priv;
+	rx_q->napi_skb_frag_size = num_pages * PAGE_SIZE;
 
 	pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
 	pp_params.pool_size = dma_conf->dma_rx_size;
-	num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE);
-	pp_params.order = ilog2(num_pages);
+	pp_params.order = order_base_2(num_pages);
 	pp_params.nid = dev_to_node(priv->device);
 	pp_params.dev = priv->device;
 	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
@@ -5582,22 +5586,26 @@  static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 		}
 
 		if (!skb) {
+			unsigned int head_pad_len;
+
 			/* XDP program may expand or reduce tail */
 			buf1_len = ctx.xdp.data_end - ctx.xdp.data;
 
-			skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
+			skb = napi_build_skb(page_address(buf->page),
+					     rx_q->napi_skb_frag_size);
 			if (!skb) {
+				page_pool_recycle_direct(rx_q->page_pool,
+							 buf->page);
 				rx_dropped++;
 				count++;
 				goto drain_data;
 			}
 
 			/* XDP program may adjust header */
-			skb_copy_to_linear_data(skb, ctx.xdp.data, buf1_len);
+			head_pad_len = ctx.xdp.data - ctx.xdp.data_hard_start;
+			skb_reserve(skb, head_pad_len);
 			skb_put(skb, buf1_len);
-
-			/* Data payload copied into SKB, page ready for recycle */
-			page_pool_recycle_direct(rx_q->page_pool, buf->page);
+			skb_mark_for_recycle(skb);
 			buf->page = NULL;
 		} else if (buf1_len) {
 			dma_sync_single_for_cpu(priv->device, buf->addr,