Message ID | bd7aabf4d9b6696885922ed4bef8fc95142d3004.1736910454.git.0x1207@gmail.com
State      | New, archived
Series     | net: stmmac: RX performance improvement
On Wed, Jan 15, 2025 at 11:27:02AM +0800, Furong Xu wrote:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
>
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
>
> Signed-off-by: Furong Xu <0x1207@gmail.com>
> Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Reviewed-by: Larysa Zaremba <larysa.zaremba@intel.com>

> [patch diff snipped; the full diff appears at the end of this page]
On 2025/1/15 11:27, Furong Xu wrote:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
>
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
>
> Signed-off-by: Furong Xu <0x1207@gmail.com>
> Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Reviewed-by: Yanteng Si <si.yanteng@linux.dev>

Thanks,
Yanteng

> [patch diff snipped]
Hi Furong,

On 15/01/2025 03:27, Furong Xu wrote:
> Avoid memcpy in non-XDP RX path by marking all allocated SKBs to
> be recycled in the upper network stack.
>
> This patch brings ~11.5% driver performance improvement in a TCP RX
> throughput test with iPerf tool on a single isolated Cortex-A65 CPU
> core, from 2.18 Gbits/sec increased to 2.43 Gbits/sec.
>
> [patch diff snipped]

We have noticed a boot regression on -next when booting with NFS. Bisect is
pointing to this commit and reverting this on top of -next does fix the
problem.

I only see this on Tegra234 which uses the
drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c driver. Tegra194 which uses
the drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c driver appears to
be fine.

I tried booting without NFS and the network interface seems to come up fine.
It only seems to be a problem when booting with NFS. However, I guess I have
also not stressed the network interface when booting without NFS.

Have you tried booting with NFS?

Thanks
Jon
Hi Jon,

On Thu, 23 Jan 2025 14:06:42 +0000, Jon Hunter wrote:
> We have noticed a boot regression on -next when booting with NFS.
> Bisect is pointing to this commit and reverting this on top of -next
> does fix the problem.
>
> I only see this on Tegra234 which uses the
> drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c driver. Tegra194
> which uses the
> drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c driver
> appears to be fine.

What is the MTU of the Tegra234 and the NFS server? Are they both 1500?

Could you please try the attached patch to confirm whether this regression is
fixed?

If the attached patch fixes this regression, then it looks like a cache
coherence issue specific to Tegra234: this patch avoids the memcpy, so the
page buffers may be modified by the upper network stack and their cache lines
may become dirty. With this patch reverted, the cache lines of the page
buffers never become dirty; that is the core difference.
On 1/23/25 08:35, Furong Xu wrote:
> What is the MTU of the Tegra234 and the NFS server? Are they both 1500?

I see the same issue. Yes, both are 1500.

> Could you please try the attached patch to confirm whether this
> regression is fixed?

The patch fixes the issue.

> If the attached patch fixes this regression, then it looks like a cache
> coherence issue specific to Tegra234 [...]

Thanks for these insights. I don't have specific experience in this driver,
but I see we have dma-coherent turned on for this driver in our downstream
device tree files (i.e. dtbs that coincide with our out-of-tree implementation
of this driver). I went back to the original code and verified that the issue
was there. I did a new test where I added dma-coherent to this ethernet node
in the dtb and retested. It worked!

Just to clarify, the patch that you had us try was not intended as an actual
fix, correct? It was only for diagnostic purposes, i.e. to see if there is
some kind of cache coherence issue, which seems to be the case? So perhaps the
only fix needed is to add dma-coherent to our device tree?
> Just to clarify, the patch that you had us try was not intended as an
> actual fix, correct? It was only for diagnostic purposes, i.e. to see if
> there is some kind of cache coherence issue, which seems to be the case?
> So perhaps the only fix needed is to add dma-coherent to our device tree?

That sounds quite error prone. How many other DT blobs are missing the
property? If the memory should be coherent, I would expect the driver to
allocate coherent memory. Or the driver needs to handle non-coherent memory
and add the necessary flush/invalidates etc.

	Andrew
On Thu, 23 Jan 2025 11:53:21 -0800, Brad Griffis <bgriffis@nvidia.com> wrote:
> [...]
> Just to clarify, the patch that you had us try was not intended as an
> actual fix, correct? It was only for diagnostic purposes, i.e. to see if
> there is some kind of cache coherence issue, which seems to be the case?

It is not an actual fix, it is only for diagnostic purposes.

> So perhaps the only fix needed is to add dma-coherent to our device tree?

Yes, adding dma-coherent to the ethernet node is the correct fix.
On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch> wrote:
> That sounds quite error prone. How many other DT blobs are missing the
> property? If the memory should be coherent, I would expect the driver
> to allocate coherent memory. Or the driver needs to handle non-coherent
> memory and add the necessary flush/invalidates etc.

The stmmac driver does the necessary cache flushes/invalidates to maintain
cache lines explicitly.

See dma_sync_single_for_cpu():
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/include/linux/dma-mapping.h#n297

dma_dev_need_sync() is supposed to return false for Tegra234: the ethernet
controller on Tegra234 is DMA-coherent by SoC design (as Brad said, their
downstream device tree has dma-coherent turned on by default), and after
adding dma-coherent to the mainline ethernet node, the stmmac driver works
fine. But the dma-coherent property is missing from the mainline Tegra234
ethernet device tree node, so dma_dev_need_sync() returns true, which is not
the expected behavior.

The dma-coherent property in a device tree node is SoC specific, so only the
vendors know whether their stmmac ethernet controller is DMA-coherent and
whether their device trees are missing the critical dma-coherent property.
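For readers following the link above, a simplified paraphrase of the gate
being discussed (based on a recent net-next tree; not verbatim kernel source,
and the exact code may differ):

#include <linux/dma-mapping.h>

/* Paraphrased sketch of the helpers in include/linux/dma-mapping.h.
 * dma_skip_sync is set when the kernel knows the device's DMA is coherent
 * (e.g. the device tree node carries the dma-coherent property), so the
 * explicit sync turns into a no-op.
 */
static inline bool dma_dev_need_sync_sketch(const struct device *dev)
{
	/* Always perform the sync when DMA API debugging is enabled */
	return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
}

static inline void dma_sync_single_for_cpu_sketch(struct device *dev,
						  dma_addr_t addr, size_t size,
						  enum dma_data_direction dir)
{
	if (dma_dev_need_sync_sketch(dev))
		__dma_sync_single_for_cpu(dev, addr, size, dir);
}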
On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> [...]
> dma_dev_need_sync() is supposed to return false for Tegra234: the ethernet
> controller on Tegra234 is DMA-coherent by SoC design [...] But the
> dma-coherent property is missing from the mainline Tegra234 ethernet
> device tree node, so dma_dev_need_sync() returns true, which is not the
> expected behavior.

My understanding is that the Ethernet controller itself is not DMA coherent.
Instead, it is by accessing memory through the IOMMU that the accesses become
coherent. It's technically possible for the IOMMU to be disabled via
command-line, or simply compile out the driver for it, and the devices are
supposed to keep working with it (though it's not a recommended
configuration), so exclusively relying on the presence of an IOMMU node or
even a dma-coherent property is bound to fail in these corner cases.

We don't currently have a good way of describing this in device tree, but a
discussion with the DT maintainers is probably warranted.

> The dma-coherent property in a device tree node is SoC specific, so only
> the vendors know whether their stmmac ethernet controller is DMA-coherent
> and whether their device trees are missing the critical dma-coherent
> property.

What I fail to understand is how dma-coherent can make a difference in this
case. If it's not present, then the driver is supposed to maintain caches
explicitly. But it seems like explicit cache maintenance actually causes
things to break. So do we need to assume that DMA coherent devices in general
won't work if the driver manages caches explicitly?

I always figured dma-coherent was more of an optimization, but this seems to
indicate it isn't.

Thierry
> It is not an actual fix, it is only for diagnostic purposes.
>
> > So perhaps the only fix needed is to add dma-coherent to our device tree?
>
> Yes, adding dma-coherent to the ethernet node is the correct fix.

What do you think about the comment I made?

	Andrew
On Fri, Jan 24, 2025 at 10:42:56AM +0800, Furong Xu wrote:
> On Thu, 23 Jan 2025 22:48:42 +0100, Andrew Lunn <andrew@lunn.ch> wrote:
> > That sounds quite error prone. How many other DT blobs are missing the
> > property? If the memory should be coherent, I would expect the driver
> > to allocate coherent memory. Or the driver needs to handle non-coherent
> > memory and add the necessary flush/invalidates etc.
>
> The stmmac driver does the necessary cache flushes/invalidates to
> maintain cache lines explicitly.

Given the problem happens when the kernel performs syncing, is it possible
that there is a problem with how the syncing is performed?

I am not familiar with this driver, but it seems to allocate multiple buffers
per packet when split header is enabled and these buffers are allocated from
the same page pool (see stmmac_init_rx_buffers()). Despite that, the driver is
creating the page pool with a non-zero offset (see
__alloc_dma_rx_desc_resources()) to avoid syncing the headroom, which is only
present in the head buffer.

I asked Thierry to test the following patch [1] and initial testing seems OK.
He also confirmed that "SPH feature enabled" shows up in the kernel log.

BTW, the commit that added split header support (67afd6d1cfdf0) says that it
"reduces CPU usage because without the feature all the entire packet is
memcpy'ed, while that with the feature only the header is". This is no longer
correct after your patch, so is there still value in the split header feature?
With two large buffers being allocated from the same page pool for each packet
(headers + data), the end result is an inflated skb->truesize, no?

[1]
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index edbf8994455d..42170ce2927e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2091,8 +2091,8 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
 	pp_params.nid = dev_to_node(priv->device);
 	pp_params.dev = priv->device;
 	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-	pp_params.offset = stmmac_rx_offset(priv);
-	pp_params.max_len = dma_conf->dma_buf_sz;
+	pp_params.offset = 0;
+	pp_params.max_len = num_pages * PAGE_SIZE;
 
 	rx_q->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rx_q->page_pool)) {
Hi Ido,

On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
> [...]
> BTW, the commit that added split header support (67afd6d1cfdf0) says
> that it "reduces CPU usage because without the feature all the entire
> packet is memcpy'ed, while that with the feature only the header is".
> This is no longer correct after your patch, so is there still value in
> the split header feature? With two large buffers being allocated from
> the same page pool for each packet (headers + data), the end result is
> an inflated skb->truesize, no?

Thanks for these great insights!

Yes, with the "SPH feature enabled" it is not correct after my patch;
pp_params.offset should be updated to match the offset of the split payload.

But I would like to leave pp_params.max_len at dma_conf->dma_buf_sz, since the
sizes of both header and payload are limited to dma_conf->dma_buf_sz by the
DMA engine; no more than dma_conf->dma_buf_sz bytes will be written into a
page buffer. So my patch would be like [2]:

BTW, the split header feature will be very useful in certain cases, so the
stmmac driver should keep supporting it.

[2]
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index edbf8994455d..def0d893efbb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2091,7 +2091,7 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
 	pp_params.nid = dev_to_node(priv->device);
 	pp_params.dev = priv->device;
 	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-	pp_params.offset = stmmac_rx_offset(priv);
+	pp_params.offset = priv->sph ? 0 : stmmac_rx_offset(priv);
 	pp_params.max_len = dma_conf->dma_buf_sz;
 
 	rx_q->page_pool = page_pool_create(&pp_params);
Hi Thierry,

On Sat, 25 Jan 2025 12:20:38 +0200, Ido Schimmel wrote:
> [...]
> I asked Thierry to test the following patch [1] and initial testing
> seems OK. He also confirmed that "SPH feature enabled" shows up in the
> kernel log.

It is recommended to disable the "SPH feature" by default unless certain use
cases depend on it. As Ido said, two large buffers are allocated from the same
page pool for each packet; this is a huge waste of memory and brings
performance drops for most general cases.

Our downstream driver and two mainline drivers disable SPH by default:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471
> It is recommended to disable the "SPH feature" by default unless certain
> use cases depend on it. As Ido said, two large buffers are allocated from
> the same page pool for each packet; this is a huge waste of memory and
> brings performance drops for most general cases.

I don't know this driver, but it looks like SPH is required for NETIF_F_GRO?
Can you add this flag to hw_features, but not wanted_features, and leave SPH
disabled until ethtool is used to enable GRO?

Are there other use cases where SPH is needed?

	Andrew
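For context, a hypothetical sketch of what the suggestion above could look
like in a driver's probe path (not existing stmmac code; whether SPH really
needs to be tied to GRO is exactly the open question here, and the helper
name is made up):

#include <linux/netdevice.h>

/* Hypothetical sketch only: advertise GRO as user-toggleable without
 * enabling it by default, so the driver could keep SPH off until the user
 * runs "ethtool -K <iface> gro on".
 */
static void example_advertise_gro(struct net_device *ndev)
{
	ndev->hw_features |= NETIF_F_GRO;	/* toggleable via ethtool */
	ndev->features &= ~NETIF_F_GRO;		/* off by default, so SPH stays off */
}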
On Sat, 25 Jan 2025 20:08:12 +0100, Andrew Lunn <andrew@lunn.ch> wrote:
> I don't know this driver, but it looks like SPH is required for
> NETIF_F_GRO? Can you add this flag to hw_features, but not
> wanted_features, and leave SPH disabled until ethtool is used to enable
> GRO?

SPH has its own ethtool command, which the stmmac driver does not implement
yet. See:
https://patchwork.kernel.org/project/netdevbpf/cover/20250114142852.3364986-1-ap420073@gmail.com/

> Are there other use cases where SPH is needed?

https://patchwork.kernel.org/project/netdevbpf/cover/20240910171458.219195-1-almasrymina@google.com/
https://patchwork.kernel.org/project/netdevbpf/cover/20250116231704.2402455-1-dw@davidwei.uk/

The stmmac driver does not support either of them yet, but it will someday :)
Hi,

On Sat, Jan 25, 2025 at 10:43:42PM +0800, Furong Xu wrote:
> [...]
> But I would like to leave pp_params.max_len at dma_conf->dma_buf_sz,
> since the sizes of both header and payload are limited to
> dma_conf->dma_buf_sz by the DMA engine; no more than dma_conf->dma_buf_sz
> bytes will be written into a page buffer. So my patch would be like [2]:
> [...]
> -	pp_params.offset = stmmac_rx_offset(priv);
> +	pp_params.offset = priv->sph ? 0 : stmmac_rx_offset(priv);

SPH is the only scenario in which the driver uses multiple buffers per packet?

> 	pp_params.max_len = dma_conf->dma_buf_sz;

Are you sure this is correct? Page pool documentation says that "For pages
recycled on the XDP xmit and skb paths the page pool will use the max_len
member of struct page_pool_params to decide how much of the page needs to be
synced (starting at offset)" [1]. While "no more than dma_conf->dma_buf_sz
bytes will be written into a page buffer", for the head buffer they will be
written starting at a non-zero offset, unlike the buffers used for the data,
no?

[1] https://docs.kernel.org/networking/page_pool.html#dma-sync
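As an illustration of the documented behaviour quoted above, here is a
generic page pool setup (not stmmac code; the sizes are made-up examples).
The point is only that the pool's own sync-for-device covers the window
[offset, offset + max_len) of each recycled page:

#include <linux/skbuff.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>

/* Generic example: with PP_FLAG_DMA_SYNC_DEV the pool dma-syncs only
 * [offset, offset + max_len) when a page is recycled, so a buffer the NIC
 * writes at a different offset (e.g. a zero-offset SPH payload buffer) is
 * not fully covered by this window.
 */
static struct page_pool *example_rx_pool(struct device *dev)
{
	struct page_pool_params pp = {
		.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order     = 0,
		.pool_size = 1024,			/* arbitrary example */
		.dev       = dev,
		.dma_dir   = DMA_FROM_DEVICE,
		.offset    = NET_SKB_PAD,		/* headroom skipped by the sync */
		.max_len   = PAGE_SIZE - NET_SKB_PAD,	/* synced length from offset */
	};

	return page_pool_create(&pp);
}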
On Sun, 26 Jan 2025 10:41:23 +0200, Ido Schimmel wrote:
> SPH is the only scenario in which the driver uses multiple buffers per
> packet?

Yes. Jumbo mode may use multiple buffers per packet too, but those are
high-order pages, just like a single page in a page pool when using a
standard MTU.

> > 	pp_params.max_len = dma_conf->dma_buf_sz;
>
> Are you sure this is correct? Page pool documentation says that "For
> pages recycled on the XDP xmit and skb paths the page pool will use
> the max_len member of struct page_pool_params to decide how much of
> the page needs to be synced (starting at offset)" [1].

The page pool must sync an area of the buffer because both DMA and CPU may
touch this area; other areas are CPU exclusive, so not syncing them seems
better.

> While "no more than dma_conf->dma_buf_sz bytes will be written into a
> page buffer", for the head buffer they will be written starting at a
> non-zero offset, unlike the buffers used for the data, no?

Correct, they have different offsets.

The "SPH feature" splits the header into buf->page (non-zero offset) and the
payload into buf->sec_page (zero offset).

For buf->page, pp_params.max_len should be the size of the L3/L4 header, with
an offset of NET_SKB_PAD.

For buf->sec_page, pp_params.max_len should be dma_conf->dma_buf_sz, with an
offset of 0.

This is always true:
sizeof(L3/L4 header) + NET_SKB_PAD < dma_conf->dma_buf_sz + 0

pp_params.max_len = dma_conf->dma_buf_sz; makes things simpler :)
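Restating the split-header layout described above as a small illustrative
check (hypothetical helper, not driver code; hdr_len stands for the L3/L4
header length the hardware writes into the head buffer):

/* Illustrative only. With SPH, one packet is written into two buffers:
 *
 *   buf->page:     [ NET_SKB_PAD headroom | L3/L4 header ]   (non-zero offset)
 *   buf->sec_page: [ payload, up to dma_buf_sz bytes ]       (zero offset)
 *
 * The claim above is that the head buffer's written span stays within
 * dma_buf_sz:
 */
static bool sph_head_fits(unsigned int hdr_len, unsigned int dma_buf_sz)
{
	return NET_SKB_PAD + hdr_len < dma_buf_sz;
}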
On Sun, Jan 26, 2025 at 06:37:14PM +0800, Furong Xu wrote:
> The "SPH feature" splits the header into buf->page (non-zero offset) and
> the payload into buf->sec_page (zero offset).
> [...]
> This is always true:
> sizeof(L3/L4 header) + NET_SKB_PAD < dma_conf->dma_buf_sz + 0

Thanks, understood, but are there situations where the device is unable to
split a packet? For example, a large L2 packet. I am trying to understand if
there are situations where the device will write more than
"dma_conf->dma_buf_sz - NET_SKB_PAD" to the head buffer.
On Sun, 26 Jan 2025 13:35:45 +0200, Ido Schimmel wrote:
> Thanks, understood, but are there situations where the device is unable
> to split a packet? For example, a large L2 packet. I am trying to
> understand if there are situations where the device will write more than
> "dma_conf->dma_buf_sz - NET_SKB_PAD" to the head buffer.

Nice catch!

When receiving a large L2/non-IP packet, more than
"dma_conf->dma_buf_sz - NET_SKB_PAD" will be written. So we should:

pp_params.max_len = dma_conf->dma_buf_sz + stmmac_rx_offset(priv);

Thanks a lot, Ido.
Have a nice weekend :)
On Sat, Jan 25, 2025 at 11:03:47PM +0800, Furong Xu wrote:
> [...]
> It is recommended to disable the "SPH feature" by default unless certain
> use cases depend on it. As Ido said, two large buffers are allocated from
> the same page pool for each packet; this is a huge waste of memory and
> brings performance drops for most general cases.
>
> Our downstream driver and two mainline drivers disable SPH by default:
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c#n357
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c#n471

Okay, that's something we can look into changing. What would be an example of
a use-case depending on SPH? Also, isn't this something that should be a
policy that users can configure?

Irrespective of that we should fix the problems we are seeing with SPH
enabled.

Thierry
On Friday, 2025-01-24 at 14:15 +0100, Thierry Reding wrote:
> [...]
> What I fail to understand is how dma-coherent can make a difference in
> this case. If it's not present, then the driver is supposed to maintain
> caches explicitly. But it seems like explicit cache maintenance actually
> causes things to break. So do we need to assume that DMA coherent
> devices in general won't work if the driver manages caches explicitly?
>
> I always figured dma-coherent was more of an optimization, but this
> seems to indicate it isn't.

There is a real failure scenario when the device is actually DMA-coherent but
the DT claims it isn't: if a location from e.g. a receive buffer is already
allocated in the cache, the write from the device will update the cache line
and not go out to memory. If you then do the usual non-coherent cache
maintenance, i.e. invalidate the RX buffer area after the device indicated the
reception of the buffer, you will destroy the updated data in the cache and
read stale data from memory.

Regards,
Lucas
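A commented sketch of the sequence described above (generic code, not stmmac;
the function name is made up and the buffer is assumed to be a streaming DMA
mapping):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/printk.h>

/* Generic illustration of what goes wrong when a device that is really
 * coherent is handled with non-coherent cache maintenance.
 */
static void rx_complete_stale_read(struct device *dev, dma_addr_t dma_addr,
				   void *rx_buf, size_t len)
{
	/* 1. rx_buf was touched earlier, so its lines sit in the CPU cache.
	 * 2. The (actually coherent) device DMA-writes the frame: the
	 *    interconnect updates the cache lines, RAM may keep old data.
	 * 3. The driver, told by DT that the device is non-coherent, now
	 *    invalidates those lines before reading:
	 */
	dma_sync_single_for_cpu(dev, dma_addr, len, DMA_FROM_DEVICE);

	/* 4. The invalidate threw away the freshly written lines, so this
	 *    read can return stale bytes from RAM.
	 */
	pr_debug("first byte: %02x\n", *(unsigned char *)rx_buf);
}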
Hi Furong,

On 27/01/2025 13:28, Thierry Reding wrote:
> [...]
> Okay, that's something we can look into changing. What would be an
> example of a use-case depending on SPH? Also, isn't this something that
> should be a policy that users can configure?
>
> Irrespective of that we should fix the problems we are seeing with SPH
> enabled.

Any update on this?

Thanks
Jon
Hi Jon,

On Wed, 29 Jan 2025 14:51:35 +0000, Jon Hunter <jonathanh@nvidia.com> wrote:
> [...]
> Any update on this?

Sorry for my late response, I was on Chinese New Year holiday.

The fix has been sent, and it would be very nice to have your Tested-by: tag
there:
https://lore.kernel.org/all/20250207085639.13580-1-0x1207@gmail.com/
On 07/02/2025 09:07, Furong Xu wrote:
> [...]
> Sorry for my late response, I was on Chinese New Year holiday.

No problem! Happy new year!

> The fix has been sent, and it would be very nice to have your Tested-by:
> tag there:
> https://lore.kernel.org/all/20250207085639.13580-1-0x1207@gmail.com/

Thanks, I have tested and responded. All looking good!

Jon
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index e8dbce20129c..f05cae103d83 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -126,6 +126,7 @@ struct stmmac_rx_queue {
 	unsigned int cur_rx;
 	unsigned int dirty_rx;
 	unsigned int buf_alloc_num;
+	unsigned int napi_skb_frag_size;
 	dma_addr_t dma_rx_phy;
 	u32 rx_tail_addr;
 	unsigned int state_saved;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index acd6994c1764..1d98a5e8c98c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1341,7 +1341,7 @@ static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
 	if (stmmac_xdp_is_enabled(priv))
 		return XDP_PACKET_HEADROOM;
 
-	return 0;
+	return NET_SKB_PAD;
 }
 
 static int stmmac_set_bfsize(int mtu, int bufsize)
@@ -2040,17 +2040,21 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv,
 	struct stmmac_channel *ch = &priv->channel[queue];
 	bool xdp_prog = stmmac_xdp_is_enabled(priv);
 	struct page_pool_params pp_params = { 0 };
-	unsigned int num_pages;
+	unsigned int dma_buf_sz_pad, num_pages;
 	unsigned int napi_id;
 	int ret;
 
+	dma_buf_sz_pad = stmmac_rx_offset(priv) + dma_conf->dma_buf_sz +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	num_pages = DIV_ROUND_UP(dma_buf_sz_pad, PAGE_SIZE);
+
 	rx_q->queue_index = queue;
 	rx_q->priv_data = priv;
+	rx_q->napi_skb_frag_size = num_pages * PAGE_SIZE;
 
 	pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
 	pp_params.pool_size = dma_conf->dma_rx_size;
-	num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE);
-	pp_params.order = ilog2(num_pages);
+	pp_params.order = order_base_2(num_pages);
 	pp_params.nid = dev_to_node(priv->device);
 	pp_params.dev = priv->device;
 	pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
@@ -5582,22 +5586,26 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 		}
 
 		if (!skb) {
+			unsigned int head_pad_len;
+
 			/* XDP program may expand or reduce tail */
 			buf1_len = ctx.xdp.data_end - ctx.xdp.data;
 
-			skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
+			skb = napi_build_skb(page_address(buf->page),
+					     rx_q->napi_skb_frag_size);
 			if (!skb) {
+				page_pool_recycle_direct(rx_q->page_pool,
+							 buf->page);
 				rx_dropped++;
 				count++;
 				goto drain_data;
 			}
 
 			/* XDP program may adjust header */
-			skb_copy_to_linear_data(skb, ctx.xdp.data, buf1_len);
+			head_pad_len = ctx.xdp.data - ctx.xdp.data_hard_start;
+			skb_reserve(skb, head_pad_len);
 			skb_put(skb, buf1_len);
-
-			/* Data payload copied into SKB, page ready for recycle */
-			page_pool_recycle_direct(rx_q->page_pool, buf->page);
+			skb_mark_for_recycle(skb);
 			buf->page = NULL;
 		} else if (buf1_len) {
 			dma_sync_single_for_cpu(priv->device, buf->addr,