diff mbox series

net: broadcom: bcm4908_enet: use build_skb()

Message ID 20221025132245.22871-1-zajec5@gmail.com (mailing list archive)
State Accepted
Commit 3a1cc23a75abcd9cea585eb84846507363d58397
Delegated to: Netdev Maintainers
Headers show
Series net: broadcom: bcm4908_enet: use build_skb() | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Rafał Miłecki Oct. 25, 2022, 1:22 p.m. UTC
From: Rafał Miłecki <rafal@milecki.pl>

RX code can be more efficient with the build_skb(). Allocating actual
SKB around eth packet buffer - right before passing it up - results in
a better cache usage.

Without RPS (echo 0 > rps_cpus) BCM4908 NAT masq performance "jumps"
between two speeds: ~900 Mbps and 940 Mbps (it's a 4 CPUs SoC). This
change bumps the lower speed from 905 Mb/s to 918 Mb/s (tested using
single stream iperf 2.0.5 traffic).

There are more optimizations to consider. One obvious to try is GRO
however as BCM4908 doesn't do hw csum is may actually lower performance.
Sometimes. Some early testing:

┌─────────────────────────────────┬─────────────────────┬────────────────────┐
│                                 │ netif_receive_skb() │ napi_gro_receive() │
├─────────────────────────────────┼─────────────────────┼────────────────────┤
│ netdev_alloc_skb()              │            905 Mb/s │           892 Mb/s │
│ napi_alloc_frag() + build_skb() │            918 Mb/s │           917 Mb/s │
└─────────────────────────────────┴─────────────────────┴────────────────────┘

Another ideas:
1. napi_build_skb()
2. skb_copy_from_linear_data() for small packets

Those need proper testing first though. That can be done later.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
---
 drivers/net/ethernet/broadcom/bcm4908_enet.c | 53 +++++++++++++-------
 1 file changed, 36 insertions(+), 17 deletions(-)

Comments

patchwork-bot+netdevbpf@kernel.org Oct. 27, 2022, 11:20 a.m. UTC | #1
Hello:

This patch was applied to netdev/net-next.git (master)
by Paolo Abeni <pabeni@redhat.com>:

On Tue, 25 Oct 2022 15:22:45 +0200 you wrote:
> From: Rafał Miłecki <rafal@milecki.pl>
> 
> RX code can be more efficient with the build_skb(). Allocating actual
> SKB around eth packet buffer - right before passing it up - results in
> a better cache usage.
> 
> Without RPS (echo 0 > rps_cpus) BCM4908 NAT masq performance "jumps"
> between two speeds: ~900 Mbps and 940 Mbps (it's a 4 CPUs SoC). This
> change bumps the lower speed from 905 Mb/s to 918 Mb/s (tested using
> single stream iperf 2.0.5 traffic).
> 
> [...]

Here is the summary with links:
  - net: broadcom: bcm4908_enet: use build_skb()
    https://git.kernel.org/netdev/net-next/c/3a1cc23a75ab

You are awesome, thank you!
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c
index 93ccf549e2ed..ca8c86ee44c0 100644
--- a/drivers/net/ethernet/broadcom/bcm4908_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c
@@ -36,13 +36,24 @@ 
 #define ENET_MAX_ETH_OVERHEAD			(ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
 						 ETH_FCS_LEN + 4) /* 32 */
 
+#define ENET_RX_SKB_BUF_SIZE			(NET_SKB_PAD + NET_IP_ALIGN + \
+						 ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
+						 ENET_MTU_MAX + ETH_FCS_LEN + 4)
+#define ENET_RX_SKB_BUF_ALLOC_SIZE		(SKB_DATA_ALIGN(ENET_RX_SKB_BUF_SIZE) + \
+						 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#define ENET_RX_BUF_DMA_OFFSET			(NET_SKB_PAD + NET_IP_ALIGN)
+#define ENET_RX_BUF_DMA_SIZE			(ENET_RX_SKB_BUF_SIZE - ENET_RX_BUF_DMA_OFFSET)
+
 struct bcm4908_enet_dma_ring_bd {
 	__le32 ctl;
 	__le32 addr;
 } __packed;
 
 struct bcm4908_enet_dma_ring_slot {
-	struct sk_buff *skb;
+	union {
+		void *buf;			/* RX */
+		struct sk_buff *skb;		/* TX */
+	};
 	unsigned int len;
 	dma_addr_t dma_addr;
 };
@@ -260,22 +271,21 @@  static int bcm4908_enet_dma_alloc_rx_buf(struct bcm4908_enet *enet, unsigned int
 	u32 tmp;
 	int err;
 
-	slot->len = ENET_MTU_MAX + ENET_MAX_ETH_OVERHEAD;
-
-	slot->skb = netdev_alloc_skb(enet->netdev, slot->len);
-	if (!slot->skb)
+	slot->buf = napi_alloc_frag(ENET_RX_SKB_BUF_ALLOC_SIZE);
+	if (!slot->buf)
 		return -ENOMEM;
 
-	slot->dma_addr = dma_map_single(dev, slot->skb->data, slot->len, DMA_FROM_DEVICE);
+	slot->dma_addr = dma_map_single(dev, slot->buf + ENET_RX_BUF_DMA_OFFSET,
+					ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
 	err = dma_mapping_error(dev, slot->dma_addr);
 	if (err) {
 		dev_err(dev, "Failed to map DMA buffer: %d\n", err);
-		kfree_skb(slot->skb);
-		slot->skb = NULL;
+		skb_free_frag(slot->buf);
+		slot->buf = NULL;
 		return err;
 	}
 
-	tmp = slot->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
+	tmp = ENET_RX_BUF_DMA_SIZE << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
 	tmp |= DMA_CTL_STATUS_OWN;
 	if (idx == enet->rx_ring.length - 1)
 		tmp |= DMA_CTL_STATUS_WRAP;
@@ -315,11 +325,11 @@  static void bcm4908_enet_dma_uninit(struct bcm4908_enet *enet)
 
 	for (i = rx_ring->length - 1; i >= 0; i--) {
 		slot = &rx_ring->slots[i];
-		if (!slot->skb)
+		if (!slot->buf)
 			continue;
 		dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_FROM_DEVICE);
-		kfree_skb(slot->skb);
-		slot->skb = NULL;
+		skb_free_frag(slot->buf);
+		slot->buf = NULL;
 	}
 }
 
@@ -577,6 +587,7 @@  static int bcm4908_enet_poll_rx(struct napi_struct *napi, int weight)
 	while (handled < weight) {
 		struct bcm4908_enet_dma_ring_bd *buf_desc;
 		struct bcm4908_enet_dma_ring_slot slot;
+		struct sk_buff *skb;
 		u32 ctl;
 		int len;
 		int err;
@@ -600,16 +611,24 @@  static int bcm4908_enet_poll_rx(struct napi_struct *napi, int weight)
 
 		if (len < ETH_ZLEN ||
 		    (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) {
-			kfree_skb(slot.skb);
+			skb_free_frag(slot.buf);
 			enet->netdev->stats.rx_dropped++;
 			break;
 		}
 
-		dma_unmap_single(dev, slot.dma_addr, slot.len, DMA_FROM_DEVICE);
+		dma_unmap_single(dev, slot.dma_addr, ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
+
+		skb = build_skb(slot.buf, ENET_RX_SKB_BUF_ALLOC_SIZE);
+		if (unlikely(!skb)) {
+			skb_free_frag(slot.buf);
+			enet->netdev->stats.rx_dropped++;
+			break;
+		}
+		skb_reserve(skb, ENET_RX_BUF_DMA_OFFSET);
+		skb_put(skb, len - ETH_FCS_LEN);
+		skb->protocol = eth_type_trans(skb, enet->netdev);
 
-		skb_put(slot.skb, len - ETH_FCS_LEN);
-		slot.skb->protocol = eth_type_trans(slot.skb, enet->netdev);
-		netif_receive_skb(slot.skb);
+		netif_receive_skb(skb);
 
 		enet->netdev->stats.rx_packets++;
 		enet->netdev->stats.rx_bytes += len;