@@ -205,6 +205,7 @@ Adaptive coalescing can be switched on/off through `ethtool(8)`'s
More information about Adaptive Interrupt Moderation (DIM) can be found in
Documentation/networking/net_dim.rst
+.. _`RX copybreak`:
RX copybreak
============
The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK
@@ -315,3 +316,34 @@ Rx
- The new SKB is updated with the necessary information (protocol,
checksum hw verify result, etc), and then passed to the network
stack, using the NAPI interface function :code:`napi_gro_receive()`.
+
+Dynamic RX Buffers (DRB)
+------------------------
+
+Each RX descriptor in the RX ring is a single memory page (which is either 4KB
+or 16KB long, depending on the system's configuration).
+To reduce the memory allocations required when dealing with a high rate of small
+packets, the driver tries to reuse the RX descriptor's remaining space if more
+than 2KB of the page remains unused.
+
+A simple example of this mechanism is the following sequence of events:
+
+::
+
+ 1. Driver allocates page-sized RX buffer and passes it to hardware
+ +----------------------+
+ |4KB RX Buffer |
+ +----------------------+
+
+ 2. A 300-byte packet is received into this buffer
+
+ 3. The driver increases the ref count on this page and returns it to the
+ HW as an RX buffer of size 4KB - 300 bytes = 3796 bytes
+ +----+--------------------+
+ |****|3796 Bytes RX Buffer|
+ +----+--------------------+
+
+This mechanism isn't used when an XDP program is loaded, or when the
+RX packet isn't larger than rx_copybreak bytes (in which case the packet is
+copied out of the RX buffer into the linear part of a new skb allocated for
+it, and the RX buffer keeps its original size; see `RX copybreak`_).
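
The arithmetic behind the example above can be modeled in isolation. The
sketch below is illustrative only and not part of the patch: it assumes a 4KB
page and a 2KB reuse threshold (mirroring ENA_MIN_RX_BUF_SIZE), and it ignores
the headroom, tailroom and alignment that the real check also accounts for::

  #include <stdbool.h>
  #include <stdio.h>

  #define RX_PAGE_SIZE    4096 /* assumed 4KB RX buffer page  */
  #define MIN_REUSE_BYTES 2048 /* mirrors ENA_MIN_RX_BUF_SIZE */

  /* Reuse the page only if the leftover space can still hold a useful buffer. */
  static bool page_can_be_reused(unsigned int bytes_consumed)
  {
          return RX_PAGE_SIZE - bytes_consumed >= MIN_REUSE_BYTES;
  }

  int main(void)
  {
          unsigned int pkt_len = 300; /* the 300-byte packet from the example */

          if (page_can_be_reused(pkt_len))
                  printf("reuse: %u bytes left for the next packet\n",
                         RX_PAGE_SIZE - pkt_len); /* prints 3796 */
          else
                  printf("allocate a fresh page\n");
          return 0;
  }
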
@@ -869,7 +869,9 @@ struct ena_admin_host_info {
* 2 : interrupt_moderation
* 3 : rx_buf_mirroring
* 4 : rss_configurable_function_key
- * 31:5 : reserved
+ * 5 : reserved
+ * 6 : rx_page_reuse
+ * 31:7 : reserved
*/
u32 driver_supported_features;
};
@@ -1184,6 +1186,8 @@ struct ena_admin_ena_mmio_req_read_less_resp {
#define ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3)
#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4
#define ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4)
+#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6
+#define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK BIT(6)
/* aenq_common_desc */
#define ENA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK BIT(0)
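
The SHIFT/MASK pairs above follow the file's usual bit-flag pattern. As an
illustration of how the new rx_page_reuse bit is advertised and could be
queried (the query helper here is hypothetical; the driver simply ORs the mask
into driver_supported_features, as ena_config_host_info() does further down)::

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define BIT(nr)                                 (1U << (nr))
  #define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_SHIFT 6
  #define ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK  BIT(6)

  /* Hypothetical helper: checks whether the feature bit is set. */
  static bool rx_page_reuse_advertised(uint32_t driver_supported_features)
  {
          return !!(driver_supported_features &
                    ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK);
  }

  int main(void)
  {
          uint32_t features = 0;

          features |= ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK;
          printf("rx_page_reuse advertised: %d\n",
                 rx_page_reuse_advertised(features)); /* prints 1 */
          return 0;
  }
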
@@ -1023,7 +1023,7 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring,
int tailroom;
/* restore page offset value in case it has been changed by device */
- rx_info->page_offset = headroom;
+ rx_info->buf_offset = headroom;
/* if previous allocated page is not used */
if (unlikely(rx_info->page))
@@ -1040,6 +1040,8 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring,
tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
rx_info->page = page;
+ rx_info->dma_addr = dma;
+ rx_info->page_offset = 0;
ena_buf = &rx_info->ena_buf;
ena_buf->paddr = dma + headroom;
ena_buf->len = ENA_PAGE_SIZE - headroom - tailroom;
@@ -1047,14 +1049,12 @@ static int ena_alloc_rx_buffer(struct ena_ring *rx_ring,
return 0;
}
-static void ena_unmap_rx_buff(struct ena_ring *rx_ring,
- struct ena_rx_buffer *rx_info)
+static void ena_unmap_rx_buff_attrs(struct ena_ring *rx_ring,
+ struct ena_rx_buffer *rx_info,
+ unsigned long attrs)
{
- struct ena_com_buf *ena_buf = &rx_info->ena_buf;
-
- dma_unmap_page(rx_ring->dev, ena_buf->paddr - rx_ring->rx_headroom,
- ENA_PAGE_SIZE,
- DMA_BIDIRECTIONAL);
+ dma_unmap_page_attrs(rx_ring->dev, rx_info->dma_addr, ENA_PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
}
static void ena_free_rx_page(struct ena_ring *rx_ring,
@@ -1068,7 +1068,7 @@ static void ena_free_rx_page(struct ena_ring *rx_ring,
return;
}
- ena_unmap_rx_buff(rx_ring, rx_info);
+ ena_unmap_rx_buff_attrs(rx_ring, rx_info, 0);
__free_page(page);
rx_info->page = NULL;
@@ -1406,14 +1406,14 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget)
return tx_pkts;
}
-static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag)
+static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag, u16 len)
{
struct sk_buff *skb;
if (!first_frag)
- skb = napi_alloc_skb(rx_ring->napi, rx_ring->rx_copybreak);
+ skb = napi_alloc_skb(rx_ring->napi, len);
else
- skb = napi_build_skb(first_frag, ENA_PAGE_SIZE);
+ skb = napi_build_skb(first_frag, len);
if (unlikely(!skb)) {
ena_increase_stat(&rx_ring->rx_stats.skb_alloc_fail, 1,
@@ -1422,24 +1422,47 @@ static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, void *first_frag)
netif_dbg(rx_ring->adapter, rx_err, rx_ring->netdev,
"Failed to allocate skb. first_frag %s\n",
first_frag ? "provided" : "not provided");
- return NULL;
}
return skb;
}
+static bool ena_try_rx_buf_page_reuse(struct ena_rx_buffer *rx_info, u16 buf_len,
+ u16 len, int pkt_offset)
+{
+ struct ena_com_buf *ena_buf = &rx_info->ena_buf;
+
+ /* More than ENA_MIN_RX_BUF_SIZE left in the reused buffer
+ * for data + headroom + tailroom.
+ */
+ if (SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <= ena_buf->len) {
+ page_ref_inc(rx_info->page);
+ rx_info->page_offset += buf_len;
+ ena_buf->paddr += buf_len;
+ ena_buf->len -= buf_len;
+ return true;
+ }
+
+ return false;
+}
+
static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
struct ena_com_rx_buf_info *ena_bufs,
u32 descs,
u16 *next_to_clean)
{
+ int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ bool is_xdp_loaded = ena_xdp_present_ring(rx_ring);
struct ena_rx_buffer *rx_info;
struct ena_adapter *adapter;
+ int page_offset, pkt_offset;
+ dma_addr_t pre_reuse_paddr;
u16 len, req_id, buf = 0;
+ bool reuse_rx_buf_page;
struct sk_buff *skb;
- void *page_addr;
- u32 page_offset;
- void *data_addr;
+ void *buf_addr;
+ int buf_offset;
+ u16 buf_len;
len = ena_bufs[buf].len;
req_id = ena_bufs[buf].req_id;
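
The reuse check added above can be exercised with concrete numbers. The model
below is a userspace approximation: SKB_DATA_ALIGN is reimplemented as
cache-line alignment with an assumed 64-byte SMP_CACHE_BYTES, and the starting
buffer length (page size minus assumed headroom and tailroom) only estimates
what ena_alloc_rx_buffer() leaves in ena_buf->len::

  #include <stdbool.h>
  #include <stdio.h>

  #define SMP_CACHE_BYTES     64 /* assumed cache line size */
  #define ALIGN_UP(x, a)      (((x) + (a) - 1) & ~((a) - 1))
  #define SKB_DATA_ALIGN(x)   ALIGN_UP(x, SMP_CACHE_BYTES)
  #define ENA_MIN_RX_BUF_SIZE 2048

  /* Mirrors ena_try_rx_buf_page_reuse(): reuse only if, after the aligned
   * packet region, at least ENA_MIN_RX_BUF_SIZE bytes remain in the buffer.
   */
  static bool would_reuse(unsigned int buf_len_left, unsigned int len,
                          unsigned int pkt_offset)
  {
          return SKB_DATA_ALIGN(len + pkt_offset) + ENA_MIN_RX_BUF_SIZE <=
                 buf_len_left;
  }

  int main(void)
  {
          /* 4KB page minus assumed 64B headroom and ~320B tailroom */
          unsigned int buf_len_left = 4096 - 64 - 320;

          printf("300B  -> reuse: %d\n", would_reuse(buf_len_left, 300, 0));
          printf("1500B -> reuse: %d\n", would_reuse(buf_len_left, 1500, 0));
          printf("3000B -> reuse: %d\n", would_reuse(buf_len_left, 3000, 0));
          return 0;
  }
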
@@ -1459,34 +1482,30 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
"rx_info %p page %p\n",
rx_info, rx_info->page);
- /* save virt address of first buffer */
- page_addr = page_address(rx_info->page);
+ buf_offset = rx_info->buf_offset;
+ pkt_offset = buf_offset - rx_ring->rx_headroom;
page_offset = rx_info->page_offset;
- data_addr = page_addr + page_offset;
-
- prefetch(data_addr);
+ buf_addr = page_address(rx_info->page) + page_offset;
if (len <= rx_ring->rx_copybreak) {
- skb = ena_alloc_skb(rx_ring, NULL);
+ skb = ena_alloc_skb(rx_ring, NULL, len);
if (unlikely(!skb))
return NULL;
- netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
- "RX allocated small packet. len %d. data_len %d\n",
- skb->len, skb->data_len);
-
/* sync this buffer for CPU use */
dma_sync_single_for_cpu(rx_ring->dev,
- dma_unmap_addr(&rx_info->ena_buf, paddr),
+ dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset,
len,
DMA_FROM_DEVICE);
- skb_copy_to_linear_data(skb, data_addr, len);
+ skb_copy_to_linear_data(skb, buf_addr + buf_offset, len);
dma_sync_single_for_device(rx_ring->dev,
- dma_unmap_addr(&rx_info->ena_buf, paddr),
+ dma_unmap_addr(&rx_info->ena_buf, paddr) + pkt_offset,
len,
DMA_FROM_DEVICE);
skb_put(skb, len);
+ netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
+ "RX allocated small packet. len %d.\n", skb->len);
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
rx_ring->free_ids[*next_to_clean] = req_id;
*next_to_clean = ENA_RX_RING_IDX_ADD(*next_to_clean, descs,
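
To restate the small-packet path just shown: when len doesn't exceed
rx_copybreak, the payload is copied into a right-sized skb and the page stays
mapped and owned by the ring, so the reuse logic never touches it. A rough
decision model (the 256-byte threshold is only an assumed value; the runtime
value is rx_ring->rx_copybreak, tunable via ethtool)::

  #include <stdio.h>

  #define RX_COPYBREAK 256 /* assumed; stands in for rx_ring->rx_copybreak */

  /* Two RX delivery strategies in ena_rx_skb(), simplified: small packets are
   * copied out, larger ones get an skb built around the page itself.
   */
  static const char *rx_delivery(unsigned int len)
  {
          return len <= RX_COPYBREAK ?
                 "copy into a new skb, page stays in the ring" :
                 "build skb around the page (and maybe reuse the rest)";
  }

  int main(void)
  {
          unsigned int lens[] = { 60, 256, 257, 1500 };

          for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                  printf("len %4u: %s\n", lens[i], rx_delivery(lens[i]));
          return 0;
  }
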
@@ -1494,14 +1513,28 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
return skb;
}
- ena_unmap_rx_buff(rx_ring, rx_info);
+ buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom);
+
+ pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr);
+
+ /* If XDP isn't loaded try to reuse part of the RX buffer */
+ reuse_rx_buf_page = !is_xdp_loaded &&
+ ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset);
- skb = ena_alloc_skb(rx_ring, page_addr);
+ dma_sync_single_for_cpu(rx_ring->dev,
+ pre_reuse_paddr + pkt_offset,
+ len,
+ DMA_FROM_DEVICE);
+
+ if (!reuse_rx_buf_page)
+ ena_unmap_rx_buff_attrs(rx_ring, rx_info, DMA_ATTR_SKIP_CPU_SYNC);
+
+ skb = ena_alloc_skb(rx_ring, buf_addr, buf_len);
if (unlikely(!skb))
return NULL;
/* Populate skb's linear part */
- skb_reserve(skb, page_offset);
+ skb_reserve(skb, buf_offset);
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
@@ -1510,7 +1543,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
"RX skb updated. len %d. data_len %d\n",
skb->len, skb->data_len);
- rx_info->page = NULL;
+ if (!reuse_rx_buf_page)
+ rx_info->page = NULL;
rx_ring->free_ids[*next_to_clean] = req_id;
*next_to_clean =
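
Whether rx_info->page is cleared here determines who still owns the page: on
reuse, the earlier page_ref_inc() gave the skb its own reference while the
ring keeps the original one, so the page survives until both sides drop
theirs. A toy walkthrough with a plain counter standing in for the struct page
refcount::

  #include <stdio.h>

  int main(void)
  {
          int page_refs = 1; /* the ring owns the freshly allocated page */

          /* packet received, driver decides to reuse the rest of the page */
          page_refs++;       /* page_ref_inc() in ena_try_rx_buf_page_reuse() */
          printf("after reuse decision: %d refs (ring + skb)\n", page_refs);

          /* the stack consumes the skb and drops the reference it carried */
          page_refs--;
          printf("after skb is freed: %d ref (ring still owns the page)\n",
                 page_refs);

          /* the ring drops its reference when the buffer is finally released */
          page_refs--;
          printf("refs now %d, page goes back to the allocator\n", page_refs);
          return 0;
  }
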
@@ -1525,10 +1559,28 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
rx_info = &rx_ring->rx_buffer_info[req_id];
- ena_unmap_rx_buff(rx_ring, rx_info);
+ /* rx_info->buf_offset includes rx_ring->rx_headroom */
+ buf_offset = rx_info->buf_offset;
+ pkt_offset = buf_offset - rx_ring->rx_headroom;
+ buf_len = SKB_DATA_ALIGN(len + buf_offset + tailroom);
+ page_offset = rx_info->page_offset;
+
+ pre_reuse_paddr = dma_unmap_addr(&rx_info->ena_buf, paddr);
+
+ reuse_rx_buf_page = !is_xdp_loaded &&
+ ena_try_rx_buf_page_reuse(rx_info, buf_len, len, pkt_offset);
+
+ dma_sync_single_for_cpu(rx_ring->dev,
+ pre_reuse_paddr + pkt_offset,
+ len,
+ DMA_FROM_DEVICE);
+
+ if (!reuse_rx_buf_page)
+ ena_unmap_rx_buff_attrs(rx_ring, rx_info,
+ DMA_ATTR_SKIP_CPU_SYNC);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page,
- rx_info->page_offset, len, ENA_PAGE_SIZE);
+ page_offset + buf_offset, len, buf_len);
} while (1);
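
The offset bookkeeping in the loop above is the crux of the reuse scheme:
page_offset marks where the currently posted buffer starts within the shared
page and advances by buf_len on every successful reuse, while buf_offset
(headroom plus any device-supplied pkt_offset) marks where the payload starts
within that buffer. A simplified model of two consecutive packets landing in
one page (the headroom and tailroom values are assumptions)::

  #include <stdio.h>

  #define SMP_CACHE_BYTES   64 /* assumed cache line size */
  #define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((a) - 1))
  #define SKB_DATA_ALIGN(x) ALIGN_UP(x, SMP_CACHE_BYTES)

  #define RX_HEADROOM 64  /* assumed headroom reserved before the payload    */
  #define TAILROOM    320 /* assumed SKB_DATA_ALIGN(sizeof(skb_shared_info)) */

  int main(void)
  {
          unsigned int page_offset = 0; /* start of current buffer in the page */
          unsigned int buf_offset = RX_HEADROOM;
          unsigned int lens[] = { 300, 700 }; /* two packets sharing one page  */

          for (unsigned int i = 0; i < 2; i++) {
                  unsigned int buf_len =
                          SKB_DATA_ALIGN(lens[i] + buf_offset + TAILROOM);

                  printf("packet %u: payload at page offset %u, truesize %u\n",
                         i, page_offset + buf_offset, buf_len);

                  /* what ena_try_rx_buf_page_reuse() does on success */
                  page_offset += buf_len;
          }
          return 0;
  }
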
@@ -1626,7 +1678,7 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp)
rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
xdp_prepare_buff(xdp, page_address(rx_info->page),
- rx_info->page_offset,
+ rx_info->buf_offset,
rx_ring->ena_bufs[0].len, false);
/* If for some reason we received a bigger packet than
* we expect, then we simply drop it
@@ -1638,7 +1690,7 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp)
/* The xdp program might expand the headers */
if (ret == ENA_XDP_PASS) {
- rx_info->page_offset = xdp->data - xdp->data_hard_start;
+ rx_info->buf_offset = xdp->data - xdp->data_hard_start;
rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data;
}
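
In the XDP path, buf_offset now records where the (possibly adjusted) packet
data starts relative to the beginning of the buffer: the program may grow or
shrink the headroom, so the driver recomputes the offset from the xdp_buff
pointers after it runs. A minimal pointer-arithmetic illustration (the struct
below is a stand-in, not the kernel's xdp_buff)::

  #include <stdio.h>

  /* Stand-in for the few xdp_buff fields used here. */
  struct fake_xdp_buff {
          unsigned char *data_hard_start; /* buffer start (page + page_offset) */
          unsigned char *data;            /* start of packet data              */
          unsigned char *data_end;        /* end of packet data                */
  };

  int main(void)
  {
          unsigned char page[4096];
          struct fake_xdp_buff xdp = {
                  .data_hard_start = page,
                  .data = page + 64,           /* initial buf_offset = 64 */
                  .data_end = page + 64 + 300, /* 300-byte packet         */
          };

          /* Pretend the program stripped 14 bytes from the front, as a
           * positive bpf_xdp_adjust_head() delta would.
           */
          xdp.data += 14;

          unsigned int buf_offset = xdp.data - xdp.data_hard_start; /* 78  */
          unsigned int len = xdp.data_end - xdp.data;               /* 286 */

          printf("new buf_offset %u, new len %u\n", buf_offset, len);
          return 0;
  }
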
@@ -1693,7 +1745,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
/* First descriptor might have an offset set by the device */
rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
- rx_info->page_offset += ena_rx_ctx.pkt_offset;
+ rx_info->buf_offset += ena_rx_ctx.pkt_offset;
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
"rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
@@ -1723,8 +1775,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
* from RX side.
*/
if (xdp_verdict & ENA_XDP_FORWARDED) {
- ena_unmap_rx_buff(rx_ring,
- &rx_ring->rx_buffer_info[req_id]);
+ ena_unmap_rx_buff_attrs(rx_ring,
+ &rx_ring->rx_buffer_info[req_id],
+ 0);
rx_ring->rx_buffer_info[req_id].page = NULL;
}
}
@@ -3233,7 +3286,8 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pd
ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK |
ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK |
ENA_ADMIN_HOST_INFO_RX_BUF_MIRRORING_MASK |
- ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK;
+ ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK |
+ ENA_ADMIN_HOST_INFO_RX_PAGE_REUSE_MASK;
rc = ena_com_set_host_attributes(ena_dev);
if (rc) {
@@ -51,6 +51,8 @@
#define ENA_DEFAULT_RING_SIZE (1024)
#define ENA_MIN_RING_SIZE (256)
+#define ENA_MIN_RX_BUF_SIZE (2048)
+
#define ENA_MIN_NUM_IO_QUEUES (1)
#define ENA_TX_WAKEUP_THRESH (MAX_SKB_FRAGS + 2)
@@ -175,7 +177,9 @@ struct ena_tx_buffer {
struct ena_rx_buffer {
struct sk_buff *skb;
struct page *page;
+ dma_addr_t dma_addr;
u32 page_offset;
+ u32 buf_offset;
struct ena_com_buf ena_buf;
} ____cacheline_aligned;
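
To recap the new bookkeeping fields: dma_addr caches the page's DMA mapping so
the unmap no longer has to be derived from ena_buf.paddr, page_offset is where
the currently posted buffer begins inside the page, and buf_offset is where
the packet data begins inside that buffer (headroom plus any device-supplied
pkt_offset). A sketch of how the CPU and DMA addresses of the payload relate
under this layout (the helper struct and numbers are illustrative, not driver
code)::

  #include <stdint.h>
  #include <stdio.h>

  /* Illustrative mirror of the fields added to struct ena_rx_buffer. */
  struct rx_buf_model {
          unsigned char *page;  /* stands in for page_address(rx_info->page) */
          uint64_t dma_addr;    /* DMA address of the whole page             */
          uint32_t page_offset; /* start of the posted buffer in the page    */
          uint32_t buf_offset;  /* start of packet data within the buffer    */
  };

  int main(void)
  {
          static unsigned char page[4096];
          struct rx_buf_model b = {
                  .page = page,
                  .dma_addr = 0x10000000ULL, /* made-up bus address  */
                  .page_offset = 704,        /* e.g. after one reuse */
                  .buf_offset = 64,          /* headroom             */
          };

          unsigned char *cpu_data = b.page + b.page_offset + b.buf_offset;
          uint64_t dma_data = b.dma_addr + b.page_offset + b.buf_offset;

          printf("payload: CPU offset %td, DMA 0x%llx\n",
                 cpu_data - b.page, (unsigned long long)dma_data);
          return 0;
  }
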