diff mbox series

[net-next,v2,6/8] net/funeth: add the data path

Message ID 20211231090833.98977-7-dmichail@fungible.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series new Fungible Ethernet driver | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 10 maintainers not CCed: andrii@kernel.org hawk@kernel.org kpsingh@kernel.org daniel@iogearbox.net john.fastabend@gmail.com kafai@fb.com songliubraving@fb.com bpf@vger.kernel.org ast@kernel.org yhs@fb.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Lines should not end with a '(' CHECK: Macro argument 'counter' may be better as '(counter)' to avoid precedence issues WARNING: From:/Signed-off-by: email address mismatch: 'From: Dimitris Michailidis <d.michailidis@fungible.com>' != 'Signed-off-by: Dimitris Michailidis <dmichail@fungible.com>' WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 11

Commit Message

Dimitris Michailidis Dec. 31, 2021, 9:08 a.m. UTC
Add the driver's data path. Tx handles skbs, XDP, and kTLS, Rx has skbs
and XDP. Also included are Rx and Tx queue creation/tear-down and
tracing.

Signed-off-by: Dimitris Michailidis <dmichail@fungible.com>
---
 .../net/ethernet/fungible/funeth/funeth_rx.c  | 725 ++++++++++++++++++
 .../ethernet/fungible/funeth/funeth_trace.h   | 117 +++
 .../net/ethernet/fungible/funeth/funeth_tx.c  | 701 +++++++++++++++++
 .../ethernet/fungible/funeth/funeth_txrx.h    | 242 ++++++
 4 files changed, 1785 insertions(+)
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth_rx.c
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth_trace.h
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth_tx.c
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth_txrx.h

Comments

Andrew Lunn Jan. 1, 2022, 5:57 p.m. UTC | #1
> +++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c

> +/* See if a page is running low on refs we are holding and if so take more. */
> +static inline void refresh_refs(struct funeth_rxbuf *buf)
> +{
> +	if (unlikely(buf->pg_refs < MIN_PAGE_REFS)) {
> +		buf->pg_refs += EXTRA_PAGE_REFS;
> +		page_ref_add(buf->page, EXTRA_PAGE_REFS);
> +	}
> +}

No inline functions in C files please. Let the compiler decide. Please
check for this in the whole driver.

> +/* A CQE contains a fixed completion structure along with optional metadata and
> + * even packet data. Given the start address of a CQE return the start of the
> + * contained fixed structure, which lies at the end.
> + */
> +static inline const void *cqe_to_info(const void *cqe)
> +{
> +	return cqe + FUNETH_CQE_INFO_OFFSET;
> +}
> +
> +/* The inverse of cqe_to_info(). */
> +static inline const void *info_to_cqe(const void *cqe_info)
> +{
> +	return cqe_info - FUNETH_CQE_INFO_OFFSET;
> +}

This looks pretty brittle. Can you define a struct cqe, so avoid all
this arithmetic on void * pointers?

> +static unsigned int write_pkt_desc(struct sk_buff *skb, struct funeth_txq *q,
> +				   unsigned int tls_len)
> +{
> +	unsigned int idx = q->prod_cnt & q->mask;
> +	struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);

You need to move the assignment into the body to keep with reverse
Christmas tree.

> +	if (txq_to_end(q, gle) == 0) {
> +		gle = (struct fun_dataop_gl *)q->desc;
> +		for ( ; i < ngle; i++, gle++)
> +			fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
> +	}
> +
> +#ifdef CONFIG_TLS_DEVICE

If possible, use if (IS_ENABLED(TLS_DEVICE)), not #ifdef. It will give
you better compile testing, if it works.

> +	if (unlikely(tls_len)) {
> +		struct fun_eth_tls *tls = (struct fun_eth_tls *)gle;
> +		struct fun_ktls_tx_ctx *tls_ctx;
> +
> +		req->len8 += FUNETH_TLS_SZ / 8;
> +		req->flags = cpu_to_be16(FUN_ETH_TX_TLS);
> +
> +		tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
> +		tls->tlsid = tls_ctx->tlsid;
> +		tls_ctx->next_seq += tls_len;
> +
> +		u64_stats_update_begin(&q->syncp);
> +		q->stats.tx_tls_bytes += tls_len;
> +		q->stats.tx_tls_pkts += 1 + extra_pkts;
> +		u64_stats_update_end(&q->syncp);
> +	}
> +#endif

> +/* Create a Tx queue, allocating all the host and device resources needed. */
> +struct funeth_txq *funeth_txq_create(struct net_device *dev, unsigned int qidx,
> +				     unsigned int ndesc, struct fun_irq *irq)
> +{
> +	struct funeth_priv *fp = netdev_priv(dev);
> +	unsigned int ethid = fp->ethid_start + qidx;
> +	int numa_node, err = -ENOMEM;
> +	struct funeth_txq *q;
> +	const char *qtype;

...

> +	netif_info(fp, ifup, dev,
> +		   "%s queue %u, depth %u, HW qid %u, IRQ idx %u, node %d\n",
> +		   qtype, qidx, ndesc, q->hw_qid, q->irq_idx, numa_node);

Probably should be _dbg(). We try not to spam the kernel log.

> +struct funeth_txq_stats {  /* per Tx queue SW counters */
> +	u64 tx_pkts;       /* # of Tx packets */
> +	u64 tx_bytes;      /* total bytes of Tx packets */
> +	u64 tx_cso;        /* # of packets with checksum offload */
> +	u64 tx_tso;        /* # of TSO super-packets */
> +	u64 tx_more;       /* # of DBs elided due to xmit_more */
> +	u64 tx_nstops;     /* # of times the queue has stopped */
> +	u64 tx_nrestarts;  /* # of times the queue has restarted */
> +	u64 tx_map_err;    /* # of packets dropped due to DMA mapping errors */
> +	u64 tx_len_err;    /* # of packets dropped due to unsupported length */
> +	u64 tx_xdp_full;   /* # of XDP packets that could not be enqueued */
> +#ifdef CONFIG_TLS_DEVICE
> +	u64 tx_tls_pkts;   /* # of Tx TLS packets offloaded to HW */
> +	u64 tx_tls_bytes;  /* Tx bytes of HW-handled TLS payload */
> +	u64 tx_tls_fallback; /* attempted Tx TLS offloads punted to SW */
> +	u64 tx_tls_drops;  /* attempted Tx TLS offloads dropped */
> +#endif

You might want to think about this, and kAPI stability. I assume it
implies a different number of statistics are returned by ethtool -S,
depending on how the driver is built. That could be surprising for
user space. It might be better to always have the statistics, and just
return 0 when TLS is not available.

       Andrew
Dimitris Michailidis Jan. 3, 2022, 8:45 p.m. UTC | #2
On Sat, Jan 1, 2022 at 9:57 AM Andrew Lunn <andrew@lunn.ch> wrote:
>
> > +++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
>
> > +/* See if a page is running low on refs we are holding and if so take more. */
> > +static inline void refresh_refs(struct funeth_rxbuf *buf)
> > +{
> > +     if (unlikely(buf->pg_refs < MIN_PAGE_REFS)) {
> > +             buf->pg_refs += EXTRA_PAGE_REFS;
> > +             page_ref_add(buf->page, EXTRA_PAGE_REFS);
> > +     }
> > +}
>
> No inline functions in C files please. Let the compiler decide. Please
> check for this in the whole driver.

Removed.

>
> > +/* A CQE contains a fixed completion structure along with optional metadata and
> > + * even packet data. Given the start address of a CQE return the start of the
> > + * contained fixed structure, which lies at the end.
> > + */
> > +static inline const void *cqe_to_info(const void *cqe)
> > +{
> > +     return cqe + FUNETH_CQE_INFO_OFFSET;
> > +}
> > +
> > +/* The inverse of cqe_to_info(). */
> > +static inline const void *info_to_cqe(const void *cqe_info)
> > +{
> > +     return cqe_info - FUNETH_CQE_INFO_OFFSET;
> > +}
>
> This looks pretty brittle. Can you define a struct cqe, so avoid all
> this arithmetic on void * pointers?

The top and bottom parts of a CQE are defined by FW (fun_eth_cqe and
fun_cqe_info) and there's a variable number of unused bytes in the middle
determined by the driver's choice of descriptor size. The driver would need
to define a struct with FUNETH_CQE_INFO_OFFSET - sizeof(fun_cqe_info)
space in the middle. Certainly doable but from a quick prototyping it
looks uglier to me.

This is for these queues that have a compile-time descriptor size
and known top part. The funcore library module does similar
info<->cqe conversions in a somewhat different form but
because it's a library it doesn't know the desc size at compile time
and wouldn't be able to define the structure. The two modules would
deviate in how they handle this while today their difference is one uses
a compile-time offset and the other a run-time one.


> > +static unsigned int write_pkt_desc(struct sk_buff *skb, struct funeth_txq *q,
> > +                                unsigned int tls_len)
> > +{
> > +     unsigned int idx = q->prod_cnt & q->mask;
> > +     struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
>
> You need to move the assignment into the body to keep with reverse
> Christmas tree.

Done

> > +     if (txq_to_end(q, gle) == 0) {
> > +             gle = (struct fun_dataop_gl *)q->desc;
> > +             for ( ; i < ngle; i++, gle++)
> > +                     fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
> > +     }
> > +
> > +#ifdef CONFIG_TLS_DEVICE
>
> If possible, use if (IS_ENABLED(TLS_DEVICE)), not #ifdef. It will give
> you better compile testing, if it works.

Done

> > +     if (unlikely(tls_len)) {
> > +             struct fun_eth_tls *tls = (struct fun_eth_tls *)gle;
> > +             struct fun_ktls_tx_ctx *tls_ctx;
> > +
> > +             req->len8 += FUNETH_TLS_SZ / 8;
> > +             req->flags = cpu_to_be16(FUN_ETH_TX_TLS);
> > +
> > +             tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
> > +             tls->tlsid = tls_ctx->tlsid;
> > +             tls_ctx->next_seq += tls_len;
> > +
> > +             u64_stats_update_begin(&q->syncp);
> > +             q->stats.tx_tls_bytes += tls_len;
> > +             q->stats.tx_tls_pkts += 1 + extra_pkts;
> > +             u64_stats_update_end(&q->syncp);
> > +     }
> > +#endif
>
> > +/* Create a Tx queue, allocating all the host and device resources needed. */
> > +struct funeth_txq *funeth_txq_create(struct net_device *dev, unsigned int qidx,
> > +                                  unsigned int ndesc, struct fun_irq *irq)
> > +{
> > +     struct funeth_priv *fp = netdev_priv(dev);
> > +     unsigned int ethid = fp->ethid_start + qidx;
> > +     int numa_node, err = -ENOMEM;
> > +     struct funeth_txq *q;
> > +     const char *qtype;
>
> ...
>
> > +     netif_info(fp, ifup, dev,
> > +                "%s queue %u, depth %u, HW qid %u, IRQ idx %u, node %d\n",
> > +                qtype, qidx, ndesc, q->hw_qid, q->irq_idx, numa_node);
>
> Probably should be _dbg(). We try not to spam the kernel log.

One needs to set msglvl in ethtool to get this. msglvl defaults to 0 so it
doesn't log if not requested.

> > +struct funeth_txq_stats {  /* per Tx queue SW counters */
> > +     u64 tx_pkts;       /* # of Tx packets */
> > +     u64 tx_bytes;      /* total bytes of Tx packets */
> > +     u64 tx_cso;        /* # of packets with checksum offload */
> > +     u64 tx_tso;        /* # of TSO super-packets */
> > +     u64 tx_more;       /* # of DBs elided due to xmit_more */
> > +     u64 tx_nstops;     /* # of times the queue has stopped */
> > +     u64 tx_nrestarts;  /* # of times the queue has restarted */
> > +     u64 tx_map_err;    /* # of packets dropped due to DMA mapping errors */
> > +     u64 tx_len_err;    /* # of packets dropped due to unsupported length */
> > +     u64 tx_xdp_full;   /* # of XDP packets that could not be enqueued */
> > +#ifdef CONFIG_TLS_DEVICE
> > +     u64 tx_tls_pkts;   /* # of Tx TLS packets offloaded to HW */
> > +     u64 tx_tls_bytes;  /* Tx bytes of HW-handled TLS payload */
> > +     u64 tx_tls_fallback; /* attempted Tx TLS offloads punted to SW */
> > +     u64 tx_tls_drops;  /* attempted Tx TLS offloads dropped */
> > +#endif
>
> You might want to think about this, and kAPI stability. I assume it
> implies a different number of statistics are returned by ethtool -S,
> depending on how the driver is built. That could be surprising for
> user space. It might be better to always have the statistics, and just
> return 0 when TLS is not available.

Yes, ethtool -S has them based on config. Perfectly happy to always
have them. Will change it.

>
>        Andrew
diff mbox series

Patch

diff --git a/drivers/net/ethernet/fungible/funeth/funeth_rx.c b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
new file mode 100644
index 000000000000..56dbc0406e1c
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
@@ -0,0 +1,725 @@ 
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+
+#include <linux/bpf_trace.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/skbuff.h>
+#include "funeth_txrx.h"
+#include "funeth.h"
+#include "fun_queue.h"
+
+#define CREATE_TRACE_POINTS
+#include "funeth_trace.h"
+
+/* Given the device's max supported MTU and pages of at least 4KB a packet can
+ * be scattered into at most 4 buffers.
+ */
+#define RX_MAX_FRAGS 4
+
+/* Per packet headroom in non-XDP mode. Present only for 1-frag packets. */
+#define FUN_RX_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
+
+/* We try to reuse pages for our buffers. To avoid frequent page ref writes we
+ * take EXTRA_PAGE_REFS references at once and then hand them out one per packet
+ * occupying the buffer.
+ */
+#define EXTRA_PAGE_REFS 1000000
+#define MIN_PAGE_REFS 1000
+
+enum {
+	FUN_XDP_FLUSH_REDIR = 1,
+	FUN_XDP_FLUSH_TX = 2,
+};
+
+/* See if a page is running low on refs we are holding and if so take more. */
+static inline void refresh_refs(struct funeth_rxbuf *buf)
+{
+	if (unlikely(buf->pg_refs < MIN_PAGE_REFS)) {
+		buf->pg_refs += EXTRA_PAGE_REFS;
+		page_ref_add(buf->page, EXTRA_PAGE_REFS);
+	}
+}
+
+/* Offer a buffer to the Rx buffer cache. The cache will hold the buffer if its
+ * page is worth retaining and there's room for it. Otherwise the page is
+ * unmapped and our references released.
+ */
+static void cache_offer(struct funeth_rxq *q, const struct funeth_rxbuf *buf)
+{
+	struct funeth_rx_cache *c = &q->cache;
+
+	if (c->prod_cnt - c->cons_cnt <= c->mask && buf->node == numa_mem_id()) {
+		c->bufs[c->prod_cnt & c->mask] = *buf;
+		c->prod_cnt++;
+	} else {
+		dma_unmap_page_attrs(q->dma_dev, buf->dma_addr, PAGE_SIZE,
+				     DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+		__page_frag_cache_drain(buf->page, buf->pg_refs);
+	}
+}
+
+/* Get a page from the Rx buffer cache. We only consider the next available
+ * page and return it if we own all its references.
+ */
+static bool cache_get(struct funeth_rxq *q, struct funeth_rxbuf *rb)
+{
+	struct funeth_rx_cache *c = &q->cache;
+	struct funeth_rxbuf *buf;
+
+	if (c->prod_cnt == c->cons_cnt)
+		return false;             /* empty cache */
+
+	buf = &c->bufs[c->cons_cnt & c->mask];
+	if (page_ref_count(buf->page) == buf->pg_refs) {
+		dma_sync_single_for_device(q->dma_dev, buf->dma_addr,
+					   PAGE_SIZE, DMA_FROM_DEVICE);
+		*rb = *buf;
+		buf->page = NULL;
+		refresh_refs(rb);
+		c->cons_cnt++;
+		return true;
+	}
+
+	/* Page can't be reused. If the cache is full drop this page. */
+	if (c->prod_cnt - c->cons_cnt > c->mask) {
+		dma_unmap_page_attrs(q->dma_dev, buf->dma_addr, PAGE_SIZE,
+				     DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+		__page_frag_cache_drain(buf->page, buf->pg_refs);
+		buf->page = NULL;
+		c->cons_cnt++;
+	}
+	return false;
+}
+
+/* Allocate and DMA-map a page for receive. */
+static int funeth_alloc_page(struct funeth_rxq *q, struct funeth_rxbuf *rb,
+			     int node, gfp_t gfp)
+{
+	struct page *p;
+
+	if (cache_get(q, rb))
+		return 0;
+
+	p = __alloc_pages_node(node, gfp | __GFP_NOWARN, 0);
+	if (unlikely(!p))
+		return -ENOMEM;
+
+	rb->dma_addr = dma_map_page(q->dma_dev, p, 0, PAGE_SIZE,
+				    DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(q->dma_dev, rb->dma_addr))) {
+		FUN_QSTAT_INC(q, rx_map_err);
+		__free_page(p);
+		return -ENOMEM;
+	}
+
+	FUN_QSTAT_INC(q, rx_page_alloc);
+
+	rb->page = p;
+	rb->pg_refs = 1;
+	refresh_refs(rb);
+	rb->node = page_is_pfmemalloc(p) ? -1 : page_to_nid(p);
+	return 0;
+}
+
+static void funeth_free_page(struct funeth_rxq *q, struct funeth_rxbuf *rb)
+{
+	if (rb->page) {
+		dma_unmap_page(q->dma_dev, rb->dma_addr, PAGE_SIZE,
+			       DMA_FROM_DEVICE);
+		__page_frag_cache_drain(rb->page, rb->pg_refs);
+		rb->page = NULL;
+	}
+}
+
+/* Run the XDP program assigned to an Rx queue.
+ * Return %NULL if the buffer is consumed, or the virtual address of the packet
+ * to turn into an skb.
+ */
+static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va,
+			 int ref_ok, struct funeth_txq *xdp_q)
+{
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+	u32 act;
+
+	/* VA includes the headroom, frag size includes headroom + tailroom */
+	xdp_init_buff(&xdp, ALIGN(skb_frag_size(frags), FUN_EPRQ_PKT_ALIGN),
+		      &q->xdp_rxq);
+	xdp_prepare_buff(&xdp, buf_va, FUN_XDP_HEADROOM, skb_frag_size(frags) -
+			 (FUN_RX_TAILROOM + FUN_XDP_HEADROOM), false);
+
+	xdp_prog = READ_ONCE(q->xdp_prog);
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		/* remove headroom, which may not be FUN_XDP_HEADROOM now */
+		skb_frag_size_set(frags, xdp.data_end - xdp.data);
+		skb_frag_off_add(frags, xdp.data - xdp.data_hard_start);
+		goto pass;
+	case XDP_TX:
+		if (unlikely(!ref_ok))
+			goto pass;
+		if (!fun_xdp_tx(xdp_q, xdp.data, xdp.data_end - xdp.data))
+			goto xdp_error;
+		FUN_QSTAT_INC(q, xdp_tx);
+		q->xdp_flush |= FUN_XDP_FLUSH_TX;
+		break;
+	case XDP_REDIRECT:
+		if (unlikely(!ref_ok))
+			goto pass;
+		if (unlikely(xdp_do_redirect(q->netdev, &xdp, xdp_prog)))
+			goto xdp_error;
+		FUN_QSTAT_INC(q, xdp_redir);
+		q->xdp_flush |= FUN_XDP_FLUSH_REDIR;
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(q->netdev, xdp_prog, act);
+xdp_error:
+		q->cur_buf->pg_refs++; /* return frags' page reference */
+		FUN_QSTAT_INC(q, xdp_err);
+		break;
+	case XDP_DROP:
+		q->cur_buf->pg_refs++;
+		FUN_QSTAT_INC(q, xdp_drops);
+		break;
+	}
+	return NULL;
+
+pass:
+	return xdp.data;
+}
+
+/* A CQE contains a fixed completion structure along with optional metadata and
+ * even packet data. Given the start address of a CQE return the start of the
+ * contained fixed structure, which lies at the end.
+ */
+static inline const void *cqe_to_info(const void *cqe)
+{
+	return cqe + FUNETH_CQE_INFO_OFFSET;
+}
+
+/* The inverse of cqe_to_info(). */
+static inline const void *info_to_cqe(const void *cqe_info)
+{
+	return cqe_info - FUNETH_CQE_INFO_OFFSET;
+}
+
+/* Return the type of hash provided by the device based on the L3 and L4
+ * protocols it parsed for the packet.
+ */
+static enum pkt_hash_types cqe_to_pkt_hash_type(u16 pkt_parse)
+{
+	static const enum pkt_hash_types htype_map[] = {
+		PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
+		PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L4,
+		PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3,
+		PKT_HASH_TYPE_NONE, PKT_HASH_TYPE_L3
+	};
+	u16 key;
+
+	/* Build the key from the TCP/UDP and IP/IPv6 bits */
+	key = ((pkt_parse >> FUN_ETH_RX_CV_OL4_PROT_S) & 6) |
+	      ((pkt_parse >> (FUN_ETH_RX_CV_OL3_PROT_S + 1)) & 1);
+
+	return htype_map[key];
+}
+
+/* Each received packet can be scattered across several Rx buffers or can
+ * share a buffer with previously received packets depending on the buffer
+ * and packet sizes and the room available in the most recently used buffer.
+ *
+ * The rules are:
+ * - If the buffer at the head of an RQ has not been used it gets (part of) the
+ *   next incoming packet.
+ * - Otherwise, if the packet fully fits in the buffer's remaining space the
+ *   packet is written there.
+ * - Otherwise, the packet goes into the next Rx buffer.
+ *
+ * This function returns the Rx buffer for a packet or fragment thereof of the
+ * given length. If it isn't @buf it either recycles or frees that buffer
+ * before advancing the queue to the next buffer.
+ *
+ * If called repeatedly with the remaining length of a packet it will walk
+ * through all the buffers containing the packet.
+ */
+static struct funeth_rxbuf *
+get_buf(struct funeth_rxq *q, struct funeth_rxbuf *buf, unsigned int len)
+{
+	if (q->buf_offset + len <= PAGE_SIZE || !q->buf_offset)
+		return buf;            /* @buf holds (part of) the packet */
+
+	/* The packet occupies part of the next buffer. Move there after
+	 * replenishing the current buffer slot either with the spare page or
+	 * by reusing the slot's existing page. Note that if a spare page isn't
+	 * available and the current packet occupies @buf it is a multi-frag
+	 * packet that will be dropped leaving @buf available for reuse.
+	 */
+	if ((page_ref_count(buf->page) == buf->pg_refs &&
+	     buf->node == numa_mem_id()) || !q->spare_buf.page) {
+		dma_sync_single_for_device(q->dma_dev, buf->dma_addr,
+					   PAGE_SIZE, DMA_FROM_DEVICE);
+		refresh_refs(buf);
+	} else {
+		cache_offer(q, buf);
+		*buf = q->spare_buf;
+		q->spare_buf.page = NULL;
+		q->rqes[q->rq_cons & q->rq_mask] =
+			FUN_EPRQ_RQBUF_INIT(buf->dma_addr);
+	}
+	q->buf_offset = 0;
+	q->rq_cons++;
+	return &q->bufs[q->rq_cons & q->rq_mask];
+}
+
+/* Gather the page fragments making up the first Rx packet on @q. Its total
+ * length @tot_len includes optional head- and tail-rooms.
+ *
+ * Return 0 if the device retains ownership of at least some of the pages.
+ * In this case the caller may only copy the packet.
+ *
+ * A non-zero return value gives the caller permission to use references to the
+ * pages, e.g., attach them to skbs. Additionally, if the value is <0 at least
+ * one of the pages is PF_MEMALLOC.
+ *
+ * Regardless of outcome the caller is granted a reference to each of the pages.
+ */
+static int fun_gather_pkt(struct funeth_rxq *q, unsigned int tot_len,
+			  skb_frag_t *frags)
+{
+	struct funeth_rxbuf *buf = q->cur_buf;
+	unsigned int frag_len;
+	int ref_ok = 1;
+
+	for (;;) {
+		buf = get_buf(q, buf, tot_len);
+
+		/* We always keep the RQ full of buffers so before we can give
+		 * one of our pages to the stack we require that we can obtain
+		 * a replacement page. If we can't the packet will either be
+		 * copied or dropped so we can retain ownership of the page and
+		 * reuse it.
+		 */
+		if (!q->spare_buf.page &&
+		    funeth_alloc_page(q, &q->spare_buf, numa_mem_id(),
+				      GFP_ATOMIC | __GFP_MEMALLOC))
+			ref_ok = 0;
+
+		frag_len = min_t(unsigned int, tot_len,
+				 PAGE_SIZE - q->buf_offset);
+		dma_sync_single_for_cpu(q->dma_dev,
+					buf->dma_addr + q->buf_offset,
+					frag_len, DMA_FROM_DEVICE);
+		buf->pg_refs--;
+		if (ref_ok)
+			ref_ok |= buf->node;
+
+		__skb_frag_set_page(frags, buf->page);
+		skb_frag_off_set(frags, q->buf_offset);
+		skb_frag_size_set(frags++, frag_len);
+
+		tot_len -= frag_len;
+		if (!tot_len)
+			break;
+
+		q->buf_offset = PAGE_SIZE;
+	}
+	q->buf_offset = ALIGN(q->buf_offset + frag_len, FUN_EPRQ_PKT_ALIGN);
+	q->cur_buf = buf;
+	return ref_ok;
+}
+
+static inline bool rx_hwtstamp_enabled(const struct net_device *dev)
+{
+	const struct funeth_priv *d = netdev_priv(dev);
+
+	return d->hwtstamp_cfg.rx_filter == HWTSTAMP_FILTER_ALL;
+}
+
+/* Advance the CQ pointers and phase tag to the next CQE. */
+static void advance_cq(struct funeth_rxq *q)
+{
+	if (unlikely(q->cq_head == q->cq_mask)) {
+		q->cq_head = 0;
+		q->phase ^= 1;
+		q->next_cqe_info = cqe_to_info(q->cqes);
+	} else {
+		q->cq_head++;
+		q->next_cqe_info += FUNETH_CQE_SIZE;
+	}
+	prefetch(q->next_cqe_info);
+}
+
+/* Process the packet represented by the head CQE of @q. Gather the packet's
+ * fragments, run it through the optional XDP program, and if needed construct
+ * an skb and pass it to the stack.
+ */
+static void fun_handle_cqe_pkt(struct funeth_rxq *q, struct funeth_txq *xdp_q)
+{
+	const struct fun_eth_cqe *rxreq = info_to_cqe(q->next_cqe_info);
+	unsigned int i, tot_len, pkt_len = be32_to_cpu(rxreq->pkt_len);
+	struct net_device *ndev = q->netdev;
+	skb_frag_t frags[RX_MAX_FRAGS];
+	struct skb_shared_info *si;
+	gro_result_t gro_res;
+	struct sk_buff *skb;
+	int ref_ok;
+	void *va;
+	u16 cv;
+
+	u64_stats_update_begin(&q->syncp);
+	q->stats.rx_pkts++;
+	q->stats.rx_bytes += pkt_len;
+	u64_stats_update_end(&q->syncp);
+
+	advance_cq(q);
+
+	/* account for head- and tail-room, present only for 1-buffer packets */
+	tot_len = pkt_len + q->extraroom;
+	if (unlikely(tot_len > PAGE_SIZE))
+		tot_len = pkt_len;
+
+	ref_ok = fun_gather_pkt(q, tot_len, frags);
+	va = skb_frag_address(frags);
+	if (xdp_q) {
+		va = fun_run_xdp(q, frags, va, ref_ok, xdp_q);
+		if (!va)
+			return;
+	}
+	if (unlikely(!ref_ok))
+		goto no_mem;
+
+	if (likely(rxreq->headroom == cpu_to_be16(FUN_RX_HEADROOM))) {
+		prefetch(va + FUN_RX_HEADROOM);
+		skb = build_skb(va, ALIGN(tot_len, FUN_EPRQ_PKT_ALIGN));
+		if (unlikely(!skb))
+			goto no_mem;
+
+		skb_reserve(skb, FUN_RX_HEADROOM);
+		__skb_put(skb, pkt_len);
+		skb->protocol = eth_type_trans(skb, ndev);
+	} else {
+		prefetch(va);
+		skb = napi_get_frags(q->napi);
+		if (unlikely(!skb))
+			goto no_mem;
+
+		if (ref_ok < 0)
+			skb->pfmemalloc = 1;
+
+		si = skb_shinfo(skb);
+		si->nr_frags = rxreq->nsgl;
+		for (i = 0; i < si->nr_frags; i++)
+			si->frags[i] = frags[i];
+
+		skb->len = pkt_len;
+		skb->data_len = pkt_len;
+		skb->truesize += round_up(pkt_len, FUN_EPRQ_PKT_ALIGN);
+	}
+
+	skb_record_rx_queue(skb, q->qidx);
+	cv = be16_to_cpu(rxreq->pkt_cv);
+	if (likely((q->netdev->features & NETIF_F_RXHASH) && rxreq->hash))
+		skb_set_hash(skb, be32_to_cpu(rxreq->hash),
+			     cqe_to_pkt_hash_type(cv));
+	if (likely((q->netdev->features & NETIF_F_RXCSUM) && rxreq->csum)) {
+		FUN_QSTAT_INC(q, rx_cso);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		skb->csum_level = be16_to_cpu(rxreq->csum) - 1;
+	}
+	if (unlikely(rx_hwtstamp_enabled(q->netdev)))
+		skb_hwtstamps(skb)->hwtstamp = be64_to_cpu(rxreq->timestamp);
+
+	trace_funeth_rx(q, rxreq->nsgl, pkt_len, skb->hash, cv);
+
+	gro_res = skb->data_len ? napi_gro_frags(q->napi) :
+				  napi_gro_receive(q->napi, skb);
+	if (gro_res == GRO_MERGED || gro_res == GRO_MERGED_FREE)
+		FUN_QSTAT_INC(q, gro_merged);
+	else if (gro_res == GRO_HELD)
+		FUN_QSTAT_INC(q, gro_pkts);
+	return;
+
+no_mem:
+	FUN_QSTAT_INC(q, rx_mem_drops);
+
+	/* Release the references we've been granted for the frag pages.
+	 * We return the ref of the last frag and free the rest.
+	 */
+	q->cur_buf->pg_refs++;
+	for (i = 0; i < rxreq->nsgl - 1; i++)
+		__free_page(skb_frag_page(frags + i));
+}
+
+/* Return 0 if the phase tag of the CQE at the CQ's head matches expectations
+ * indicating the CQE is new.
+ */
+static inline u16 cqe_phase_mismatch(const struct fun_cqe_info *ci, u16 phase)
+{
+	u16 sf_p = be16_to_cpu(ci->sf_p);
+
+	return (sf_p & 1) ^ phase;
+}
+
+/* Walk through a CQ identifying and processing fresh CQEs up to the given
+ * budget. Return the remaining budget.
+ */
+static int fun_process_cqes(struct funeth_rxq *q, int budget)
+{
+	struct funeth_txq *xdp_q = NULL;
+
+	if (READ_ONCE(q->xdp_prog)) {
+		struct funeth_priv *fp = netdev_priv(q->netdev);
+
+		xdp_q = fp->xdpqs[smp_processor_id()];
+	}
+
+	while (budget && !cqe_phase_mismatch(q->next_cqe_info, q->phase)) {
+		/* access other descriptor fields after the phase check */
+		dma_rmb();
+
+		fun_handle_cqe_pkt(q, xdp_q);
+		budget--;
+	}
+
+	if (unlikely(q->xdp_flush)) {
+		if (q->xdp_flush & FUN_XDP_FLUSH_TX)
+			fun_txq_wr_db(xdp_q);
+		if (q->xdp_flush & FUN_XDP_FLUSH_REDIR)
+			xdp_do_flush();
+		q->xdp_flush = 0;
+	}
+
+	return budget;
+}
+
+/* NAPI handler for Rx queues. Calls the CQE processing loop and writes RQ/CQ
+ * doorbells as needed.
+ */
+int fun_rxq_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct fun_irq *irq = container_of(napi, struct fun_irq, napi);
+	struct funeth_rxq *q = irq->rxq;
+	int work_done = budget - fun_process_cqes(q, budget);
+	u32 cq_db_val = q->cq_head;
+
+	if (unlikely(work_done >= budget))
+		FUN_QSTAT_INC(q, rx_budget);
+	else if (napi_complete_done(napi, work_done))
+		cq_db_val |= q->irq_db_val;
+
+	/* check whether to post new Rx buffers */
+	if (q->rq_cons - q->rq_cons_db >= q->rq_db_thres) {
+		u64_stats_update_begin(&q->syncp);
+		q->stats.rx_bufs += q->rq_cons - q->rq_cons_db;
+		u64_stats_update_end(&q->syncp);
+		q->rq_cons_db = q->rq_cons;
+		writel((q->rq_cons - 1) & q->rq_mask, q->rq_db);
+	}
+
+	writel(cq_db_val, q->cq_db);
+	return work_done;
+}
+
+/* Free the Rx buffers of an Rx queue. */
+static void fun_rxq_free_bufs(struct funeth_rxq *q)
+{
+	struct funeth_rxbuf *b = q->bufs;
+	unsigned int i;
+
+	for (i = 0; i <= q->rq_mask; i++, b++)
+		funeth_free_page(q, b);
+
+	funeth_free_page(q, &q->spare_buf);
+	q->cur_buf = NULL;
+}
+
+/* Initially provision an Rx queue with Rx buffers. */
+static int fun_rxq_alloc_bufs(struct funeth_rxq *q, int node)
+{
+	struct funeth_rxbuf *b = q->bufs;
+	unsigned int i;
+
+	for (i = 0; i <= q->rq_mask; i++, b++) {
+		if (funeth_alloc_page(q, b, node, GFP_KERNEL)) {
+			fun_rxq_free_bufs(q);
+			return -ENOMEM;
+		}
+		q->rqes[i] = FUN_EPRQ_RQBUF_INIT(b->dma_addr);
+	}
+	q->cur_buf = q->bufs;
+	return 0;
+}
+
+/* Initialize a used-buffer cache of the given depth. */
+static int fun_rxq_init_cache(struct funeth_rx_cache *c, unsigned int depth,
+			      int node)
+{
+	c->mask = depth - 1;
+	c->bufs = kvzalloc_node(depth * sizeof(*c->bufs), GFP_KERNEL, node);
+	return c->bufs ? 0 : -ENOMEM;
+}
+
+/* Deallocate an Rx queue's used-buffer cache and its contents. */
+static void fun_rxq_free_cache(struct funeth_rxq *q)
+{
+	struct funeth_rxbuf *b = q->cache.bufs;
+	unsigned int i;
+
+	for (i = 0; i <= q->cache.mask; i++, b++)
+		funeth_free_page(q, b);
+
+	kvfree(q->cache.bufs);
+	q->cache.bufs = NULL;
+}
+
+/* Create an Rx queue, allocating all the host and device resources needed. */
+struct funeth_rxq *funeth_rxq_create(struct net_device *dev, unsigned int qidx,
+				     unsigned int ncqe, unsigned int nrqe,
+				     struct fun_irq *irq)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int irq_idx = irq->irq_idx;
+	unsigned int headroom, tailroom;
+	struct funeth_rxq *q;
+	int err = -ENOMEM;
+	int numa_node;
+
+	numa_node = fun_irq_node(irq);
+	q = kzalloc_node(sizeof(*q), GFP_KERNEL, numa_node);
+	if (!q)
+		goto err;
+
+	q->netdev = dev;
+	q->napi = &irq->napi;
+	q->dma_dev = &fp->pdev->dev;
+	q->cq_mask = ncqe - 1;
+	q->rq_mask = nrqe - 1;
+	q->qidx = qidx;
+
+	q->rqes = fun_alloc_ring_mem(q->dma_dev, nrqe, sizeof(*q->rqes),
+				     sizeof(*q->bufs), false, numa_node,
+				     &q->rq_dma_addr, (void **)&q->bufs, NULL);
+	if (!q->rqes)
+		goto free_q;
+
+	q->cqes = fun_alloc_ring_mem(q->dma_dev, ncqe, FUNETH_CQE_SIZE, 0,
+				     false, numa_node, &q->cq_dma_addr, NULL,
+				     NULL);
+	if (!q->cqes)
+		goto free_rqes;
+
+	err = fun_rxq_init_cache(&q->cache, nrqe, numa_node);
+	if (err)
+		goto free_cqes;
+
+	q->xdp_prog = fp->xdp_prog;
+	headroom = fp->xdp_prog ? FUN_XDP_HEADROOM : FUN_RX_HEADROOM;
+	tailroom = FUN_RX_TAILROOM;
+	q->extraroom = headroom + tailroom;
+
+	err = fun_sq_create(fp->fdev, FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR |
+			    FUN_ADMIN_EPSQ_CREATE_FLAG_RQ, 0,
+			    FUN_HCI_ID_INVALID, 0, nrqe, q->rq_dma_addr, 0, 0,
+			    0, 0, fp->fdev->kern_end_qid, PAGE_SHIFT,
+			    &q->hw_sqid, &q->rq_db);
+	if (err)
+		goto free_cache;
+
+	err = fun_cq_create(fp->fdev, FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR |
+			    FUN_ADMIN_EPCQ_CREATE_FLAG_RQ, 0,
+			    q->hw_sqid, ilog2(FUNETH_CQE_SIZE), ncqe,
+			    q->cq_dma_addr, headroom, tailroom, 0, 0, irq_idx,
+			    0, fp->fdev->kern_end_qid, &q->hw_cqid, &q->cq_db);
+	if (err)
+		goto free_rq;
+
+	err = fun_rxq_alloc_bufs(q, numa_node);
+	if (err)
+		goto free_cq;
+
+	q->phase = 1;
+	q->irq_idx = irq_idx;
+	q->rq_db_thres = nrqe / 4;
+	q->next_cqe_info = cqe_to_info(q->cqes);
+	q->stats.rx_bufs = q->rq_mask;
+	u64_stats_init(&q->syncp);
+
+	err = xdp_rxq_info_reg(&q->xdp_rxq, dev, qidx, irq->napi.napi_id);
+	if (err)
+		goto free_bufs;
+
+	err = xdp_rxq_info_reg_mem_model(&q->xdp_rxq, MEM_TYPE_PAGE_SHARED,
+					 NULL);
+	if (err)
+		goto xdp_unreg;
+
+	irq->rxq = q;
+	writel(q->rq_mask, q->rq_db);
+
+	netif_info(fp, ifup, dev,
+		   "Rx queue %u, depth %u/%u, HW qid %u/%u, IRQ idx %u, node %d, headroom %u\n",
+		   qidx, ncqe, nrqe, q->hw_cqid, q->hw_sqid, irq_idx,
+		   numa_node, headroom);
+	return q;
+
+xdp_unreg:
+	xdp_rxq_info_unreg(&q->xdp_rxq);
+free_bufs:
+	fun_rxq_free_bufs(q);
+free_cq:
+	fun_destroy_cq(fp->fdev, q->hw_cqid);
+free_rq:
+	fun_destroy_sq(fp->fdev, q->hw_sqid);
+free_cache:
+	fun_rxq_free_cache(q);
+free_cqes:
+	dma_free_coherent(q->dma_dev, ncqe * FUNETH_CQE_SIZE, q->cqes,
+			  q->cq_dma_addr);
+free_rqes:
+	fun_free_ring_mem(q->dma_dev, nrqe, sizeof(*q->rqes), false, q->rqes,
+			  q->rq_dma_addr, q->bufs);
+free_q:
+	kfree(q);
+err:
+	netdev_err(dev, "Failed to allocate Rx queue %u, error %d\n", qidx,
+		   err);
+	return ERR_PTR(err);
+}
+
+/* Destroy and free an Rx queue. */
+int funeth_rxq_free(struct funeth_rxq *q)
+{
+	struct funeth_priv *fp = netdev_priv(q->netdev);
+	int err, err2;
+
+	err = fun_destroy_sq(fp->fdev, q->hw_sqid);
+	err2 = fun_destroy_cq(fp->fdev, q->hw_cqid);
+	if (!err)
+		err = err2;
+
+	fun_rxq_free_cache(q);
+	fun_rxq_free_bufs(q);
+	xdp_rxq_info_unreg(&q->xdp_rxq);
+	fun_free_ring_mem(q->dma_dev, q->rq_mask + 1, sizeof(*q->rqes), false,
+			  q->rqes, q->rq_dma_addr, q->bufs);
+	dma_free_coherent(q->dma_dev, (q->cq_mask + 1) * FUNETH_CQE_SIZE,
+			  q->cqes, q->cq_dma_addr);
+
+	/* Before freeing the queue transfer key counters to the device. */
+	fp->rx_packets += q->stats.rx_pkts;
+	fp->rx_bytes   += q->stats.rx_bytes;
+	fp->rx_dropped += q->stats.rx_map_err + q->stats.rx_mem_drops;
+
+	netif_info(fp, ifdown, q->netdev,
+		   "Freeing Rx queue %u (id %u/%u), IRQ %u, err %d\n",
+		   q->qidx, q->hw_cqid, q->hw_sqid, q->irq_idx, err);
+	kfree(q);
+	return err;
+}
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_trace.h b/drivers/net/ethernet/fungible/funeth/funeth_trace.h
new file mode 100644
index 000000000000..9e58dfec19d5
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth_trace.h
@@ -0,0 +1,117 @@ 
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM funeth
+
+#if !defined(_TRACE_FUNETH_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FUNETH_H
+
+#include <linux/tracepoint.h>
+
+#include "funeth_txrx.h"
+
+TRACE_EVENT(funeth_tx,
+
+	TP_PROTO(const struct funeth_txq *txq,
+		 u32 len,
+		 u32 sqe_idx,
+		 u32 ngle),
+
+	TP_ARGS(txq, len, sqe_idx, ngle),
+
+	TP_STRUCT__entry(
+		__field(u32, qidx)
+		__field(u32, len)
+		__field(u32, sqe_idx)
+		__field(u32, ngle)
+		__string(devname, txq->netdev->name)
+	),
+
+	TP_fast_assign(
+		__entry->qidx = txq->qidx;
+		__entry->len = len;
+		__entry->sqe_idx = sqe_idx;
+		__entry->ngle = ngle;
+		__assign_str(devname, txq->netdev->name);
+	),
+
+	TP_printk("%s: Txq %u, SQE idx %u, len %u, num GLEs %u",
+		  __get_str(devname), __entry->qidx, __entry->sqe_idx,
+		  __entry->len, __entry->ngle)
+);
+
+TRACE_EVENT(funeth_tx_free,
+
+	TP_PROTO(const struct funeth_txq *txq,
+		 u32 sqe_idx,
+		 u32 num_sqes,
+		 u32 hw_head),
+
+	TP_ARGS(txq, sqe_idx, num_sqes, hw_head),
+
+	TP_STRUCT__entry(
+		__field(u32, qidx)
+		__field(u32, sqe_idx)
+		__field(u32, num_sqes)
+		__field(u32, hw_head)
+		__string(devname, txq->netdev->name)
+	),
+
+	TP_fast_assign(
+		__entry->qidx = txq->qidx;
+		__entry->sqe_idx = sqe_idx;
+		__entry->num_sqes = num_sqes;
+		__entry->hw_head = hw_head;
+		__assign_str(devname, txq->netdev->name);
+	),
+
+	TP_printk("%s: Txq %u, SQE idx %u, SQEs %u, HW head %u",
+		  __get_str(devname), __entry->qidx, __entry->sqe_idx,
+		  __entry->num_sqes, __entry->hw_head)
+);
+
+TRACE_EVENT(funeth_rx,
+
+	TP_PROTO(const struct funeth_rxq *rxq,
+		 u32 num_rqes,
+		 u32 pkt_len,
+		 u32 hash,
+		 u32 cls_vec),
+
+	TP_ARGS(rxq, num_rqes, pkt_len, hash, cls_vec),
+
+	TP_STRUCT__entry(
+		__field(u32, qidx)
+		__field(u32, cq_head)
+		__field(u32, num_rqes)
+		__field(u32, len)
+		__field(u32, hash)
+		__field(u32, cls_vec)
+		__string(devname, rxq->netdev->name)
+	),
+
+	TP_fast_assign(
+		__entry->qidx = rxq->qidx;
+		__entry->cq_head = rxq->cq_head;
+		__entry->num_rqes = num_rqes;
+		__entry->len = pkt_len;
+		__entry->hash = hash;
+		__entry->cls_vec = cls_vec;
+		__assign_str(devname, rxq->netdev->name);
+	),
+
+	TP_printk("%s: Rxq %u, CQ head %u, RQEs %u, len %u, hash %u, CV %#x",
+		  __get_str(devname), __entry->qidx, __entry->cq_head,
+		  __entry->num_rqes, __entry->len, __entry->hash,
+		  __entry->cls_vec)
+);
+
+#endif /* _TRACE_FUNETH_H */
+
+/* Below must be outside protection. */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE funeth_trace
+
+#include <trace/define_trace.h>
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_tx.c b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
new file mode 100644
index 000000000000..f3f4ebf1220a
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
@@ -0,0 +1,701 @@ 
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+
+#include <linux/dma-mapping.h>
+#include <linux/ip.h>
+#include <linux/pci.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <uapi/linux/udp.h>
+#include "funeth.h"
+#include "funeth_txrx.h"
+#include "funeth_trace.h"
+#include "fun_queue.h"
+
+/* minimum packet size supported by HW is 33B */
+#define FUN_TX_MIN_LEN 33
+
+#define FUN_XDP_CLEAN_THRES 32
+#define FUN_XDP_CLEAN_BATCH 16
+
+/* DMA-map a packet and return the (length, DMA_address) pairs for its
+ * segments. If a mapping error occurs -ENOMEM is returned.
+ */
+static int map_skb(const struct sk_buff *skb, struct device *dev,
+		   dma_addr_t *addr, unsigned int *len)
+{
+	const struct skb_shared_info *si;
+	const skb_frag_t *fp, *end;
+
+	*len = skb_headlen(skb);
+	*addr = dma_map_single(dev, skb->data, *len, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, *addr))
+		return -ENOMEM;
+
+	si = skb_shinfo(skb);
+	end = &si->frags[si->nr_frags];
+
+	for (fp = si->frags; fp < end; fp++) {
+		*++len = skb_frag_size(fp);
+		*++addr = skb_frag_dma_map(dev, fp, 0, *len, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, *addr))
+			goto unwind;
+	}
+	return 0;
+
+unwind:
+	while (fp-- > si->frags)
+		dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
+
+	dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
+	return -ENOMEM;
+}
+
+/* Return the address just past the end of a Tx queue's descriptor ring.
+ * It exploits the fact that the HW writeback area is just after the end
+ * of the descriptor ring.
+ */
+static inline void *txq_end(const struct funeth_txq *q)
+{
+	return (void *)q->hw_wb;
+}
+
+/* Return the amount of space within a Tx ring from the given address to the
+ * end.
+ */
+static inline unsigned int txq_to_end(const struct funeth_txq *q, void *p)
+{
+	return txq_end(q) - p;
+}
+
+/* Return the number of Tx descriptors occupied by a Tx request. */
+static inline unsigned int tx_req_ndesc(const struct fun_eth_tx_req *req)
+{
+	return DIV_ROUND_UP(req->len8, FUNETH_SQE_SIZE / 8);
+}
+
+static inline __be16 tcp_hdr_doff_flags(const struct tcphdr *th)
+{
+	return *(__be16 *)&tcp_flag_word(th);
+}
+
+#ifdef CONFIG_TLS_DEVICE
+#include "funeth_ktls.h"
+
+static struct sk_buff *fun_tls_tx(struct sk_buff *skb, struct funeth_txq *q,
+				  unsigned int *tls_len)
+{
+	const struct fun_ktls_tx_ctx *tls_ctx;
+	u32 datalen, seq;
+
+	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	if (!datalen)
+		return skb;
+
+	if (likely(!tls_offload_tx_resync_pending(skb->sk))) {
+		seq = ntohl(tcp_hdr(skb)->seq);
+		tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
+
+		if (likely(tls_ctx->next_seq == seq)) {
+			*tls_len = datalen;
+			return skb;
+		}
+		if (seq - tls_ctx->next_seq < U32_MAX / 4) {
+			tls_offload_tx_resync_request(skb->sk, seq,
+						      tls_ctx->next_seq);
+		}
+	}
+
+	FUN_QSTAT_INC(q, tx_tls_fallback);
+	skb = tls_encrypt_skb(skb);
+	if (!skb)
+		FUN_QSTAT_INC(q, tx_tls_drops);
+
+	return skb;
+}
+#endif
+
+/* Write as many descriptors as needed for the supplied skb starting at the
+ * current producer location. The caller has made certain enough descriptors
+ * are available.
+ *
+ * Returns the number of descriptors written, 0 on error.
+ */
+static unsigned int write_pkt_desc(struct sk_buff *skb, struct funeth_txq *q,
+				   unsigned int tls_len)
+{
+	unsigned int idx = q->prod_cnt & q->mask;
+	struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
+	unsigned int extra_bytes = 0, extra_pkts = 0;
+	const struct skb_shared_info *shinfo;
+	unsigned int lens[MAX_SKB_FRAGS + 1];
+	dma_addr_t addrs[MAX_SKB_FRAGS + 1];
+	struct fun_dataop_gl *gle;
+	const struct tcphdr *th;
+	unsigned int ngle, i;
+	u16 flags;
+
+	if (unlikely(skb->len < FUN_TX_MIN_LEN)) {
+		FUN_QSTAT_INC(q, tx_len_err);
+		return 0;
+	}
+
+	if (unlikely(map_skb(skb, q->dma_dev, addrs, lens))) {
+		FUN_QSTAT_INC(q, tx_map_err);
+		return 0;
+	}
+
+	req->op = FUN_ETH_OP_TX;
+	req->len8 = 0;
+	req->flags = 0;
+	req->suboff8 = offsetof(struct fun_eth_tx_req, dataop);
+	req->repr_idn = 0;
+	req->encap_proto = 0;
+
+	shinfo = skb_shinfo(skb);
+	if (likely(shinfo->gso_size)) {
+		if (skb->encapsulation) {
+			flags = FUN_ETH_OUTER_EN | FUN_ETH_INNER_LSO |
+				FUN_ETH_UPDATE_INNER_L4_CKSUM |
+				FUN_ETH_UPDATE_OUTER_L3_LEN;
+			if (shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
+						SKB_GSO_UDP_TUNNEL_CSUM)) {
+				flags |= FUN_ETH_UPDATE_OUTER_L4_LEN |
+					 FUN_ETH_OUTER_UDP;
+				if (shinfo->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)
+					flags |= FUN_ETH_UPDATE_OUTER_L4_CKSUM;
+			}
+			if (ip_hdr(skb)->version == 4)
+				flags |= FUN_ETH_UPDATE_OUTER_L3_CKSUM;
+			else
+				flags |= FUN_ETH_OUTER_IPV6;
+
+			if (skb->inner_network_header) {
+				if (inner_ip_hdr(skb)->version == 4)
+					flags |= FUN_ETH_UPDATE_INNER_L3_CKSUM |
+						 FUN_ETH_UPDATE_INNER_L3_LEN;
+				else
+					flags |= FUN_ETH_INNER_IPV6 |
+						 FUN_ETH_UPDATE_INNER_L3_LEN;
+			}
+			th = inner_tcp_hdr(skb);
+			fun_eth_offload_init(&req->offload, flags,
+					     shinfo->gso_size,
+					     tcp_hdr_doff_flags(th), 0,
+					     skb_inner_network_offset(skb),
+					     skb_inner_transport_offset(skb),
+					     skb_network_offset(skb),
+					     skb_transport_offset(skb));
+		} else {
+			/* HW considers one set of headers as inner */
+			flags = FUN_ETH_INNER_LSO |
+				FUN_ETH_UPDATE_INNER_L4_CKSUM |
+				FUN_ETH_UPDATE_INNER_L3_LEN;
+			if (shinfo->gso_type & SKB_GSO_TCPV6)
+				flags |= FUN_ETH_INNER_IPV6;
+			else
+				flags |= FUN_ETH_UPDATE_INNER_L3_CKSUM;
+			th = tcp_hdr(skb);
+			fun_eth_offload_init(&req->offload, flags,
+					     shinfo->gso_size,
+					     tcp_hdr_doff_flags(th), 0,
+					     skb_network_offset(skb),
+					     skb_transport_offset(skb), 0, 0);
+		}
+
+		u64_stats_update_begin(&q->syncp);
+		q->stats.tx_tso++;
+		q->stats.tx_cso += shinfo->gso_segs;
+		u64_stats_update_end(&q->syncp);
+
+		extra_pkts = shinfo->gso_segs - 1;
+		extra_bytes = (be16_to_cpu(req->offload.inner_l4_off) +
+			       __tcp_hdrlen(th)) * extra_pkts;
+	} else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+		flags = FUN_ETH_UPDATE_INNER_L4_CKSUM;
+		if (skb->csum_offset == offsetof(struct udphdr, check))
+			flags |= FUN_ETH_INNER_UDP;
+		fun_eth_offload_init(&req->offload, flags, 0, 0, 0, 0,
+				     skb_checksum_start_offset(skb), 0, 0);
+		FUN_QSTAT_INC(q, tx_cso);
+	} else {
+		fun_eth_offload_init(&req->offload, 0, 0, 0, 0, 0, 0, 0, 0);
+	}
+
+	ngle = shinfo->nr_frags + 1;
+	req->len8 = (sizeof(*req) + ngle * sizeof(*gle)) / 8;
+	req->dataop = FUN_DATAOP_HDR_INIT(ngle, 0, ngle, 0, skb->len);
+
+	for (i = 0, gle = (struct fun_dataop_gl *)req->dataop.imm;
+	     i < ngle && txq_to_end(q, gle); i++, gle++)
+		fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
+
+	if (txq_to_end(q, gle) == 0) {
+		gle = (struct fun_dataop_gl *)q->desc;
+		for ( ; i < ngle; i++, gle++)
+			fun_dataop_gl_init(gle, 0, 0, lens[i], addrs[i]);
+	}
+
+#ifdef CONFIG_TLS_DEVICE
+	if (unlikely(tls_len)) {
+		struct fun_eth_tls *tls = (struct fun_eth_tls *)gle;
+		struct fun_ktls_tx_ctx *tls_ctx;
+
+		req->len8 += FUNETH_TLS_SZ / 8;
+		req->flags = cpu_to_be16(FUN_ETH_TX_TLS);
+
+		tls_ctx = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);
+		tls->tlsid = tls_ctx->tlsid;
+		tls_ctx->next_seq += tls_len;
+
+		u64_stats_update_begin(&q->syncp);
+		q->stats.tx_tls_bytes += tls_len;
+		q->stats.tx_tls_pkts += 1 + extra_pkts;
+		u64_stats_update_end(&q->syncp);
+	}
+#endif
+
+	u64_stats_update_begin(&q->syncp);
+	q->stats.tx_bytes += skb->len + extra_bytes;
+	q->stats.tx_pkts += 1 + extra_pkts;
+	u64_stats_update_end(&q->syncp);
+
+	q->info[idx].skb = skb;
+
+	trace_funeth_tx(q, skb->len, idx, req->dataop.ngather);
+	return tx_req_ndesc(req);
+}
+
+/* Return the number of available descriptors of a Tx queue.
+ * HW assumes head==tail means the ring is empty so we need to keep one
+ * descriptor unused.
+ */
+static inline unsigned int fun_txq_avail(const struct funeth_txq *q)
+{
+	return q->mask - q->prod_cnt + q->cons_cnt;
+}
+
+/* Stop a queue if it can't handle another worst-case packet. */
+static void fun_tx_check_stop(struct funeth_txq *q)
+{
+	if (likely(fun_txq_avail(q) >= FUNETH_MAX_PKT_DESC))
+		return;
+
+	netif_tx_stop_queue(q->ndq);
+
+	/* NAPI reclaim is freeing packets in parallel with us and we may race.
+	 * We have stopped the queue but check again after synchronizing with
+	 * reclaim.
+	 */
+	smp_mb();
+	if (likely(fun_txq_avail(q) < FUNETH_MAX_PKT_DESC))
+		FUN_QSTAT_INC(q, tx_nstops);
+	else
+		netif_tx_start_queue(q->ndq);
+}
+
+/* Return true if a queue has enough space to restart. Current condition is
+ * that the queue must be >= 1/4 empty.
+ */
+static bool fun_txq_may_restart(struct funeth_txq *q)
+{
+	return fun_txq_avail(q) >= q->mask / 4;
+}
+
+netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	unsigned int qid = skb_get_queue_mapping(skb);
+	struct funeth_txq *q = fp->txqs[qid];
+	unsigned int tls_len = 0;
+	unsigned int ndesc;
+
+#ifdef CONFIG_TLS_DEVICE
+	if (skb->sk && tls_is_sk_tx_device_offloaded(skb->sk)) {
+		skb = fun_tls_tx(skb, q, &tls_len);
+		if (unlikely(!skb))
+			goto dropped;
+	}
+#endif
+
+	ndesc = write_pkt_desc(skb, q, tls_len);
+	if (unlikely(!ndesc)) {
+		dev_kfree_skb_any(skb);
+		goto dropped;
+	}
+
+	q->prod_cnt += ndesc;
+	fun_tx_check_stop(q);
+
+	skb_tx_timestamp(skb);
+
+	if (__netdev_tx_sent_queue(q->ndq, skb->len, netdev_xmit_more()))
+		fun_txq_wr_db(q);
+	else
+		FUN_QSTAT_INC(q, tx_more);
+
+	return NETDEV_TX_OK;
+
+dropped:
+	/* A dropped packet may be the last one in a xmit_more train,
+	 * ring the doorbell just in case.
+	 */
+	if (!netdev_xmit_more())
+		fun_txq_wr_db(q);
+	return NETDEV_TX_OK;
+}
+
+/* Return a Tx queue's HW head index written back to host memory. */
+static inline u16 txq_hw_head(const struct funeth_txq *q)
+{
+	return (u16)be64_to_cpu(*q->hw_wb);
+}
+
+/* Unmap the Tx packet starting at the given descriptor index and
+ * return the number of Tx descriptors it occupied.
+ */
+static unsigned int unmap_skb(const struct funeth_txq *q, unsigned int idx)
+{
+	const struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
+	unsigned int ngle = req->dataop.ngather;
+	struct fun_dataop_gl *gle;
+
+	if (ngle) {
+		gle = (struct fun_dataop_gl *)req->dataop.imm;
+		dma_unmap_single(q->dma_dev, be64_to_cpu(gle->sgl_data),
+				 be32_to_cpu(gle->sgl_len), DMA_TO_DEVICE);
+
+		for (gle++; --ngle && txq_to_end(q, gle); gle++)
+			dma_unmap_page(q->dma_dev, be64_to_cpu(gle->sgl_data),
+				       be32_to_cpu(gle->sgl_len),
+				       DMA_TO_DEVICE);
+
+		for (gle = (struct fun_dataop_gl *)q->desc; ngle; ngle--, gle++)
+			dma_unmap_page(q->dma_dev, be64_to_cpu(gle->sgl_data),
+				       be32_to_cpu(gle->sgl_len),
+				       DMA_TO_DEVICE);
+	}
+
+	return tx_req_ndesc(req);
+}
+
+/* Reclaim completed Tx descriptors and free their packets. Restart a stopped
+ * queue if we freed enough descriptors.
+ *
+ * Return true if we exhausted the budget while there is more work to be done.
+ */
+static bool fun_txq_reclaim(struct funeth_txq *q, int budget)
+{
+	unsigned int npkts = 0, nbytes = 0, ndesc = 0;
+	unsigned int head, limit, reclaim_idx;
+
+	/* budget may be 0, e.g., netpoll */
+	limit = budget ? budget : UINT_MAX;
+
+	for (head = txq_hw_head(q), reclaim_idx = q->cons_cnt & q->mask;
+	     head != reclaim_idx && npkts < limit; head = txq_hw_head(q)) {
+		/* The HW head is continually updated, ensure we don't read
+		 * descriptor state before the head tells us to reclaim it.
+		 * On the enqueue side the doorbell is an implicit write
+		 * barrier.
+		 */
+		rmb();
+
+		do {
+			unsigned int pkt_desc = unmap_skb(q, reclaim_idx);
+			struct sk_buff *skb = q->info[reclaim_idx].skb;
+
+			trace_funeth_tx_free(q, reclaim_idx, pkt_desc, head);
+
+			nbytes += skb->len;
+			napi_consume_skb(skb, budget);
+			ndesc += pkt_desc;
+			reclaim_idx = (reclaim_idx + pkt_desc) & q->mask;
+			npkts++;
+		} while (reclaim_idx != head && npkts < limit);
+	}
+
+	q->cons_cnt += ndesc;
+	netdev_tx_completed_queue(q->ndq, npkts, nbytes);
+	smp_mb(); /* pairs with the one in fun_tx_check_stop() */
+
+	if (unlikely(netif_tx_queue_stopped(q->ndq) &&
+		     fun_txq_may_restart(q))) {
+		netif_tx_wake_queue(q->ndq);
+		FUN_QSTAT_INC(q, tx_nrestarts);
+	}
+
+	return reclaim_idx != head;
+}
+
+/* The NAPI handler for Tx queues. */
+int fun_txq_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct fun_irq *irq = container_of(napi, struct fun_irq, napi);
+	struct funeth_txq *q = irq->txq;
+	unsigned int db_val;
+
+	if (fun_txq_reclaim(q, budget))
+		return budget;               /* exhausted budget */
+
+	napi_complete(napi);                 /* exhausted pending work */
+	db_val = READ_ONCE(q->irq_db_val) | (q->cons_cnt & q->mask);
+	writel(db_val, q->db);
+	return 0;
+}
+
+static void fun_xdp_unmap(const struct funeth_txq *q, unsigned int idx)
+{
+	const struct fun_eth_tx_req *req = fun_tx_desc_addr(q, idx);
+	const struct fun_dataop_gl *gle;
+
+	gle = (const struct fun_dataop_gl *)req->dataop.imm;
+	dma_unmap_single(q->dma_dev, be64_to_cpu(gle->sgl_data),
+			 be32_to_cpu(gle->sgl_len), DMA_TO_DEVICE);
+}
+
+/* Reclaim up to @budget completed Tx descriptors from a TX XDP queue. */
+static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget)
+{
+	unsigned int npkts = 0, head, reclaim_idx;
+
+	for (head = txq_hw_head(q), reclaim_idx = q->cons_cnt & q->mask;
+	     head != reclaim_idx && npkts < budget; head = txq_hw_head(q)) {
+		/* The HW head is continually updated, ensure we don't read
+		 * descriptor state before the head tells us to reclaim it.
+		 * On the enqueue side the doorbell is an implicit write
+		 * barrier.
+		 */
+		rmb();
+
+		do {
+			fun_xdp_unmap(q, reclaim_idx);
+			page_frag_free(q->info[reclaim_idx].vaddr);
+
+			trace_funeth_tx_free(q, reclaim_idx, 1, head);
+
+			reclaim_idx = (reclaim_idx + 1) & q->mask;
+			npkts++;
+		} while (reclaim_idx != head && npkts < budget);
+	}
+
+	q->cons_cnt += npkts;
+	return npkts;
+}
+
+bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len)
+{
+	struct fun_eth_tx_req *req;
+	struct fun_dataop_gl *gle;
+	unsigned int idx;
+	dma_addr_t dma;
+
+	if (unlikely(len < FUN_TX_MIN_LEN)) {
+		FUN_QSTAT_INC(q, tx_len_err);
+		return false;
+	}
+
+	if (fun_txq_avail(q) < FUN_XDP_CLEAN_THRES)
+		fun_xdpq_clean(q, FUN_XDP_CLEAN_BATCH);
+
+	if (!unlikely(fun_txq_avail(q))) {
+		FUN_QSTAT_INC(q, tx_xdp_full);
+		return false;
+	}
+
+	dma = dma_map_single(q->dma_dev, data, len, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(q->dma_dev, dma))) {
+		FUN_QSTAT_INC(q, tx_map_err);
+		return false;
+	}
+
+	idx = q->prod_cnt & q->mask;
+	req = fun_tx_desc_addr(q, idx);
+	req->op = FUN_ETH_OP_TX;
+	req->len8 = (sizeof(*req) + sizeof(*gle)) / 8;
+	req->flags = 0;
+	req->suboff8 = offsetof(struct fun_eth_tx_req, dataop);
+	req->repr_idn = 0;
+	req->encap_proto = 0;
+	fun_eth_offload_init(&req->offload, 0, 0, 0, 0, 0, 0, 0, 0);
+	req->dataop = FUN_DATAOP_HDR_INIT(1, 0, 1, 0, len);
+
+	gle = (struct fun_dataop_gl *)req->dataop.imm;
+	fun_dataop_gl_init(gle, 0, 0, len, dma);
+
+	q->info[idx].vaddr = data;
+
+	u64_stats_update_begin(&q->syncp);
+	q->stats.tx_bytes += len;
+	q->stats.tx_pkts++;
+	u64_stats_update_end(&q->syncp);
+
+	trace_funeth_tx(q, len, idx, 1);
+	q->prod_cnt++;
+
+	return true;
+}
+
+int fun_xdp_xmit_frames(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct funeth_txq *q;
+	int i, q_idx;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	if (unlikely(!rcu_access_pointer(fp->rxqs)))
+		return -ENETDOWN;
+
+	q_idx = smp_processor_id();
+	if (unlikely(q_idx >= fp->num_xdpqs))
+		return -ENXIO;
+
+	for (q = fp->xdpqs[q_idx], i = 0; i < n; i++) {
+		const struct xdp_frame *xdpf = frames[i];
+
+		if (!fun_xdp_tx(q, xdpf->data, xdpf->len))
+			break;
+	}
+
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		fun_txq_wr_db(q);
+	return i;
+}
+
+/* Purge a Tx queue of any queued packets. Should be called once HW access
+ * to the packets has been revoked, e.g., after the queue has been disabled.
+ */
+static void fun_txq_purge(struct funeth_txq *q)
+{
+	while (q->cons_cnt != q->prod_cnt) {
+		unsigned int idx = q->cons_cnt & q->mask;
+
+		q->cons_cnt += unmap_skb(q, idx);
+		dev_kfree_skb_any(q->info[idx].skb);
+	}
+	netdev_tx_reset_queue(q->ndq);
+}
+
+static void fun_xdpq_purge(struct funeth_txq *q)
+{
+	while (q->cons_cnt != q->prod_cnt) {
+		unsigned int idx = q->cons_cnt & q->mask;
+
+		fun_xdp_unmap(q, idx);
+		page_frag_free(q->info[idx].vaddr);
+		q->cons_cnt++;
+	}
+}
+
+/* Create a Tx queue, allocating all the host and device resources needed. */
+struct funeth_txq *funeth_txq_create(struct net_device *dev, unsigned int qidx,
+				     unsigned int ndesc, struct fun_irq *irq)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int ethid = fp->ethid_start + qidx;
+	int numa_node, err = -ENOMEM;
+	struct funeth_txq *q;
+	const char *qtype;
+
+	if (irq) {
+		/* regular Tx queue */
+		qtype = "Tx";
+		numa_node = fun_irq_node(irq);
+	} else {
+		/* XDP Tx queue */
+		qtype = "XDP";
+		numa_node = cpu_to_node(qidx);
+		ethid += dev->real_num_tx_queues;
+	}
+
+	q = kzalloc_node(sizeof(*q), GFP_KERNEL, numa_node);
+	if (!q)
+		goto err;
+
+	q->irq_db_val = FUN_IRQ_SQ_DB(fp->tx_coal_usec, fp->tx_coal_count);
+	q->netdev = dev;
+	q->dma_dev = &fp->pdev->dev;
+	q->mask = ndesc - 1;
+	q->qidx = qidx;
+	q->irq_idx = irq ? irq->irq_idx : 0;
+	q->ethid = ethid;
+
+	q->desc = fun_alloc_ring_mem(q->dma_dev, ndesc, FUNETH_SQE_SIZE,
+				     sizeof(*q->info), true, numa_node,
+				     &q->dma_addr, (void **)&q->info,
+				     &q->hw_wb);
+	if (!q->desc)
+		goto free_q;
+
+	err = fun_sq_create(fp->fdev,
+			    FUN_ADMIN_EPSQ_CREATE_FLAG_HEAD_WB_ADDRESS |
+			    FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR, 0,
+			    FUN_HCI_ID_INVALID, ilog2(FUNETH_SQE_SIZE), ndesc,
+			    q->dma_addr, fp->tx_coal_count,
+			    fp->tx_coal_usec, q->irq_idx, 0,
+			    fp->fdev->kern_end_qid, 0, &q->hw_qid, &q->db);
+	if (err)
+		goto free_desc;
+
+	err = fun_create_and_bind_tx(fp, ethid, q->hw_qid);
+	if (err)
+		goto free_devq;
+
+	u64_stats_init(&q->syncp);
+	if (irq) {
+		q->ndq = netdev_get_tx_queue(dev, qidx);
+		irq->txq = q;
+		writel(q->irq_db_val, q->db);
+	}
+
+	netif_info(fp, ifup, dev,
+		   "%s queue %u, depth %u, HW qid %u, IRQ idx %u, node %d\n",
+		   qtype, qidx, ndesc, q->hw_qid, q->irq_idx, numa_node);
+	return q;
+
+free_devq:
+	fun_destroy_sq(fp->fdev, q->hw_qid);
+free_desc:
+	fun_free_ring_mem(q->dma_dev, ndesc, FUNETH_SQE_SIZE, true, q->desc,
+			  q->dma_addr, q->info);
+free_q:
+	kfree(q);
+err:
+	netdev_err(dev, "Failed to allocate %s queue %u, error %d\n", qtype,
+		   qidx, err);
+	return ERR_PTR(err);
+}
+
+/* Destroy and free a Tx queue.
+ * The queue must be already disconnected from the stack.
+ */
+int funeth_txq_free(struct funeth_txq *q)
+{
+	struct funeth_priv *fp = netdev_priv(q->netdev);
+	int err;
+
+	err = fun_destroy_sq(fp->fdev, q->hw_qid);
+	if (!err)
+		err = fun_res_destroy(fp->fdev, FUN_ADMIN_OP_ETH, 0, q->ethid);
+
+	if (q->ndq)
+		fun_txq_purge(q);
+	else
+		fun_xdpq_purge(q);
+
+	fun_free_ring_mem(q->dma_dev, q->mask + 1, FUNETH_SQE_SIZE, true,
+			  q->desc, q->dma_addr, q->info);
+
+	fp->tx_packets += q->stats.tx_pkts;
+	fp->tx_bytes   += q->stats.tx_bytes;
+	fp->tx_dropped += q->stats.tx_map_err + q->stats.tx_len_err;
+
+	netif_info(fp, ifdown, q->netdev,
+		   "Freeing %s queue %u (id %u), IRQ %u, ethid %u, err %d\n",
+		   q->ndq ? "Tx" : "XDP", q->qidx, q->hw_qid, q->irq_idx,
+		   q->ethid, err);
+	kfree(q);
+	return err;
+}
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
new file mode 100644
index 000000000000..804c81d028de
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
@@ -0,0 +1,242 @@ 
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+
+#ifndef _FUNETH_TXRX_H
+#define _FUNETH_TXRX_H
+
+#include <linux/netdevice.h>
+#include <linux/u64_stats_sync.h>
+
+/* Tx descriptor size */
+#define FUNETH_SQE_SIZE 64U
+
+/* Size of device headers per Tx packet */
+#define FUNETH_FUNOS_HDR_SZ (sizeof(struct fun_eth_tx_req))
+
+/* Number of gather list entries per Tx descriptor */
+#define FUNETH_GLE_PER_DESC (FUNETH_SQE_SIZE / sizeof(struct fun_dataop_gl))
+
+/* Max gather list size in bytes for an sk_buff. */
+#define FUNETH_MAX_GL_SZ ((MAX_SKB_FRAGS + 1) * sizeof(struct fun_dataop_gl))
+
+#ifdef CONFIG_TLS_DEVICE
+# define FUNETH_TLS_SZ sizeof(struct fun_eth_tls)
+#else
+# define FUNETH_TLS_SZ 0
+#endif
+
+/* Max number of Tx descriptors for an sk_buff using a gather list. */
+#define FUNETH_MAX_GL_DESC \
+	DIV_ROUND_UP((FUNETH_FUNOS_HDR_SZ + FUNETH_MAX_GL_SZ + FUNETH_TLS_SZ), \
+		     FUNETH_SQE_SIZE)
+
+/* Max number of Tx descriptors for any packet. */
+#define FUNETH_MAX_PKT_DESC FUNETH_MAX_GL_DESC
+
+/* Rx CQ descriptor size. */
+#define FUNETH_CQE_SIZE 64U
+
+/* Offset of cqe_info within a CQE. */
+#define FUNETH_CQE_INFO_OFFSET (FUNETH_CQE_SIZE - sizeof(struct fun_cqe_info))
+
+/* Construct the IRQ portion of a CQ doorbell. The resulting value arms the
+ * interrupt with the supplied time delay and packet count moderation settings.
+ */
+#define FUN_IRQ_CQ_DB(usec, pkts) \
+	(FUN_DB_IRQ_ARM_F | ((usec) << FUN_DB_INTCOAL_USEC_S) | \
+	 ((pkts) << FUN_DB_INTCOAL_ENTRIES_S))
+
+/* As above for SQ doorbells. */
+#define FUN_IRQ_SQ_DB(usec, pkts) \
+	(FUN_DB_IRQ_ARM_F | \
+	 ((usec) << FUN_DB_INTCOAL_USEC_S) | \
+	 ((pkts) << FUN_DB_INTCOAL_ENTRIES_S))
+
+/* Per packet tailroom. Present only for 1-frag packets. */
+#define FUN_RX_TAILROOM SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
+
+/* Per packet headroom for XDP. Preferred over XDP_PACKET_HEADROOM to
+ * accommodate two packets per buffer for 4K pages and 1500B MTUs.
+ */
+#define FUN_XDP_HEADROOM 192
+
+struct funeth_txq_stats {  /* per Tx queue SW counters */
+	u64 tx_pkts;       /* # of Tx packets */
+	u64 tx_bytes;      /* total bytes of Tx packets */
+	u64 tx_cso;        /* # of packets with checksum offload */
+	u64 tx_tso;        /* # of TSO super-packets */
+	u64 tx_more;       /* # of DBs elided due to xmit_more */
+	u64 tx_nstops;     /* # of times the queue has stopped */
+	u64 tx_nrestarts;  /* # of times the queue has restarted */
+	u64 tx_map_err;    /* # of packets dropped due to DMA mapping errors */
+	u64 tx_len_err;    /* # of packets dropped due to unsupported length */
+	u64 tx_xdp_full;   /* # of XDP packets that could not be enqueued */
+#ifdef CONFIG_TLS_DEVICE
+	u64 tx_tls_pkts;   /* # of Tx TLS packets offloaded to HW */
+	u64 tx_tls_bytes;  /* Tx bytes of HW-handled TLS payload */
+	u64 tx_tls_fallback; /* attempted Tx TLS offloads punted to SW */
+	u64 tx_tls_drops;  /* attempted Tx TLS offloads dropped */
+#endif
+};
+
+struct funeth_tx_info {      /* per Tx descriptor state */
+	union {
+		struct sk_buff *skb; /* associated packet */
+		void *vaddr;         /* start address for XDP */
+	};
+};
+
+struct funeth_txq {
+	/* RO cacheline of frequently accessed data */
+	u32 mask;               /* queue depth - 1 */
+	u32 hw_qid;             /* device ID of the queue */
+	void *desc;             /* base address of descriptor ring */
+	struct funeth_tx_info *info;
+	struct device *dma_dev; /* device for DMA mappings */
+	volatile __be64 *hw_wb; /* HW write-back location */
+	u32 __iomem *db;        /* SQ doorbell register address */
+	struct netdev_queue *ndq;
+	dma_addr_t dma_addr;    /* DMA address of descriptor ring */
+	/* producer R/W cacheline */
+	u16 qidx;               /* queue index within net_device */
+	u16 irq_idx;            /* IRQ index asserted by queue */
+	u32 prod_cnt;           /* producer counter */
+	struct funeth_txq_stats stats;
+	/* shared R/W cacheline, primarily accessed by consumer */
+	u32 irq_db_val;         /* value written to IRQ doorbell */
+	u32 cons_cnt;           /* consumer (cleanup) counter */
+	struct net_device *netdev;
+	u16 ethid;
+	bool hang_reported;
+	struct u64_stats_sync syncp;
+};
+
+struct funeth_rxq_stats {  /* per Rx queue SW counters */
+	u64 rx_pkts;       /* # of received packets, including SW drops */
+	u64 rx_bytes;      /* total size of received packets */
+	u64 rx_cso;        /* # of packets with checksum offload */
+	u64 rx_bufs;       /* total # of Rx buffers provided to device */
+	u64 gro_pkts;      /* # of GRO superpackets */
+	u64 gro_merged;    /* # of pkts merged into existing GRO superpackets */
+	u64 rx_page_alloc; /* # of page allocations for Rx buffers */
+	u64 rx_budget;     /* NAPI iterations that exhausted their budget */
+	u64 rx_mem_drops;  /* # of packets dropped due to memory shortage */
+	u64 rx_map_err;    /* # of page DMA mapping errors */
+	u64 xdp_drops;     /* XDP_DROPped packets */
+	u64 xdp_tx;        /* successful XDP transmits */
+	u64 xdp_redir;     /* successful XDP redirects */
+	u64 xdp_err;       /* packets dropped due to XDP errors */
+};
+
+struct funeth_rxbuf {          /* per Rx buffer state */
+	struct page *page;     /* associated page */
+	dma_addr_t dma_addr;   /* DMA address of page start */
+	int pg_refs;           /* page refs held by driver */
+	int node;              /* page node, or -1 if it is PF_MEMALLOC */
+};
+
+struct funeth_rx_cache {       /* cache of DMA-mapped previously used buffers */
+	struct funeth_rxbuf *bufs; /* base of Rx buffer state ring */
+	unsigned int prod_cnt;     /* producer counter */
+	unsigned int cons_cnt;     /* consumer counter */
+	unsigned int mask;         /* depth - 1 */
+};
+
+/* An Rx queue consists of a CQ and an SQ used to provide Rx buffers. */
+struct funeth_rxq {
+	struct net_device *netdev;
+	struct napi_struct *napi;
+	struct device *dma_dev;    /* device for DMA mappings */
+	void *cqes;                /* base of CQ descriptor ring */
+	const void *next_cqe_info; /* fun_cqe_info of next CQE */
+	u32 __iomem *cq_db;        /* CQ doorbell register address */
+	unsigned int cq_head;      /* CQ head index */
+	unsigned int cq_mask;      /* CQ depth - 1 */
+	u16 phase;                 /* CQ phase tag */
+	u16 qidx;                  /* queue index within net_device */
+	unsigned int irq_db_val;   /* IRQ info for CQ doorbell */
+	struct fun_eprq_rqbuf *rqes; /* base of RQ descriptor ring */
+	struct funeth_rxbuf *bufs; /* base of Rx buffer state ring */
+	struct funeth_rxbuf *cur_buf; /* currently active buffer */
+	u32 __iomem *rq_db;        /* RQ doorbell register address */
+	unsigned int rq_cons;      /* RQ consumer counter */
+	unsigned int rq_mask;      /* RQ depth - 1 */
+	unsigned int buf_offset;   /* offset of next pkt in head buffer */
+	u8 xdp_flush;              /* XDP flush types needed at NAPI end */
+	unsigned int rq_cons_db;   /* value of rq_cons at last RQ db */
+	unsigned int rq_db_thres;  /* # of new buffers needed to write RQ db */
+	struct funeth_rxbuf spare_buf; /* spare for next buffer replacement */
+	struct funeth_rx_cache cache; /* used buffer cache */
+	struct bpf_prog *xdp_prog; /* optional XDP BPF program */
+	struct funeth_rxq_stats stats;
+	dma_addr_t cq_dma_addr;    /* DMA address of CQE ring */
+	dma_addr_t rq_dma_addr;    /* DMA address of RQE ring */
+	unsigned int extraroom;    /* per packet headroom + tailroom */
+	u16 irq_idx;               /* IRQ index for CQ interrupt */
+	u16 irq_cnt;               /* IRQ counter for DIM */
+	u32 hw_cqid;               /* device ID of the queue's CQ */
+	u32 hw_sqid;               /* device ID of the queue's SQ */
+	struct u64_stats_sync syncp;
+	struct xdp_rxq_info xdp_rxq;
+};
+
+#define FUN_QSTAT_INC(q, counter) \
+	do { \
+		u64_stats_update_begin(&(q)->syncp); \
+		(q)->stats.counter++; \
+		u64_stats_update_end(&(q)->syncp); \
+	} while (0)
+
+#define FUN_QSTAT_READ(q, seq, stats_copy) \
+	do { \
+		seq = u64_stats_fetch_begin(&(q)->syncp); \
+		stats_copy = (q)->stats; \
+	} while (u64_stats_fetch_retry(&(q)->syncp, (seq)))
+
+#define FUN_INT_NAME_LEN (IFNAMSIZ + 16)
+
+struct fun_irq {
+	struct napi_struct napi;
+	struct funeth_txq *txq;
+	struct funeth_rxq *rxq;
+	u16 irq_idx;              /* index of MSI-X interrupt */
+	int irq;                  /* Linux IRQ vector */
+	cpumask_t affinity_mask;  /* IRQ affinity */
+	struct irq_affinity_notify aff_notify;
+	char name[FUN_INT_NAME_LEN];
+} ____cacheline_internodealigned_in_smp;
+
+/* Return the start address of the idx-th Tx descriptor. */
+static inline void *fun_tx_desc_addr(const struct funeth_txq *q,
+				     unsigned int idx)
+{
+	return q->desc + idx * FUNETH_SQE_SIZE;
+}
+
+static inline void fun_txq_wr_db(const struct funeth_txq *q)
+{
+	unsigned int tail = q->prod_cnt & q->mask;
+
+	writel(tail, q->db);
+}
+
+static inline int fun_irq_node(const struct fun_irq *p)
+{
+	return local_memory_node(cpu_to_node(cpumask_first(&p->affinity_mask)));
+}
+
+int fun_rxq_napi_poll(struct napi_struct *napi, int budget);
+int fun_txq_napi_poll(struct napi_struct *napi, int budget);
+netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev);
+bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len);
+int fun_xdp_xmit_frames(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags);
+
+struct funeth_txq *funeth_txq_create(struct net_device *dev, unsigned int qidx,
+				     unsigned int ndesc, struct fun_irq *irq);
+int funeth_txq_free(struct funeth_txq *q);
+struct funeth_rxq *funeth_rxq_create(struct net_device *dev, unsigned int qidx,
+				     unsigned int ncqe, unsigned int nrqe,
+				     struct fun_irq *irq);
+int funeth_rxq_free(struct funeth_rxq *q);
+
+#endif /* _FUNETH_TXRX_H */