
[1/2] add mt7601u driver

Message ID 20150505024949.0a672dd1@north (mailing list archive)
State Not Applicable
Delegated to: Kalle Valo

Commit Message

Jakub Kiciński May 5, 2015, 12:49 a.m. UTC
On Mon, 04 May 2015 12:15:11 +0200, Johannes Berg wrote:
> On Mon, 2015-05-04 at 12:04 +0200, Jakub Kiciński wrote:
> 
> > > Don't know how your buffers are set up, but if the DMA engine consumes
> > > pages you could consider using paged RX instead of the memcpy().
> > 
> > The DMA engine can concatenate multiple frames into a single USB bulk
> > transfer to a large contiguous buffer.  There is no way to request
> > any alignment of the frames within that large buffer, so I think paged
> > RX is not an option.
> 
> You could probably still do it because the networking stack only
> requires the *headers* to be aligned. But if the headers aren't in the
> skb->data (skb->head), then they'll be pulled into that by the network
> stack, where they'll be aligned.

I *think* I got it.  Would you care to take a look?  I basically
allocate a compound page with dev_alloc_pages() and run get_page() for
every fragment.

------>8-----------------------------------



Comments

Johannes Berg May 5, 2015, 6:45 a.m. UTC | #1
On Tue, 2015-05-05 at 02:49 +0200, Jakub Kiciński wrote:
> On Mon, 04 May 2015 12:15:11 +0200, Johannes Berg wrote:
> > On Mon, 2015-05-04 at 12:04 +0200, Jakub Kiciński wrote:
> > 
> > > > Don't know how your buffers are set up, but if the DMA engine consumes
> > > > pages you could consider using paged RX instead of the memcpy().
> > > 
> > > The DMA engine can concatenate multiple frames into a single USB bulk
> > > transfer to a large contiguous buffer.  There is no way to request
> > > any alignment of the frames within that large buffer, so I think paged
> > > RX is not an option.
> > 
> > You could probably still do it because the networking stack only
> > requires the *headers* to be aligned. But if the headers aren't in the
> > skb->data (skb->head), then they'll be pulled into that by the network
> > stack, where they'll be aligned.
> 
> I *think* I got it.  Would you care to take a look?  I basically
> allocate a compound page with dev_alloc_pages() and run get_page() for
> every fragment.

I think it looks fine - except for the compound page part. I'm pretty sure
you always need to take the references with the pointer to the first page
(and consequently you don't need to split on the page boundary; just use
the same page pointer with a bigger offset).

> +static void mt7601u_rx_add_frags(struct mt7601u_dev *dev, struct sk_buff *skb,
> +				 void *data, u32 true_len, u32 truesize,
> +				 struct page *p)
> +{
> +	unsigned long addr = (unsigned long) data;
> +	u32 overpage = 0;
> +	u32 p_off = (data - page_address(p)) >> PAGE_SHIFT;

i.e. you shouldn't need p_off

> +	if ((addr & ~PAGE_MASK) + true_len > PAGE_SIZE)
> +		overpage = (addr & ~PAGE_MASK) + true_len - PAGE_SIZE;

nor overpage

> +	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p + p_off,
> +			addr & ~PAGE_MASK, true_len - overpage, truesize);
> +	get_page(p + p_off);

Here just use

	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p,
			data - page_address(p), true_len, truesize);
	get_page(p);

I believe.

> +	/* Do we really need to split the buffer if it crosses a page boundary?
> +	 * Does networking code care?
> +	 */
> +	if (overpage) {
> +		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p + p_off + 1,
> +				0, overpage, 0);
> +		get_page(p + p_off + 1);
> +	}
> +}

and this goes away
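
Putting those comments together, a minimal sketch of the simplified helper
(hypothetical, not the final code - it only shows adding a single fragment
with the compound-page pointer) could be:

	/* Sketch: one fragment per frame, always referencing the compound
	 * page itself, so no page-boundary split is needed.
	 */
	static void mt7601u_rx_add_frags(struct mt7601u_dev *dev,
					 struct sk_buff *skb, void *data,
					 u32 true_len, u32 truesize,
					 struct page *p)
	{
		/* offset of the frame within the (possibly compound) page */
		u32 offset = data - page_address(p);

		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p, offset,
				true_len, truesize);
		get_page(p);	/* the skb now holds its own page reference */
	}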

> +static void mt7601u_rx_process_seg_paged(struct mt7601u_dev *dev, u8 *data,
> +					 u32 seg_len, struct page *p)

[...]

> +	skb = __netdev_alloc_skb(NULL, 256, GFP_ATOMIC);
> +	if (!skb)
> +		return;

We have a comment on this code (from Eric) saying

        /* Dont use dev_alloc_skb(), we'll have enough headroom once
         * ieee80211_hdr pulled.
         */

which probably applies here as well. We also just use 128 instead of 256
and haven't seen a need for more.

We also pull the entire frame only if it fits into those 128 bytes, and
we preload the 802.11 header + 8 bytes for SNAP and possibly crypto
headroom so we can do it all at once instead of later in multiple
places. You can check out iwl_mvm_pass_packet_to_mac80211() to see the
details.
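
A rough sketch of that scheme (hypothetical helper and name, not the
iwlwifi code; it only uses the 128-byte threshold and the header + 8 bytes
mentioned above, plus the driver's own ieee80211_get_hdrlen_from_buf()
helper from dma.c) could be:

	static void mt7601u_rx_split(struct sk_buff *skb, u8 *data,
				     u32 true_len, u32 truesize,
				     struct page *p)
	{
		u32 hdr_len = ieee80211_get_hdrlen_from_buf(data, true_len);
		u32 copy_len;

		if (true_len <= 128)
			copy_len = true_len;	/* small frame: copy it all */
		else
			copy_len = hdr_len + 8;	/* header + room for SNAP */

		memcpy(skb_put(skb, copy_len), data, copy_len);

		if (true_len > copy_len) {
			/* the rest of the frame stays in the page */
			skb_add_rx_frag(skb, 0, p,
					data + copy_len - page_address(p),
					true_len - copy_len, truesize);
			get_page(p);
		}
	}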

johannes


Patch

diff --git a/dma.c b/dma.c
index e865cf8384a5..a92993a64aeb 100644
--- a/dma.c
+++ b/dma.c
@@ -16,6 +16,9 @@ 
 #include "usb.h"
 #include "trace.h"
 
+static int mt7601u_submit_rx_buf(struct mt7601u_dev *dev,
+				 struct mt7601u_dma_buf_rx *e, gfp_t gfp);
+
 static unsigned int ieee80211_get_hdrlen_from_buf(const u8 *data, unsigned len)
 {
 	const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)data;
@@ -60,7 +63,7 @@  mt7601u_rx_process_seg(struct mt7601u_dev *dev, u8 *data, u32 seg_len)
 {
 	struct sk_buff *skb;
 	struct mt7601u_rxwi *rxwi;
-	u32 fce_info;
+	u32 true_len, fce_info;
 
 	/* DMA_INFO field at the beginning of the segment contains only some of
 	 * the information, we need to read the FCE descriptor from the end.
@@ -86,11 +89,81 @@  mt7601u_rx_process_seg(struct mt7601u_dev *dev, u8 *data, u32 seg_len)
 	if (!skb)
 		return;
 
-	if (mt76_mac_process_rx(dev, skb, rxwi)) {
-		dev_kfree_skb(skb);
-		return;
+	true_len = mt76_mac_process_rx(dev, skb, skb->data, rxwi);
+	skb_trim(skb, true_len);
+
+	ieee80211_rx_ni(dev->hw, skb);
+}
+
+static void mt7601u_rx_add_frags(struct mt7601u_dev *dev, struct sk_buff *skb,
+				 void *data, u32 true_len, u32 truesize,
+				 struct page *p)
+{
+	unsigned long addr = (unsigned long) data;
+	u32 overpage = 0;
+	u32 p_off = (data - page_address(p)) >> PAGE_SHIFT;
+
+	if ((addr & ~PAGE_MASK) + true_len > PAGE_SIZE)
+		overpage = (addr & ~PAGE_MASK) + true_len - PAGE_SIZE;
+
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p + p_off,
+			addr & ~PAGE_MASK, true_len - overpage, truesize);
+	get_page(p + p_off);
+
+	/* Do we really need to split the buffer if it crosses a page boundary?
+	 * Does networking code care?
+	 */
+	if (overpage) {
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, p + p_off + 1,
+				0, overpage, 0);
+		get_page(p + p_off + 1);
+	}
+}
+
+static void mt7601u_rx_process_seg_paged(struct mt7601u_dev *dev, u8 *data,
+					 u32 seg_len, struct page *p)
+{
+	struct sk_buff *skb;
+	struct mt7601u_rxwi *rxwi;
+	u32 true_len, fce_info, truesize = seg_len;
+
+	/* DMA_INFO field at the beginning of the segment contains only some of
+	 * the information, we need to read the FCE descriptor from the end.
+	 */
+	fce_info = get_unaligned_le32(data + seg_len - MT_FCE_INFO_LEN);
+	seg_len -= MT_FCE_INFO_LEN;
+
+	data += MT_DMA_HDR_LEN;
+	seg_len -= MT_DMA_HDR_LEN;
+
+	rxwi = (struct mt7601u_rxwi *) data;
+	data += sizeof(struct mt7601u_rxwi);
+	seg_len -= sizeof(struct mt7601u_rxwi);
+
+	if (unlikely(rxwi->zero[0] || rxwi->zero[1] || rxwi->zero[2]))
+		dev_err_once(dev->dev, "Error: RXWI zero fields are set\n");
+	if (unlikely(MT76_GET(MT_RXD_INFO_TYPE, fce_info)))
+		dev_err_once(dev->dev, "Error: RX path seen a non-pkt urb\n");
+
+	trace_mt_rx(dev, rxwi, fce_info);
+
+	if (rxwi->rxinfo & cpu_to_le32(MT_RXINFO_L2PAD)) {
+		int hdr_len = ieee80211_get_hdrlen_from_buf(data, seg_len);
+
+		memmove(data + 2, data, hdr_len);
+		seg_len -= 2;
+		data += 2;
 	}
 
+	skb = __netdev_alloc_skb(NULL, 256, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	true_len = mt76_mac_process_rx(dev, skb, data, rxwi);
+	WARN_ON(true_len > seg_len || true_len < seg_len - 3);
+
+	mt7601u_rx_add_frags(dev, skb, data, true_len, truesize, p);
+
 	ieee80211_rx_ni(dev->hw, skb);
 }
 
@@ -110,17 +183,31 @@  static u16 mt7601u_rx_next_seg_len(u8 *data, u32 data_len)
 }
 
 static void
-mt7601u_rx_process_entry(struct mt7601u_dev *dev, struct mt7601u_dma_buf *e)
+mt7601u_rx_process_entry(struct mt7601u_dev *dev, struct mt7601u_dma_buf_rx *e)
 {
 	u32 seg_len, data_len = e->urb->actual_length;
-	u8 *data = e->buf;
+	u8 *data = page_address(e->p);
+	struct page *new_p = NULL;
+	bool paged = true;
 	int cnt = 0;
 
 	if (!test_bit(MT7601U_STATE_INITIALIZED, &dev->state))
 		return;
 
+	/* Copy if there is very little data in the buffer. */
+	if (data_len < 512) {
+		paged = false;
+	} else {
+		new_p = dev_alloc_pages(MT_RX_ORDER);
+		if (!new_p)
+			paged = false;
+	}
+
 	while ((seg_len = mt7601u_rx_next_seg_len(data, data_len))) {
-		mt7601u_rx_process_seg(dev, data, seg_len);
+		if (paged)
+			mt7601u_rx_process_seg_paged(dev, data, seg_len, e->p);
+		else
+			mt7601u_rx_process_seg(dev, data, seg_len);
 
 		data_len -= seg_len;
 		data += seg_len;
@@ -128,14 +215,21 @@  mt7601u_rx_process_entry(struct mt7601u_dev *dev, struct mt7601u_dma_buf *e)
 	}
 
 	if (cnt > 1)
-		trace_mt_rx_dma_aggr(dev, cnt);
+		trace_mt_rx_dma_aggr(dev, cnt, paged);
+
+	if (paged) {
+		/* we have one extra ref from the allocator */
+		put_page(e->p);
+
+		e->p = new_p;
+	}
 }
 
-static struct mt7601u_dma_buf *
+static struct mt7601u_dma_buf_rx *
 mt7601u_rx_get_pending_entry(struct mt7601u_dev *dev)
 {
 	struct mt7601u_rx_queue *q = &dev->rx_q;
-	struct mt7601u_dma_buf *buf = NULL;
+	struct mt7601u_dma_buf_rx *buf = NULL;
 	unsigned long flags;
 
 	spin_lock_irqsave(&dev->rx_lock, flags);
@@ -175,15 +269,14 @@  out:
 static void mt7601u_rx_tasklet(unsigned long data)
 {
 	struct mt7601u_dev *dev = (struct mt7601u_dev *) data;
-	struct mt7601u_dma_buf *e;
+	struct mt7601u_dma_buf_rx *e;
 
 	while ((e = mt7601u_rx_get_pending_entry(dev))) {
 		if (e->urb->status)
 			continue;
 
 		mt7601u_rx_process_entry(dev, e);
-		mt7601u_usb_submit_buf(dev, USB_DIR_IN, MT_EP_IN_PKT_RX, e,
-				       GFP_ATOMIC, mt7601u_complete_rx, dev);
+		mt7601u_submit_rx_buf(dev, e, GFP_ATOMIC);
 	}
 }
 
@@ -325,14 +418,33 @@  static void mt7601u_kill_rx(struct mt7601u_dev *dev)
 	spin_unlock_irqrestore(&dev->rx_lock, flags);
 }
 
+static int mt7601u_submit_rx_buf(struct mt7601u_dev *dev,
+				 struct mt7601u_dma_buf_rx *e, gfp_t gfp)
+{
+	struct usb_device *usb_dev = mt7601u_to_usb_dev(dev);
+	u8 *buf = page_address(e->p);
+	unsigned pipe;
+	int ret;
+
+	pipe = usb_rcvbulkpipe(usb_dev, dev->in_eps[MT_EP_IN_PKT_RX]);
+
+	usb_fill_bulk_urb(e->urb, usb_dev, pipe, buf, MT_RX_URB_SIZE,
+			  mt7601u_complete_rx, dev);
+
+	trace_mt_submit_urb(dev, e->urb);
+	ret = usb_submit_urb(e->urb, gfp);
+	if (ret)
+		dev_err(dev->dev, "Error: submit RX URB failed:%d\n", ret);
+
+	return ret;
+}
+
 static int mt7601u_submit_rx(struct mt7601u_dev *dev)
 {
 	int i, ret;
 
 	for (i = 0; i < dev->rx_q.entries; i++) {
-		ret = mt7601u_usb_submit_buf(dev, USB_DIR_IN, MT_EP_IN_PKT_RX,
-					     &dev->rx_q.e[i], GFP_KERNEL,
-					     mt7601u_complete_rx, dev);
+		ret = mt7601u_submit_rx_buf(dev, &dev->rx_q.e[i], GFP_KERNEL);
 		if (ret)
 			return ret;
 	}
@@ -344,8 +456,10 @@  static void mt7601u_free_rx(struct mt7601u_dev *dev)
 {
 	int i;
 
-	for (i = 0; i < dev->rx_q.entries; i++)
-		mt7601u_usb_free_buf(dev, &dev->rx_q.e[i]);
+	for (i = 0; i < dev->rx_q.entries; i++) {
+		__free_pages(dev->rx_q.e[i].p, MT_RX_ORDER);
+		usb_free_urb(dev->rx_q.e[i].urb);
+	}
 }
 
 static int mt7601u_alloc_rx(struct mt7601u_dev *dev)
@@ -356,9 +470,13 @@  static int mt7601u_alloc_rx(struct mt7601u_dev *dev)
 	dev->rx_q.dev = dev;
 	dev->rx_q.entries = N_RX_ENTRIES;
 
-	for (i = 0; i < N_RX_ENTRIES; i++)
-		if (mt7601u_usb_alloc_buf(dev, MT_RX_URB_SIZE, &dev->rx_q.e[i]))
+	for (i = 0; i < N_RX_ENTRIES; i++) {
+		dev->rx_q.e[i].urb = usb_alloc_urb(0, GFP_KERNEL);
+		dev->rx_q.e[i].p = dev_alloc_pages(MT_RX_ORDER);
+
+		if (!dev->rx_q.e[i].urb || !dev->rx_q.e[i].p)
 			return -ENOMEM;
+	}
 
 	return 0;
 }
diff --git a/mac.c b/mac.c
index 539751362b41..8802366f7181 100644
--- a/mac.c
+++ b/mac.c
@@ -441,30 +441,28 @@  mt7601u_rx_monitor_beacon(struct mt7601u_dev *dev, struct mt7601u_rxwi *rxwi,
 }
 
 static int
-mt7601u_rx_is_our_beacon(struct mt7601u_dev *dev, struct sk_buff *skb)
+mt7601u_rx_is_our_beacon(struct mt7601u_dev *dev, u8 *data)
 {
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)data;
 
 	return ieee80211_is_beacon(hdr->frame_control) &&
 		ether_addr_equal(hdr->addr2, dev->ap_bssid);
 }
 
-int mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb, void *rxi)
+u32 mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb,
+			u8 *data, void *rxi)
 {
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	struct mt7601u_rxwi *rxwi = rxi;
 	u32 ctl = le32_to_cpu(rxwi->ctl);
 	u16 rate = le16_to_cpu(rxwi->rate);
-	int len, rssi;
+	int rssi;
 
 	if (rxwi->rxinfo & cpu_to_le32(MT_RXINFO_DECRYPT)) {
 		status->flag |= RX_FLAG_DECRYPTED;
 		status->flag |= RX_FLAG_IV_STRIPPED | RX_FLAG_MMIC_STRIPPED;
 	}
 
-	len = MT76_GET(MT_RXWI_CTL_MPDU_LEN, ctl);
-	skb_trim(skb, len);
-
 	status->chains = BIT(0);
 	rssi = mt7601u_phy_get_rssi(dev, rxwi, rate);
 	status->chain_signal[0] = status->signal = rssi;
@@ -474,13 +472,13 @@  int mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb, void *rxi)
 	mt76_mac_process_rate(status, rate);
 
 	spin_lock_bh(&dev->con_mon_lock);
-	if (mt7601u_rx_is_our_beacon(dev, skb))
+	if (mt7601u_rx_is_our_beacon(dev, data))
 		mt7601u_rx_monitor_beacon(dev, rxwi, rate, rssi);
 	else if (rxwi->rxinfo & cpu_to_le32(MT_RXINFO_U2M))
 		dev->avg_rssi = (dev->avg_rssi * 15) / 16 + (rssi << 8);
 	spin_unlock_bh(&dev->con_mon_lock);
 
-	return 0;
+	return MT76_GET(MT_RXWI_CTL_MPDU_LEN, ctl);
 }
 
 static enum mt76_cipher_type
diff --git a/mac.h b/mac.h
index 90cf147f37e7..2c22d63c63a2 100644
--- a/mac.h
+++ b/mac.h
@@ -160,8 +160,8 @@  struct mt76_txwi {
 #define MT_TXWI_CTL_CHAN_CHECK_PKT	BIT(4)
 #define MT_TXWI_CTL_PIFS_REV		BIT(6)
 
-int
-mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb, void *rxwi);
+u32 mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb,
+			u8 *data, void *rxi);
 int mt76_mac_wcid_set_key(struct mt7601u_dev *dev, u8 idx,
 			  struct ieee80211_key_conf *key);
 void mt76_mac_wcid_set_rate(struct mt7601u_dev *dev, struct mt76_wcid *wcid,
diff --git a/mt7601u.h b/mt7601u.h
index f9957350a187..d6f83e03cb58 100644
--- a/mt7601u.h
+++ b/mt7601u.h
@@ -37,6 +37,7 @@ 
 #define MT_USB_AGGR_SIZE_LIMIT		21 /* * 1024B */
 #define MT_USB_AGGR_TIMEOUT		0x80 /* * 33ns */
 #define MT_RX_URB_SIZE			(24 * 1024)
+#define MT_RX_ORDER			3
 
 struct mt7601u_dma_buf {
 	struct urb *urb;
@@ -73,7 +74,10 @@  struct mac_stats {
 struct mt7601u_rx_queue {
 	struct mt7601u_dev *dev;
 
-	struct mt7601u_dma_buf e[N_RX_ENTRIES];
+	struct mt7601u_dma_buf_rx {
+		struct urb *urb;
+		struct page *p;
+	} e[N_RX_ENTRIES];
 
 	unsigned int start;
 	unsigned int end;
diff --git a/trace.h b/trace.h
index db86272401ff..289897300ef0 100644
--- a/trace.h
+++ b/trace.h
@@ -352,17 +352,20 @@  TRACE_EVENT(mt_tx_status,
 );
 
 TRACE_EVENT(mt_rx_dma_aggr,
-	TP_PROTO(struct mt7601u_dev *dev, int cnt),
-	TP_ARGS(dev, cnt),
+	TP_PROTO(struct mt7601u_dev *dev, int cnt, bool paged),
+	TP_ARGS(dev, cnt, paged),
 	TP_STRUCT__entry(
 		DEV_ENTRY
 		__field(u8, cnt)
+		__field(bool, paged)
 	),
 	TP_fast_assign(
 		DEV_ASSIGN;
 		__entry->cnt = cnt;
+		__entry->paged = paged;
 	),
-	TP_printk(DEV_PR_FMT "%d", DEV_PR_ARG, __entry->cnt)
+	TP_printk(DEV_PR_FMT "cnt:%d paged:%d",
+		  DEV_PR_ARG, __entry->cnt, __entry->paged)
 );
 
 DEFINE_EVENT(dev_simple_evt, set_key,