diff mbox

[RFC,v3] mac80211: add A-MSDU tx support

Message ID 1454851282-31464-1-git-send-email-nbd@openwrt.org (mailing list archive)
State RFC
Delegated to: Johannes Berg
Headers show

Commit Message

Felix Fietkau Feb. 7, 2016, 1:21 p.m. UTC
Requires software tx queueing support. frag_list support (for zero-copy)
is optional.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
---
 include/net/mac80211.h     |  17 +++++
 net/mac80211/agg-tx.c      |   5 ++
 net/mac80211/debugfs.c     |   2 +
 net/mac80211/ieee80211_i.h |   1 +
 net/mac80211/tx.c          | 168 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 193 insertions(+)

Comments

Emmanuel Grumbach Feb. 7, 2016, 8:16 p.m. UTC | #1
On Sun, Feb 7, 2016 at 3:21 PM, Felix Fietkau <nbd@openwrt.org> wrote:
> Requires software tx queueing support. frag_list support (for zero-copy)
> is optional.
>
> Signed-off-by: Felix Fietkau <nbd@openwrt.org>
> ---
>  include/net/mac80211.h     |  17 +++++
>  net/mac80211/agg-tx.c      |   5 ++
>  net/mac80211/debugfs.c     |   2 +
>  net/mac80211/ieee80211_i.h |   1 +
>  net/mac80211/tx.c          | 168 +++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 193 insertions(+)
>
> diff --git a/include/net/mac80211.h b/include/net/mac80211.h
> index 5714774..31dca81 100644
> --- a/include/net/mac80211.h
> +++ b/include/net/mac80211.h
> @@ -709,6 +709,7 @@ enum mac80211_tx_info_flags {
>   * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
>   *     frame (PS-Poll or uAPSD).
>   * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
> + * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
>   *
>   * These flags are used in tx_info->control.flags.
>   */
> @@ -716,6 +717,7 @@ enum mac80211_tx_control_flags {
>         IEEE80211_TX_CTRL_PORT_CTRL_PROTO       = BIT(0),
>         IEEE80211_TX_CTRL_PS_RESPONSE           = BIT(1),
>         IEEE80211_TX_CTRL_RATE_INJECT           = BIT(2),
> +       IEEE80211_TX_CTRL_AMSDU                 = BIT(3),
>  };
>
>  /*
> @@ -1964,6 +1966,12 @@ struct ieee80211_txq {
>   *     order and does not need to manage its own reorder buffer or BA session
>   *     timeout.
>   *
> + * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
> + *     A-MSDU frames. Requires software tx queueing support.
> + *
> + * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
> + *     skbs, needed for zero-copy software A-MSDU.
> + *
>   * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
>   */
>  enum ieee80211_hw_flags {
> @@ -2001,6 +2009,8 @@ enum ieee80211_hw_flags {
>         IEEE80211_HW_BEACON_TX_STATUS,
>         IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
>         IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
> +       IEEE80211_HW_TX_AMSDU,
> +       IEEE80211_HW_TX_FRAG_LIST,
>
>         /* keep last, obviously */
>         NUM_IEEE80211_HW_FLAGS
> @@ -2073,6 +2083,11 @@ enum ieee80211_hw_flags {
>   *     size is smaller (an example is LinkSys WRT120N with FW v1.0.07
>   *     build 002 Jun 18 2012).
>   *
> + * @max_tx_amsdu_subframes: maximum number of subframes used in software
> + *     A-MSDU aggregation
> + *
> + * @max_tx_fragments: maximum fragments per (A-)MSDU.
> + *
>   * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
>   *     (if %IEEE80211_HW_QUEUE_CONTROL is set)
>   *
> @@ -2127,6 +2142,8 @@ struct ieee80211_hw {
>         u8 max_rate_tries;
>         u8 max_rx_aggregation_subframes;
>         u8 max_tx_aggregation_subframes;
> +       u8 max_tx_amsdu_subframes;
> +       u8 max_tx_fragments;

I have to see I still don't understand why drivers would want to limit
the number of subframes. Limiting the number of frags should be
enough, don't you think?

>         u8 offchannel_tx_hw_queue;
>         u8 radiotap_mcs_details;
>         u16 radiotap_vht_details;
> diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
> index 4932e9f..42fa810 100644
> --- a/net/mac80211/agg-tx.c
> +++ b/net/mac80211/agg-taxx.c
> @@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
>                                   size_t len)
>  {
>         struct tid_ampdu_tx *tid_tx;
> +       struct ieee80211_txq *txq;
>         u16 capab, tid;
>         u8 buf_size;
>         bool amsdu;
> @@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
>         buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
>         buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
>
> +       txq = sta->sta.txq[tid];
> +       if (!amsdu && txq)
> +               set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
> +
>         mutex_lock(&sta->ampdu_mlme.mtx);
>
>         tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
> diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
> index e433d0c..847779d 100644
> --- a/net/mac80211/debugfs.c
> +++ b/net/mac80211/debugfs.c
> @@ -127,6 +127,8 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
>         FLAG(BEACON_TX_STATUS),
>         FLAG(NEEDS_UNIQUE_STA_ADDR),
>         FLAG(SUPPORTS_REORDERING_BUFFER),
> +       FLAG(TX_AMSDU),
> +       FLAG(TX_FRAG_LIST),
>
>         /* keep last for the build bug below */
>         (void *)0x1
> diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
> index a49c103..e68d8db 100644
> --- a/net/mac80211/ieee80211_i.h
> +++ b/net/mac80211/ieee80211_i.h
> @@ -799,6 +799,7 @@ struct mac80211_qos_map {
>  enum txq_info_flags {
>         IEEE80211_TXQ_STOP,
>         IEEE80211_TXQ_AMPDU,
> +       IEEE80211_TXQ_NO_AMSDU,
>  };
>
>  struct txq_info {
> diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
> index a5aa275..f37f729 100644
> --- a/net/mac80211/tx.c
> +++ b/net/mac80211/tx.c
> @@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>  out:
>         spin_unlock_bh(&txqi->queue.lock);
>
> +       if (skb && skb_has_frag_list(skb) &&
> +           !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
> +               skb_linearize(skb);
> +
>         return skb;
>  }
>  EXPORT_SYMBOL(ieee80211_tx_dequeue);
> @@ -2763,6 +2767,166 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
>                 kfree_rcu(fast_tx, rcu_head);
>  }
>
> +static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
> +{
> +       int amsdu_len = subframe_len + sizeof(struct ethhdr);
> +       int padding = (4 - amsdu_len) & 3;
> +
> +       if (padding)
> +               memset(skb_put(skb, padding), 0, padding);
> +
> +       return padding;
> +}
> +
> +static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
> +                                        struct ieee80211_fast_tx *fast_tx,
> +                                        struct sk_buff *skb)
> +{
> +       struct ieee80211_local *local = sdata->local;
> +       struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +       struct ieee80211_hdr *hdr;
> +       struct ethhdr amsdu_hdr;
> +       int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
> +       int subframe_len = skb->len - hdr_len;
> +       void *data;
> +       u8 *qc;
> +
> +       if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
> +               return true;
> +
> +       if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
> +               I802_DEBUG_INC(local->tx_expand_skb_head);
> +
> +               if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
> +                       wiphy_debug(local->hw.wiphy,
> +                                   "failed to reallocate TX buffer\n");
> +                       return false;
> +               }
> +       }
> +
> +       subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
> +
> +       amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
> +       memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
> +       memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
> +
> +       data = skb_push(skb, sizeof(amsdu_hdr));
> +       memmove(data, data + sizeof(amsdu_hdr), hdr_len);
> +       memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
> +
> +       hdr = data;
> +       qc = ieee80211_get_qos_ctl(hdr);
> +       *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
> +
> +       info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
> +
> +       return true;
> +}
> +
> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
> +                                     struct sta_info *sta,
> +                                     struct ieee80211_fast_tx *fast_tx,
> +                                     struct sk_buff *skb)
> +{
> +       struct ieee80211_local *local = sdata->local;
> +       u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
> +       struct ieee80211_txq *txq = sta->sta.txq[tid];
> +       struct txq_info *txqi;
> +       struct sk_buff **frag_tail, *head;
> +       int subframe_len = skb->len - ETH_ALEN;
> +       u8 max_subframes = 0xff;
> +       int max_frags = local->hw.max_tx_fragments;
> +       int max_amsdu_len;
> +       __be16 len;
> +       void *data;
> +       bool ret = false;
> +       int n = 1, nfrags;
> +
> +       if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
> +               return false;
> +
> +       if (!txq)
> +               return false;
> +
> +       txqi = to_txq_info(txq);
> +       if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
> +               return false;
> +
> +       if (sta->sta.max_amsdu_subframes)
> +               max_subframes = min(max_subframes,
> +                                   sta->sta.max_amsdu_subframes);
> +
> +       if (local->hw.max_tx_amsdu_subframes)
> +               max_subframes = min(max_subframes,
> +                                   local->hw.max_tx_amsdu_subframes);
> +
> +       spin_lock_bh(&txqi->queue.lock);
> +
> +       head = skb_peek_tail(&txqi->queue);
> +       if (!head)
> +               goto out;
> +
> +       if (skb->len + head->len > max_amsdu_len)
> +               goto out;
> +
> +       /*
> +        * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
> +        * sessions are started/stopped without txq flush, use the limit here
> +        * to avoid having to de-aggregate later.
> +        */
> +       if (skb->len + head->len > 4095 &&
> +           !sta->sta.vht_cap.vht_supported)
> +               goto out;
> +
> +       if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
> +               goto out;
> +
> +       nfrags = 1 + skb_shinfo(skb)->nr_frags;
> +       nfrags += 1 + skb_shinfo(head)->nr_frags;
> +       frag_tail = &skb_shinfo(head)->frag_list;
> +       while (*frag_tail) {
> +               nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
> +           frag_tail = &(*frag_tail)->next;
> +               n++;
> +       }
> +
> +       if (max_subframes && n > max_subframes)

max_subframes can't be 0. You set it to 0xff at the beginning.

> +               goto out;
> +
> +       if (max_frags && nfrags > max_frags)
> +               goto out;
> +
> +       if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
> +               I802_DEBUG_INC(local->tx_expand_skb_head);
> +
> +               if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
> +                       wiphy_debug(local->hw.wiphy,
> +                                   "failed to reallocate TX buffer\n");
> +                       goto out;
> +               }
> +       }
> +
> +       subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
> +
> +       ret = true;
> +       data = skb_push(skb, ETH_ALEN + 2);
> +       memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
> +
> +       data += 2 * ETH_ALEN;
> +       len = cpu_to_be16(subframe_len);
> +       memcpy(data, &len, 2);
> +       memcpy(data + 2, rfc1042_header, ETH_ALEN);
> +
> +       head->len += skb->len;
> +       head->data_len += skb->len;
> +       *frag_tail = skb;
> +
> +out:
> +       spin_unlock_bh(&txqi->queue.lock);
> +
> +       return ret;
> +}
> +
>  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>                                 struct net_device *dev, struct sta_info *sta,
>                                 struct ieee80211_fast_tx *fast_tx,
> @@ -2817,6 +2981,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
>
>         ieee80211_tx_stats(dev, skb->len + extra_head);
>
> +       if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
> +           ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
> +               return true;
> +
>         /* will not be crypto-handled beyond what we do here, so use false
>          * as the may-encrypt argument for the resize to not account for
>          * more room than we already have in 'extra_head'
> --
> 2.2.2
>
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Feb. 7, 2016, 9:03 p.m. UTC | #2
On 2016-02-07 21:16, Emmanuel Grumbach wrote:
> On Sun, Feb 7, 2016 at 3:21 PM, Felix Fietkau <nbd@openwrt.org> wrote:
>> Requires software tx queueing support. frag_list support (for zero-copy)
>> is optional.
>>
>> Signed-off-by: Felix Fietkau <nbd@openwrt.org>
>> ---
>>  include/net/mac80211.h     |  17 +++++
>>  net/mac80211/agg-tx.c      |   5 ++
>>  net/mac80211/debugfs.c     |   2 +
>>  net/mac80211/ieee80211_i.h |   1 +
>>  net/mac80211/tx.c          | 168 +++++++++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 193 insertions(+)
>>
>> diff --git a/include/net/mac80211.h b/include/net/mac80211.h
>> index 5714774..31dca81 100644
>> --- a/include/net/mac80211.h
>> +++ b/include/net/mac80211.h
>> @@ -2127,6 +2142,8 @@ struct ieee80211_hw {
>>         u8 max_rate_tries;
>>         u8 max_rx_aggregation_subframes;
>>         u8 max_tx_aggregation_subframes;
>> +       u8 max_tx_amsdu_subframes;
>> +       u8 max_tx_fragments;
> 
> I have to see I still don't understand why drivers would want to limit
> the number of subframes. Limiting the number of frags should be
> enough, don't you think?
Makes sense, I'll remove it.

>>         u8 offchannel_tx_hw_queue;
>>         u8 radiotap_mcs_details;
>>         u16 radiotap_vht_details;

>> diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
>> index a5aa275..f37f729 100644
>> --- a/net/mac80211/tx.c
>> +++ b/net/mac80211/tx.c
>> @@ -1324,6 +1324,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
>>  out:
>>         spin_unlock_bh(&txqi->queue.lock);
>>
>> +       if (skb && skb_has_frag_list(skb) &&
>> +           !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
>> +               skb_linearize(skb);
>> +
>>         return skb;
>>  }
>>  EXPORT_SYMBOL(ieee80211_tx_dequeue);
>> @@ -2763,6 +2767,166 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
>>                 kfree_rcu(fast_tx, rcu_head);
>>  }
>>
>> +static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
>> +{
>> +       int amsdu_len = subframe_len + sizeof(struct ethhdr);
>> +       int padding = (4 - amsdu_len) & 3;
>> +
>> +       if (padding)
>> +               memset(skb_put(skb, padding), 0, padding);
>> +
>> +       return padding;
>> +}
>> +
>> +static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
>> +                                        struct ieee80211_fast_tx *fast_tx,
>> +                                        struct sk_buff *skb)
>> +{
>> +       struct ieee80211_local *local = sdata->local;
>> +       struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>> +       struct ieee80211_hdr *hdr;
>> +       struct ethhdr amsdu_hdr;
>> +       int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
>> +       int subframe_len = skb->len - hdr_len;
>> +       void *data;
>> +       u8 *qc;
>> +
>> +       if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
>> +               return true;
>> +
>> +       if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
>> +               I802_DEBUG_INC(local->tx_expand_skb_head);
>> +
>> +               if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
>> +                       wiphy_debug(local->hw.wiphy,
>> +                                   "failed to reallocate TX buffer\n");
>> +                       return false;
>> +               }
>> +       }
>> +
>> +       subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
>> +
>> +       amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
>> +       memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
>> +       memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
>> +
>> +       data = skb_push(skb, sizeof(amsdu_hdr));
>> +       memmove(data, data + sizeof(amsdu_hdr), hdr_len);
>> +       memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
>> +
>> +       hdr = data;
>> +       qc = ieee80211_get_qos_ctl(hdr);
>> +       *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
>> +
>> +       info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
>> +
>> +       return true;
>> +}
>> +
>> +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
>> +                                     struct sta_info *sta,
>> +                                     struct ieee80211_fast_tx *fast_tx,
>> +                                     struct sk_buff *skb)
>> +{
>> +       struct ieee80211_local *local = sdata->local;
>> +       u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
>> +       struct ieee80211_txq *txq = sta->sta.txq[tid];
>> +       struct txq_info *txqi;
>> +       struct sk_buff **frag_tail, *head;
>> +       int subframe_len = skb->len - ETH_ALEN;
>> +       u8 max_subframes = 0xff;
>> +       int max_frags = local->hw.max_tx_fragments;
>> +       int max_amsdu_len;
>> +       __be16 len;
>> +       void *data;
>> +       bool ret = false;
>> +       int n = 1, nfrags;
>> +
>> +       if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
>> +               return false;
>> +
>> +       if (!txq)
>> +               return false;
>> +
>> +       txqi = to_txq_info(txq);
>> +       if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
>> +               return false;
>> +
>> +       if (sta->sta.max_amsdu_subframes)
>> +               max_subframes = min(max_subframes,
>> +                                   sta->sta.max_amsdu_subframes);
>> +
>> +       if (local->hw.max_tx_amsdu_subframes)
>> +               max_subframes = min(max_subframes,
>> +                                   local->hw.max_tx_amsdu_subframes);
>> +
>> +       spin_lock_bh(&txqi->queue.lock);
>> +
>> +       head = skb_peek_tail(&txqi->queue);
>> +       if (!head)
>> +               goto out;
>> +
>> +       if (skb->len + head->len > max_amsdu_len)
>> +               goto out;
>> +
>> +       /*
>> +        * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
>> +        * sessions are started/stopped without txq flush, use the limit here
>> +        * to avoid having to de-aggregate later.
>> +        */
>> +       if (skb->len + head->len > 4095 &&
>> +           !sta->sta.vht_cap.vht_supported)
>> +               goto out;
>> +
>> +       if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
>> +               goto out;
>> +
>> +       nfrags = 1 + skb_shinfo(skb)->nr_frags;
>> +       nfrags += 1 + skb_shinfo(head)->nr_frags;
>> +       frag_tail = &skb_shinfo(head)->frag_list;
>> +       while (*frag_tail) {
>> +               nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
>> +           frag_tail = &(*frag_tail)->next;
>> +               n++;
>> +       }
>> +
>> +       if (max_subframes && n > max_subframes)
> 
> max_subframes can't be 0. You set it to 0xff at the beginning.
Right, thanks.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5714774..31dca81 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -709,6 +709,7 @@  enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
+ * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -716,6 +717,7 @@  enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PORT_CTRL_PROTO	= BIT(0),
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
+	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 };
 
 /*
@@ -1964,6 +1966,12 @@  struct ieee80211_txq {
  *	order and does not need to manage its own reorder buffer or BA session
  *	timeout.
  *
+ * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
+ *	A-MSDU frames. Requires software tx queueing support.
+ *
+ * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
+ *	skbs, needed for zero-copy software A-MSDU.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2001,6 +2009,8 @@  enum ieee80211_hw_flags {
 	IEEE80211_HW_BEACON_TX_STATUS,
 	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
 	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
+	IEEE80211_HW_TX_AMSDU,
+	IEEE80211_HW_TX_FRAG_LIST,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -2073,6 +2083,11 @@  enum ieee80211_hw_flags {
  *	size is smaller (an example is LinkSys WRT120N with FW v1.0.07
  *	build 002 Jun 18 2012).
  *
+ * @max_tx_amsdu_subframes: maximum number of subframes used in software
+ *	A-MSDU aggregation
+ *
+ * @max_tx_fragments: maximum fragments per (A-)MSDU.
+ *
  * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
  *	(if %IEEE80211_HW_QUEUE_CONTROL is set)
  *
@@ -2127,6 +2142,8 @@  struct ieee80211_hw {
 	u8 max_rate_tries;
 	u8 max_rx_aggregation_subframes;
 	u8 max_tx_aggregation_subframes;
+	u8 max_tx_amsdu_subframes;
+	u8 max_tx_fragments;
 	u8 offchannel_tx_hw_queue;
 	u8 radiotap_mcs_details;
 	u16 radiotap_vht_details;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f..42fa810 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -935,6 +935,7 @@  void ieee80211_process_addba_resp(struct ieee80211_local *local,
 				  size_t len)
 {
 	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_txq *txq;
 	u16 capab, tid;
 	u8 buf_size;
 	bool amsdu;
@@ -945,6 +946,10 @@  void ieee80211_process_addba_resp(struct ieee80211_local *local,
 	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
 	buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
 
+	txq = sta->sta.txq[tid];
+	if (!amsdu && txq)
+		set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
+
 	mutex_lock(&sta->ampdu_mlme.mtx);
 
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e433d0c..847779d 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -127,6 +127,8 @@  static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
 	FLAG(BEACON_TX_STATUS),
 	FLAG(NEEDS_UNIQUE_STA_ADDR),
 	FLAG(SUPPORTS_REORDERING_BUFFER),
+	FLAG(TX_AMSDU),
+	FLAG(TX_FRAG_LIST),
 
 	/* keep last for the build bug below */
 	(void *)0x1
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index a49c103..e68d8db 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -799,6 +799,7 @@  struct mac80211_qos_map {
 enum txq_info_flags {
 	IEEE80211_TXQ_STOP,
 	IEEE80211_TXQ_AMPDU,
+	IEEE80211_TXQ_NO_AMSDU,
 };
 
 struct txq_info {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a5aa275..f37f729 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1324,6 +1324,10 @@  struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 out:
 	spin_unlock_bh(&txqi->queue.lock);
 
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
 	return skb;
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
@@ -2763,6 +2767,166 @@  void ieee80211_clear_fast_xmit(struct sta_info *sta)
 		kfree_rcu(fast_tx, rcu_head);
 }
 
+static int ieee80211_amsdu_pad(struct sk_buff *skb, int subframe_len)
+{
+	int amsdu_len = subframe_len + sizeof(struct ethhdr);
+	int padding = (4 - amsdu_len) & 3;
+
+	if (padding)
+		memset(skb_put(skb, padding), 0, padding);
+
+	return padding;
+}
+
+static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_fast_tx *fast_tx,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr;
+	struct ethhdr amsdu_hdr;
+	int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header);
+	int subframe_len = skb->len - hdr_len;
+	void *data;
+	u8 *qc;
+
+	if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
+		return true;
+
+	if (skb_headroom(skb) < sizeof(amsdu_hdr) || skb_tailroom(skb) < 3) {
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+
+		if (pskb_expand_head(skb, sizeof(amsdu_hdr), 3, GFP_ATOMIC)) {
+			wiphy_debug(local->hw.wiphy,
+				    "failed to reallocate TX buffer\n");
+			return false;
+		}
+	}
+
+	subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+	amsdu_hdr.h_proto = cpu_to_be16(subframe_len);
+	memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN);
+	memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN);
+
+	data = skb_push(skb, sizeof(amsdu_hdr));
+	memmove(data, data + sizeof(amsdu_hdr), hdr_len);
+	memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr));
+
+	hdr = data;
+	qc = ieee80211_get_qos_ctl(hdr);
+	*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+
+	info->control.flags |= IEEE80211_TX_CTRL_AMSDU;
+
+	return true;
+}
+
+static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
+				      struct sta_info *sta,
+				      struct ieee80211_fast_tx *fast_tx,
+				      struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+	struct ieee80211_txq *txq = sta->sta.txq[tid];
+	struct txq_info *txqi;
+	struct sk_buff **frag_tail, *head;
+	int subframe_len = skb->len - ETH_ALEN;
+	u8 max_subframes = 0xff;
+	int max_frags = local->hw.max_tx_fragments;
+	int max_amsdu_len;
+	__be16 len;
+	void *data;
+	bool ret = false;
+	int n = 1, nfrags;
+
+	if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
+		return false;
+
+	if (!txq)
+		return false;
+
+	txqi = to_txq_info(txq);
+	if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags))
+		return false;
+
+	if (sta->sta.max_amsdu_subframes)
+		max_subframes = min(max_subframes,
+				    sta->sta.max_amsdu_subframes);
+
+	if (local->hw.max_tx_amsdu_subframes)
+		max_subframes = min(max_subframes,
+				    local->hw.max_tx_amsdu_subframes);
+
+	spin_lock_bh(&txqi->queue.lock);
+
+	head = skb_peek_tail(&txqi->queue);
+	if (!head)
+		goto out;
+
+	if (skb->len + head->len > max_amsdu_len)
+		goto out;
+
+	/*
+	 * HT A-MPDU limits maximum MPDU size to 4095 bytes. Since aggregation
+	 * sessions are started/stopped without txq flush, use the limit here
+	 * to avoid having to de-aggregate later.
+	 */
+	if (skb->len + head->len > 4095 &&
+	    !sta->sta.vht_cap.vht_supported)
+		goto out;
+
+	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
+		goto out;
+
+	nfrags = 1 + skb_shinfo(skb)->nr_frags;
+	nfrags += 1 + skb_shinfo(head)->nr_frags;
+	frag_tail = &skb_shinfo(head)->frag_list;
+	while (*frag_tail) {
+		nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags;
+	    frag_tail = &(*frag_tail)->next;
+		n++;
+	}
+
+	if (max_subframes && n > max_subframes)
+		goto out;
+
+	if (max_frags && nfrags > max_frags)
+		goto out;
+
+	if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 3) {
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+
+		if (pskb_expand_head(skb, 8, 3, GFP_ATOMIC)) {
+			wiphy_debug(local->hw.wiphy,
+				    "failed to reallocate TX buffer\n");
+			goto out;
+		}
+	}
+
+	subframe_len += ieee80211_amsdu_pad(skb, subframe_len);
+
+	ret = true;
+	data = skb_push(skb, ETH_ALEN + 2);
+	memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
+
+	data += 2 * ETH_ALEN;
+	len = cpu_to_be16(subframe_len);
+	memcpy(data, &len, 2);
+	memcpy(data + 2, rfc1042_header, ETH_ALEN);
+
+	head->len += skb->len;
+	head->data_len += skb->len;
+	*frag_tail = skb;
+
+out:
+	spin_unlock_bh(&txqi->queue.lock);
+
+	return ret;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 				struct net_device *dev, struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
@@ -2817,6 +2981,10 @@  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_tx_stats(dev, skb->len + extra_head);
 
+	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
+	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
+		return true;
+
 	/* will not be crypto-handled beyond what we do here, so use false
 	 * as the may-encrypt argument for the resize to not account for
 	 * more room than we already have in 'extra_head'