Message ID | 20220510033219.2639364-14-eric.dumazet@gmail.com |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Series | tcp: BIG TCP implementation |
On 09 May 20:32, Eric Dumazet wrote:
>From: Coco Li <lixiaoyan@google.com>
>
>mlx5 supports LSOv2.
>
>IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
>with JUMBO TLV for big packets.
>
>We need to ignore/skip this HBH header when populating TX descriptor.
>

Sorry, I didn't go through all the documentation or previous discussions,
please bear with me: why not clear the HBH header just before calling the
driver xmit ndo?

Or, if the HBH header has to stick, why not provide some helpers to make
it less intrusive to the driver:

mlx5_xmit_skb() {
    skb_remove_hbh(skb);
    populate tx descriptor as usual;
    skb_restore_hbh(skb); // must be done before ringing the doorbell
    doorbell
}

>Note that ipv6_has_hopopt_jumbo() only recognizes very specific packet
>layout, thus mlx5e_sq_xmit_wqe() is taking care of this layout only.
>
>v2: clear hopbyhop in mlx5e_tx_get_gso_ihs()
>v4: fix compile error for CONFIG_MLX5_CORE_IPOIB=y
>
>Signed-off-by: Coco Li <lixiaoyan@google.com>
>Signed-off-by: Eric Dumazet <edumazet@google.com>
>Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
>Cc: Saeed Mahameed <saeedm@nvidia.com>
>Cc: Leon Romanovsky <leon@kernel.org>
>---
> .../net/ethernet/mellanox/mlx5/core/en_main.c |  1 +
> .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 84 +++++++++++++++----
> 2 files changed, 69 insertions(+), 16 deletions(-)
>
>[...]
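A minimal sketch of what such skb_remove_hbh()/skb_restore_hbh() helpers could look like is below. The names are hypothetical (taken from the pseudo-code above; nothing like this exists in the tree at this point), and the sketch assumes the exact [L2][IPv6][HBH + JUMBO TLV][TCP] linear layout that ipv6_has_hopopt_jumbo() recognizes, no other extension headers, and headers already made writable (e.g. via skb_cow_head()). It is also precisely the in-place skb modification that Eric objects to later in the thread, since tcpdump would observe the rewritten packet between remove and restore.

```c
#include <linux/skbuff.h>
#include <net/ipv6.h>

/* Hypothetical helpers sketching the suggestion above; not part of the
 * kernel.  They assume the BIG TCP layout [L2][IPv6][HBH + JUMBO TLV][TCP]
 * in the linear area and a writable header area.
 */
static void skb_remove_hbh(struct sk_buff *skb)
{
	const int hbh_len = sizeof(struct hop_jumbo_hdr);
	int l2_l3_len = skb_network_offset(skb) + sizeof(struct ipv6hdr);

	/* Slide L2 + IPv6 forward over the HBH header so the TCP header
	 * directly follows the IPv6 header; ipv6 payload_len stays 0, as
	 * LSOv2 expects.
	 */
	memmove(skb->data + hbh_len, skb->data, l2_l3_len);
	__skb_pull(skb, hbh_len);
	skb->mac_header += hbh_len;
	skb->network_header += hbh_len;

	ipv6_hdr(skb)->nexthdr = IPPROTO_TCP;
}

static void skb_restore_hbh(struct sk_buff *skb)
{
	const int hbh_len = sizeof(struct hop_jumbo_hdr);
	struct hop_jumbo_hdr *hbh;
	int l2_l3_len;

	__skb_push(skb, hbh_len);
	skb->mac_header -= hbh_len;
	skb->network_header -= hbh_len;
	l2_l3_len = skb_network_offset(skb) + sizeof(struct ipv6hdr);

	/* Slide L2 + IPv6 back and rebuild the 8-byte HBH + Jumbo TLV that
	 * skb_remove_hbh() overwrote.
	 */
	memmove(skb->data, skb->data + hbh_len, l2_l3_len);
	hbh = (struct hop_jumbo_hdr *)(skb->data + l2_l3_len);
	hbh->nexthdr = IPPROTO_TCP;
	hbh->hdrlen = 0;			/* (0 + 1) * 8 == 8 bytes */
	hbh->tlv_type = IPV6_TLV_JUMBO;
	hbh->tlv_len = 4;
	/* RFC 2675: everything after the fixed IPv6 header, HBH included */
	hbh->jumbo_payload_len = cpu_to_be32(skb->len - l2_l3_len);
	ipv6_hdr(skb)->nexthdr = NEXTHDR_HOP;
}
```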
On Thu, 2022-05-12 at 01:40 -0700, Saeed Mahameed wrote:
> On 09 May 20:32, Eric Dumazet wrote:
> > From: Coco Li <lixiaoyan@google.com>
> >
> > mlx5 supports LSOv2.
> >
> > IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
> > with JUMBO TLV for big packets.
> >
> > We need to ignore/skip this HBH header when populating TX descriptor.
> >
>
> Sorry, I didn't go through all the documentation or previous discussions,
> please bear with me: why not clear the HBH header just before calling the
> driver xmit ndo?

I guess this way is more efficient: the driver copies the IP and TCP
headers directly into their correct/final location in the tx descriptor;
otherwise the caller would have to memmove L2/L3 just before the driver
copies them again.

> Or, if the HBH header has to stick,

My understanding is that this is not the case.

Cheers,

Paolo
On 12 May 11:02, Paolo Abeni wrote:
>On Thu, 2022-05-12 at 01:40 -0700, Saeed Mahameed wrote:
>> On 09 May 20:32, Eric Dumazet wrote:
>> > From: Coco Li <lixiaoyan@google.com>
>> >
>> > mlx5 supports LSOv2.
>> >
>> > IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
>> > with JUMBO TLV for big packets.
>> >
>> > We need to ignore/skip this HBH header when populating TX descriptor.
>> >
>>
>> Sorry, I didn't go through all the documentation or previous discussions,
>> please bear with me: why not clear the HBH header just before calling the
>> driver xmit ndo?
>
>I guess this way is more efficient: the driver copies the IP and TCP
>headers directly into their correct/final location in the tx descriptor;
>otherwise the caller would have to memmove L2/L3 just before the driver
>copies them again.

memmove(sizeof(L2/L3)) is not that bad when done only every 64KB+.
It's going to be hard to repeat and maintain this across all drivers
only to get a micro-optimization that I doubt will even be measurable.

>> Or, if the HBH header has to stick,
>
>My understanding is that this is not the case.
>
>Cheers,
>
>Paolo
On Thu, May 12, 2022 at 9:29 PM Saeed Mahameed <saeedm@nvidia.com> wrote:
>
> On 12 May 11:02, Paolo Abeni wrote:
> >On Thu, 2022-05-12 at 01:40 -0700, Saeed Mahameed wrote:
> >> On 09 May 20:32, Eric Dumazet wrote:
> >> > From: Coco Li <lixiaoyan@google.com>
> >> >
> >> > mlx5 supports LSOv2.
> >> >
> >> > IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
> >> > with JUMBO TLV for big packets.
> >> >
> >> > We need to ignore/skip this HBH header when populating TX descriptor.
> >> >
> >>
> >> Sorry, I didn't go through all the documentation or previous discussions,
> >> please bear with me: why not clear the HBH header just before calling the
> >> driver xmit ndo?
> >
> >I guess this way is more efficient: the driver copies the IP and TCP
> >headers directly into their correct/final location in the tx descriptor;
> >otherwise the caller would have to memmove L2/L3 just before the driver
> >copies them again.
>
> memmove(sizeof(L2/L3)) is not that bad when done only every 64KB+.
> It's going to be hard to repeat and maintain this across all drivers
> only to get a micro-optimization that I doubt will even be measurable.

We prefer not changing skb->head, as this would break tcpdump.

Surely calling skb_cow_head() would incur a cost.

As I suggested, we can respin the series without the mlx5 patch; this
is totally fine for us, if we can avoid missing the 5.19 train.
On 12 May 21:34, Eric Dumazet wrote:
>On Thu, May 12, 2022 at 9:29 PM Saeed Mahameed <saeedm@nvidia.com> wrote:
>>
>> On 12 May 11:02, Paolo Abeni wrote:
>> >On Thu, 2022-05-12 at 01:40 -0700, Saeed Mahameed wrote:
>> >> On 09 May 20:32, Eric Dumazet wrote:
>> >> > From: Coco Li <lixiaoyan@google.com>
>> >> >
>> >> > mlx5 supports LSOv2.
>> >> >
>> >> > IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
>> >> > with JUMBO TLV for big packets.
>> >> >
>> >> > We need to ignore/skip this HBH header when populating TX descriptor.
>> >> >
>> >>
>> >> Sorry, I didn't go through all the documentation or previous discussions,
>> >> please bear with me: why not clear the HBH header just before calling the
>> >> driver xmit ndo?
>> >
>> >I guess this way is more efficient: the driver copies the IP and TCP
>> >headers directly into their correct/final location in the tx descriptor;
>> >otherwise the caller would have to memmove L2/L3 just before the driver
>> >copies them again.
>>
>> memmove(sizeof(L2/L3)) is not that bad when done only every 64KB+.
>> It's going to be hard to repeat and maintain this across all drivers
>> only to get a micro-optimization that I doubt will even be measurable.
>
>We prefer not changing skb->head, as this would break tcpdump.

In that case we can provide a helper for the drivers to call just before
they start processing the skb.

>Surely calling skb_cow_head() would incur a cost.

Sure, but the benefit of this patch outweighs that cost by orders of
magnitude: you pay an extra 0.1$ for cleaner code, and you still get
your 64K$ BIG TCP cash.

>As I suggested, we can respin the series without the mlx5 patch; this
>is totally fine for us, if we can avoid missing the 5.19 train.

To be clear, I am not nacking; Tariq already reviewed and gave his
blessing, and I won't resist this patch on v6. I am just suggesting an
improvement to code readability and scalability to other drivers.

FWIW:
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
On Thu, May 12, 2022 at 10:49 PM Saeed Mahameed <saeedm@nvidia.com> wrote:
>
> On 12 May 21:34, Eric Dumazet wrote:
> >On Thu, May 12, 2022 at 9:29 PM Saeed Mahameed <saeedm@nvidia.com> wrote:
> >>
> >> On 12 May 11:02, Paolo Abeni wrote:
> >> >On Thu, 2022-05-12 at 01:40 -0700, Saeed Mahameed wrote:
> >> >> On 09 May 20:32, Eric Dumazet wrote:
> >> >> > From: Coco Li <lixiaoyan@google.com>
> >> >> >
> >> >> > mlx5 supports LSOv2.
> >> >> >
> >> >> > IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header
> >> >> > with JUMBO TLV for big packets.
> >> >> >
> >> >> > We need to ignore/skip this HBH header when populating TX descriptor.
> >> >> >
> >> >>
> >> >> Sorry, I didn't go through all the documentation or previous discussions,
> >> >> please bear with me: why not clear the HBH header just before calling the
> >> >> driver xmit ndo?
> >> >
> >> >I guess this way is more efficient: the driver copies the IP and TCP
> >> >headers directly into their correct/final location in the tx descriptor;
> >> >otherwise the caller would have to memmove L2/L3 just before the driver
> >> >copies them again.
> >>
> >> memmove(sizeof(L2/L3)) is not that bad when done only every 64KB+.
> >> It's going to be hard to repeat and maintain this across all drivers
> >> only to get a micro-optimization that I doubt will even be measurable.
> >
> >We prefer not changing skb->head, as this would break tcpdump.
>
> In that case we can provide a helper for the drivers to call just before
> they start processing the skb.
>
> >Surely calling skb_cow_head() would incur a cost.
>
> Sure, but the benefit of this patch outweighs that cost by orders of
> magnitude: you pay an extra 0.1$ for cleaner code, and you still get
> your 64K$ BIG TCP cash.
>
> >As I suggested, we can respin the series without the mlx5 patch; this
> >is totally fine for us, if we can avoid missing the 5.19 train.
>
> To be clear, I am not nacking; Tariq already reviewed and gave his
> blessing, and I won't resist this patch on v6. I am just suggesting an
> improvement to code readability and scalability to other drivers.

The problem is that skb_cow_head() can fail.

Really, we have thought about this already.

A common helper for drivers is mostly unusable: you would have to
pre-allocate a per-TX-ring slot to store the headers, and we would end
up adding complexity at queue creation/dismantle.

We could do that later, because some NICs do not inline the headers in
the TX descriptor, but instead request one mapped buffer for the
headers part only.

BTW, I know Tariq already reviewed; the issue at hand is CONFIG_FORTIFY,
which is blocking us.

This is why I was considering not submitting the mlx5 change until Kees
Cook and others come up with a solution.
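For reference, skb_cow_head() returns 0 on success and a negative errno when it cannot re-allocate the header area, so any helper that edits the headers in place needs the usual error path in the driver's ndo_start_xmit(). A minimal illustration of that failure path (example_xmit() is a hypothetical skeleton, shown only to make the point concrete):

```c
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical ndo_start_xmit() skeleton, only to illustrate the
 * skb_cow_head() failure path mentioned above.
 */
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* Headers must be private/writable before being edited in place;
	 * skb_cow_head() may have to reallocate skb->head and can fail
	 * under memory pressure.
	 */
	if (skb_cow_head(skb, 0)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* ... strip the HBH header, build TX descriptors, ring the doorbell ... */
	return NETDEV_TX_OK;
}
```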
On Fri, 13 May 2022 06:05:36 -0700 Eric Dumazet wrote:
> The problem is that skb_cow_head() can fail.
>
> Really, we have thought about this already.
>
> A common helper for drivers is mostly unusable: you would have to
> pre-allocate a per-TX-ring slot to store the headers, and we would end
> up adding complexity at queue creation/dismantle.
>
> We could do that later, because some NICs do not inline the headers in
> the TX descriptor, but instead request one mapped buffer for the
> headers part only.
>
> BTW, I know Tariq already reviewed; the issue at hand is CONFIG_FORTIFY,
> which is blocking us.
>
> This is why I was considering not submitting the mlx5 change until Kees
> Cook and others come up with a solution.

We do have the solution, no?

commit 43213daed6d6 ("fortify: Provide a memcpy trap door for sharp
corners")
On Fri, May 13, 2022 at 10:04 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 13 May 2022 06:05:36 -0700 Eric Dumazet wrote:
> > The problem is that skb_cow_head() can fail.
> >
> > Really, we have thought about this already.
> >
> > A common helper for drivers is mostly unusable: you would have to
> > pre-allocate a per-TX-ring slot to store the headers, and we would end
> > up adding complexity at queue creation/dismantle.
> >
> > We could do that later, because some NICs do not inline the headers in
> > the TX descriptor, but instead request one mapped buffer for the
> > headers part only.
> >
> > BTW, I know Tariq already reviewed; the issue at hand is CONFIG_FORTIFY,
> > which is blocking us.
> >
> > This is why I was considering not submitting the mlx5 change until Kees
> > Cook and others come up with a solution.
>
> We do have the solution, no?
>
> commit 43213daed6d6 ("fortify: Provide a memcpy trap door for sharp
> corners")

Oh, I missed that this was already merged. I will rebase then.

Hopefully ARCH=hexagon|awesome won't trigger a new issue :)

Thanks.
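For reference, the trap door added by that commit is an unsafe_memcpy() macro that takes a free-form justification argument and compiles down to a plain memcpy(), bypassing the FORTIFY_SOURCE bounds check on the destination field. A hedged sketch of how the mlx5 inline-header copy could use it (the helper name and the justification wording below are illustrative assumptions, not the code that was eventually merged):

```c
#include <linux/fortify-string.h>	/* unsafe_memcpy(), after commit 43213daed6d6 */
#include <linux/mlx5/qp.h>		/* struct mlx5_wqe_eth_seg */
#include <linux/skbuff.h>

/* Illustrative only: FORTIFY_SOURCE flags the memcpy() into
 * eseg->inline_hdr.start because that member is declared as a tiny
 * array, while the inlined headers deliberately spill into the WQE
 * segments that follow it.
 */
static void mlx5e_copy_inline_hdr(struct mlx5_wqe_eth_seg *eseg,
				  const struct sk_buff *skb, u16 ihs)
{
	unsafe_memcpy(eseg->inline_hdr.start, skb->data, ihs,
		      /* Room for ihs bytes was reserved in the WQE when the
		       * inline data-segment count was computed; the copy is
		       * bounded by that, not by sizeof(inline_hdr.start).
		       */);
}
```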
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d27986869b8ba070d1a4f8bcdc7e14ab54ae984e..bf3bca79e160124abd128ac1e9910cb2f39a39ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4920,6 +4920,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 
         netdev->priv_flags |= IFF_UNICAST_FLT;
 
+        netif_set_tso_max_size(netdev, GSO_MAX_SIZE);
         mlx5e_set_netdev_dev_addr(netdev);
         mlx5e_ipsec_build_netdev(priv);
         mlx5e_ktls_build_netdev(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 2dc48406cd08d21ff94f665cd61ab9227f351215..b4fc45ba1b347fb9ad0f46b9c091cc45e4d3d84f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -40,6 +40,7 @@
 #include "en_accel/en_accel.h"
 #include "en_accel/ipsec_rxtx.h"
 #include "en/ptp.h"
+#include <net/ipv6.h>
 
 static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma)
 {
@@ -130,23 +131,32 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb,
         sq->stats->csum_none++;
 }
 
+/* Returns the number of header bytes that we plan
+ * to inline later in the transmit descriptor
+ */
 static inline u16
-mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb)
+mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb, int *hopbyhop)
 {
         struct mlx5e_sq_stats *stats = sq->stats;
         u16 ihs;
 
+        *hopbyhop = 0;
         if (skb->encapsulation) {
                 ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
                 stats->tso_inner_packets++;
                 stats->tso_inner_bytes += skb->len - ihs;
         } else {
-                if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+                if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
                         ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
-                else
+                } else {
                         ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
+                        if (ipv6_has_hopopt_jumbo(skb)) {
+                                *hopbyhop = sizeof(struct hop_jumbo_hdr);
+                                ihs -= sizeof(struct hop_jumbo_hdr);
+                        }
+                }
                 stats->tso_packets++;
-                stats->tso_bytes += skb->len - ihs;
+                stats->tso_bytes += skb->len - ihs - *hopbyhop;
         }
 
         return ihs;
@@ -208,6 +218,7 @@ struct mlx5e_tx_attr {
         __be16 mss;
         u16 insz;
         u8 opcode;
+        u8 hopbyhop;
 };
 
 struct mlx5e_tx_wqe_attr {
@@ -244,14 +255,16 @@ static void mlx5e_sq_xmit_prepare(struct mlx5e_txqsq *sq, struct sk_buff *skb,
         struct mlx5e_sq_stats *stats = sq->stats;
 
         if (skb_is_gso(skb)) {
-                u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb);
+                int hopbyhop;
+                u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb, &hopbyhop);
 
                 *attr = (struct mlx5e_tx_attr) {
                         .opcode    = MLX5_OPCODE_LSO,
                         .mss       = cpu_to_be16(skb_shinfo(skb)->gso_size),
                         .ihs       = ihs,
                         .num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs,
-                        .headlen   = skb_headlen(skb) - ihs,
+                        .headlen   = skb_headlen(skb) - ihs - hopbyhop,
+                        .hopbyhop  = hopbyhop,
                 };
 
                 stats->packets += skb_shinfo(skb)->gso_segs;
@@ -365,7 +378,8 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
         struct mlx5_wqe_eth_seg  *eseg;
         struct mlx5_wqe_data_seg *dseg;
         struct mlx5e_tx_wqe_info *wi;
-
+        u16 ihs = attr->ihs;
+        struct ipv6hdr *h6;
         struct mlx5e_sq_stats *stats = sq->stats;
         int num_dma;
 
@@ -379,15 +393,36 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
         eseg->mss = attr->mss;
 
-        if (attr->ihs) {
-                if (skb_vlan_tag_present(skb)) {
-                        eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs + VLAN_HLEN);
-                        mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs);
+        if (ihs) {
+                u8 *start = eseg->inline_hdr.start;
+
+                if (unlikely(attr->hopbyhop)) {
+                        /* remove the HBH header.
+                         * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
+                         */
+                        if (skb_vlan_tag_present(skb)) {
+                                mlx5e_insert_vlan(start, skb, ETH_HLEN + sizeof(*h6));
+                                ihs += VLAN_HLEN;
+                                h6 = (struct ipv6hdr *)(start + sizeof(struct vlan_ethhdr));
+                        } else {
+                                memcpy(start, skb->data, ETH_HLEN + sizeof(*h6));
+                                h6 = (struct ipv6hdr *)(start + ETH_HLEN);
+                        }
+                        h6->nexthdr = IPPROTO_TCP;
+                        /* Copy the TCP header after the IPv6 one */
+                        memcpy(h6 + 1,
+                               skb->data + ETH_HLEN + sizeof(*h6) +
+                                       sizeof(struct hop_jumbo_hdr),
+                               tcp_hdrlen(skb));
+                        /* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */
+                } else if (skb_vlan_tag_present(skb)) {
+                        mlx5e_insert_vlan(start, skb, ihs);
+                        ihs += VLAN_HLEN;
                         stats->added_vlan_packets++;
                 } else {
-                        eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs);
-                        memcpy(eseg->inline_hdr.start, skb->data, attr->ihs);
+                        memcpy(start, skb->data, ihs);
                 }
+                eseg->inline_hdr.sz |= cpu_to_be16(ihs);
                 dseg += wqe_attr->ds_cnt_inl;
         } else if (skb_vlan_tag_present(skb)) {
                 eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
@@ -398,7 +433,7 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
         }
 
         dseg += wqe_attr->ds_cnt_ids;
-        num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs,
+        num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs + attr->hopbyhop,
                                           attr->headlen, dseg);
         if (unlikely(num_dma < 0))
                 goto err_drop;
@@ -918,12 +953,29 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
         eseg->mss = attr.mss;
 
         if (attr.ihs) {
-                memcpy(eseg->inline_hdr.start, skb->data, attr.ihs);
+                if (unlikely(attr.hopbyhop)) {
+                        struct ipv6hdr *h6;
+
+                        /* remove the HBH header.
+                         * Layout: [Ethernet header][IPv6 header][HBH][TCP header]
+                         */
+                        memcpy(eseg->inline_hdr.start, skb->data, ETH_HLEN + sizeof(*h6));
+                        h6 = (struct ipv6hdr *)((char *)eseg->inline_hdr.start + ETH_HLEN);
+                        h6->nexthdr = IPPROTO_TCP;
+                        /* Copy the TCP header after the IPv6 one */
+                        memcpy(h6 + 1,
+                               skb->data + ETH_HLEN + sizeof(*h6) +
+                                       sizeof(struct hop_jumbo_hdr),
+                               tcp_hdrlen(skb));
+                        /* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */
+                } else {
+                        memcpy(eseg->inline_hdr.start, skb->data, attr.ihs);
+                }
                 eseg->inline_hdr.sz = cpu_to_be16(attr.ihs);
                 dseg += wqe_attr.ds_cnt_inl;
         }
 
-        num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs,
+        num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs + attr.hopbyhop,
                                           attr.headlen, dseg);
         if (unlikely(num_dma < 0))
                 goto err_drop;
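For readers following the hopbyhop handling above: the 8 bytes being skipped are the Hop-by-Hop extension header carrying the RFC 2675 Jumbo Payload TLV that an earlier patch in this series introduces as struct hop_jumbo_hdr. Its layout, reproduced below for illustration (the series' include/net/ipv6.h is authoritative), is what makes sizeof(struct hop_jumbo_hdr) equal 8 and why the driver sets h6->nexthdr = IPPROTO_TCP after dropping it.

```c
/* Hop-by-Hop header + Jumbo Payload TLV prepended by the IPv6 GRO/TCP
 * stacks for BIG TCP packets (field layout as introduced earlier in this
 * series; shown here only to explain the 8-byte adjustment above).
 */
struct hop_jumbo_hdr {
	u8	nexthdr;		/* protocol after the HBH header: TCP here */
	u8	hdrlen;			/* 0: (hdrlen + 1) * 8 == 8 bytes total */
	u8	tlv_type;		/* IPV6_TLV_JUMBO (0xC2) */
	u8	tlv_len;		/* 4: size of jumbo_payload_len */
	__be32	jumbo_payload_len;	/* real length; ipv6hdr->payload_len stays 0 */
};
```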