diff mbox series

[net-next,v2,1/1] net: seg6: Add support for SRv6 Headend Reduced Encapsulation

Message ID 20220609132750.4917-1-anton.makarov11235@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v2,1/1] net: seg6: Add support for SRv6 Headend Reduced Encapsulation | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 29 this patch: 29
netdev/cc_maintainers warning 4 maintainers not CCed: yoshfuji@linux-ipv6.org pabeni@redhat.com dsahern@kernel.org edumazet@google.com
netdev/build_clang success Errors and warnings before: 6 this patch: 6
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 29 this patch: 29
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: No space is necessary after a cast CHECK: extern prototypes should be avoided in .h files WARNING: From:/Signed-off-by: email address mismatch: 'From: Anton Makarov <antonmakarov11235@gmail.com>' != 'Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>' WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Anton Makarov June 9, 2022, 1:27 p.m. UTC
SRv6 Headend H.Encaps.Red and H.Encaps.L2.Red behaviors are implemented
accordingly to RFC 8986. The H.Encaps.Red is an optimization of
The H.Encaps behavior. The H.Encaps.L2.Red is an optimization of
the H.Encaps.L2 behavior. Both new behaviors reduce the length of
the SRH by excluding the first SID in the SRH of the pushed IPv6 header.
The first SID is only placed in the Destination Address field
of the pushed IPv6 header.

The push of the SRH is omitted when the SRv6 Policy only contains
one segment.

Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>
---
v2: 1) Fixed sparse warnings
    2) memset now uses sizeof() instead of hardcoded value
    3) Removed EXPORT_SYMBOL_GPL
---
 include/net/seg6.h                 |  2 +
 include/uapi/linux/seg6_iptunnel.h |  2 +
 net/ipv6/seg6_iptunnel.c           | 94 +++++++++++++++++++++++++++++-
 3 files changed, 96 insertions(+), 2 deletions(-)

Comments

Andrea Mayer June 10, 2022, 11:59 a.m. UTC | #1
Hi Anton,
please see my comments inline, thanks.

On Thu,  9 Jun 2022 16:27:50 +0300
Anton Makarov <antonmakarov11235@gmail.com> wrote:

> SRv6 Headend H.Encaps.Red and H.Encaps.L2.Red behaviors are implemented
> accordingly to RFC 8986. The H.Encaps.Red is an optimization of
> The H.Encaps behavior. The H.Encaps.L2.Red is an optimization of
> the H.Encaps.L2 behavior. Both new behaviors reduce the length of
> the SRH by excluding the first SID in the SRH of the pushed IPv6 header.
> The first SID is only placed in the Destination Address field
> of the pushed IPv6 header.
> 
> The push of the SRH is omitted when the SRv6 Policy only contains
> one segment.
> 
> Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>
> 
> ...
>  
> +/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
> +int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	struct net *net = dev_net(dst->dev);
> +	struct ipv6hdr *hdr, *inner_hdr6;
> +	struct iphdr *inner_hdr4;
> +	struct ipv6_sr_hdr *isrh;
> +	int hdrlen = 0, tot_len, err;

I suppose we should stick with the reverse XMAS tree code style.

> +	__be32 flowlabel = 0;

this initialization is unnecessary since the variable is accessed for the first
time in writing, later in the code.

> +	if (osrh->first_segment > 0)
> +		hdrlen = (osrh->hdrlen - 1) << 3;
> +
> +	tot_len = hdrlen + sizeof(struct ipv6hdr);
> +
> +	err = skb_cow_head(skb, tot_len + skb->mac_len);
> +	if (unlikely(err))
> +		return err;
> +
> +	inner_hdr6 = ipv6_hdr(skb);
> +	inner_hdr4 = ip_hdr(skb);

inner_hdr4 is only used in the *if* block that follows later on.

> +	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr6);
> +
> +	skb_push(skb, tot_len);
> +	skb_reset_network_header(skb);
> +	skb_mac_header_rebuild(skb);
> +	hdr = ipv6_hdr(skb);
> +
> +	memset(skb->cb, 0, sizeof(skb->cb));

is there a specific reason why we should consider the whole CB size and not
only the part covered by the struct inet6_skb_parm?

> +	IP6CB(skb)->iif = skb->skb_iif;
> +
> +	if (skb->protocol == htons(ETH_P_IPV6)) {
> +		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr6)),
> +			     flowlabel);
> +		hdr->hop_limit = inner_hdr6->hop_limit;
> +	} else if (skb->protocol == htons(ETH_P_IP)) {
> +		ip6_flow_hdr(hdr, (unsigned int) inner_hdr4->tos, flowlabel);
> +		hdr->hop_limit = inner_hdr4->ttl;
> +	}
> +

Don't IPv4 and IPv6 cover all possible cases?

> +	skb->protocol = htons(ETH_P_IPV6);
> +
> +	hdr->daddr = osrh->segments[osrh->first_segment];
> +	hdr->version = 6;
> +
> +	if (osrh->first_segment > 0) {
> +		hdr->nexthdr = NEXTHDR_ROUTING;
> +
> +		isrh = (void *)hdr + sizeof(struct ipv6hdr);
> +		memcpy(isrh, osrh, hdrlen);
> +
> +		isrh->nexthdr = proto;
> +		isrh->first_segment--;
> +		isrh->hdrlen -= 2;
> +	} else {
> +		hdr->nexthdr = proto;
> +	}
> +
> +	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
> +
> +#ifdef CONFIG_IPV6_SEG6_HMAC
> +	if (osrh->first_segment > 0 && sr_has_hmac(isrh)) {
> +		err = seg6_push_hmac(net, &hdr->saddr, isrh);
> +		if (unlikely(err))
> +			return err;
> +	}
> +#endif
> +

When there is only one SID and HMAC is configured, the SRH is not kept.
Aren't we losing information this way?

Andrea
Anton Makarov June 10, 2022, 6:21 p.m. UTC | #2
Hi Andrea,
Thank you very much for your feedback! Plese look at my response inline
and let me know what you think about that. Many thanks!

On Fri, 10 Jun 2022 13:59:58 +0200
Andrea Mayer <andrea.mayer@uniroma2.it> wrote:

> Hi Anton,
> please see my comments inline, thanks.
> 
> On Thu,  9 Jun 2022 16:27:50 +0300
> Anton Makarov <antonmakarov11235@gmail.com> wrote:
> 
> > SRv6 Headend H.Encaps.Red and H.Encaps.L2.Red behaviors are implemented
> > accordingly to RFC 8986. The H.Encaps.Red is an optimization of
> > The H.Encaps behavior. The H.Encaps.L2.Red is an optimization of
> > the H.Encaps.L2 behavior. Both new behaviors reduce the length of
> > the SRH by excluding the first SID in the SRH of the pushed IPv6 header.
> > The first SID is only placed in the Destination Address field
> > of the pushed IPv6 header.
> > 
> > The push of the SRH is omitted when the SRv6 Policy only contains
> > one segment.
> > 
> > Signed-off-by: Anton Makarov <anton.makarov11235@gmail.com>
> > 
> > ...
> >  
> > +/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
> > +int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
> > +{
> > +	struct dst_entry *dst = skb_dst(skb);
> > +	struct net *net = dev_net(dst->dev);
> > +	struct ipv6hdr *hdr, *inner_hdr6;
> > +	struct iphdr *inner_hdr4;
> > +	struct ipv6_sr_hdr *isrh;
> > +	int hdrlen = 0, tot_len, err;
> 
> I suppose we should stick with the reverse XMAS tree code style.

Sure, no problem.

> 
> > +	__be32 flowlabel = 0;
> 
> this initialization is unnecessary since the variable is accessed for the first
> time in writing, later in the code.

Sorry, missed this extra action. You are correct.

> 
> > +	if (osrh->first_segment > 0)
> > +		hdrlen = (osrh->hdrlen - 1) << 3;
> > +
> > +	tot_len = hdrlen + sizeof(struct ipv6hdr);
> > +
> > +	err = skb_cow_head(skb, tot_len + skb->mac_len);
> > +	if (unlikely(err))
> > +		return err;
> > +
> > +	inner_hdr6 = ipv6_hdr(skb);
> > +	inner_hdr4 = ip_hdr(skb);
> 
> inner_hdr4 is only used in the *if* block that follows later on.

Do you mean it has to be defined inside *if* block and assigned via
inner_ip_hdr()?

> 
> > +	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr6);
> > +
> > +	skb_push(skb, tot_len);
> > +	skb_reset_network_header(skb);
> > +	skb_mac_header_rebuild(skb);
> > +	hdr = ipv6_hdr(skb);
> > +
> > +	memset(skb->cb, 0, sizeof(skb->cb));
> 
> is there a specific reason why we should consider the whole CB size and not
> only the part covered by the struct inet6_skb_parm?

Oh yes, memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))) would be better. You
are correct.

> 
> > +	IP6CB(skb)->iif = skb->skb_iif;
> > +
> > +	if (skb->protocol == htons(ETH_P_IPV6)) {
> > +		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr6)),
> > +			     flowlabel);
> > +		hdr->hop_limit = inner_hdr6->hop_limit;
> > +	} else if (skb->protocol == htons(ETH_P_IP)) {
> > +		ip6_flow_hdr(hdr, (unsigned int) inner_hdr4->tos, flowlabel);
> > +		hdr->hop_limit = inner_hdr4->ttl;
> > +	}
> > +
> 
> Don't IPv4 and IPv6 cover all possible cases?

In fact while case SEG6_IPTUN_MODE_ENCAP in seg6_do_srh() does
preliminary check of protocol value, case SEG6_IPTUN_MODE_L2ENCAP does
not. So potentially skb->protocol can be of any value. Although
additional check brings extra impact on performance, sure.

> 
> > +	skb->protocol = htons(ETH_P_IPV6);
> > +
> > +	hdr->daddr = osrh->segments[osrh->first_segment];
> > +	hdr->version = 6;
> > +
> > +	if (osrh->first_segment > 0) {
> > +		hdr->nexthdr = NEXTHDR_ROUTING;
> > +
> > +		isrh = (void *)hdr + sizeof(struct ipv6hdr);
> > +		memcpy(isrh, osrh, hdrlen);
> > +
> > +		isrh->nexthdr = proto;
> > +		isrh->first_segment--;
> > +		isrh->hdrlen -= 2;
> > +	} else {
> > +		hdr->nexthdr = proto;
> > +	}
> > +
> > +	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
> > +
> > +#ifdef CONFIG_IPV6_SEG6_HMAC
> > +	if (osrh->first_segment > 0 && sr_has_hmac(isrh)) {
> > +		err = seg6_push_hmac(net, &hdr->saddr, isrh);
> > +		if (unlikely(err))
> > +			return err;
> > +	}
> > +#endif
> > +
> 
> When there is only one SID and HMAC is configured, the SRH is not kept.
> Aren't we losing information this way?

Yes, but HMAC is just an optional part of SRH. RFC 8986 allows us to
omit entire SRH in reduced encapsulation when the SRv6 Policy only
contains one segment. And it seems to be the most usefull approach as
far as:
1) About all hardware implementations do not procede HMAC at all
2) Too many networking guys have a great concern about huge overhead of
SRv6 in compare with MPLS, so they are not happy to get extra 256 bits
3) If one consider HMAC mandatory then there is still basic (not
reduced) encapsulation option

What do you think about it?

> 
> Andrea

Anton
Andrea Mayer June 11, 2022, 6:23 p.m. UTC | #3
Hi Anton,
please consider my answers below. Thanks.

On Fri, 10 Jun 2022 21:21:08 +0300
Anton Makarov <antonmakarov11235@gmail.com> wrote:

> Hi Andrea,
> Thank you very much for your feedback! Plese look at my response inline
> and let me know what you think about that. Many thanks!
> 

you are welcome!

> 
> > 
> > > +	if (osrh->first_segment > 0)
> > > +		hdrlen = (osrh->hdrlen - 1) << 3;
> > > +
> > > +	tot_len = hdrlen + sizeof(struct ipv6hdr);
> > > +
> > > +	err = skb_cow_head(skb, tot_len + skb->mac_len);
> > > +	if (unlikely(err))
> > > +		return err;
> > > +
> > > +	inner_hdr6 = ipv6_hdr(skb);
> > > +	inner_hdr4 = ip_hdr(skb);
> > 
> > inner_hdr4 is only used in the *if* block that follows later on.
> 
> Do you mean it has to be defined inside *if* block and assigned via
> inner_ip_hdr()?
> 

I think it is correct to define inner_hdr4 as you have already done, but
initialize and use it only within the *else if* block (the IPv4 one, of
course), as it is no further accessed outside.

> > > +	IP6CB(skb)->iif = skb->skb_iif;
> > > +
> > > +	if (skb->protocol == htons(ETH_P_IPV6)) {
> > > +		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr6)),
> > > +			     flowlabel);
> > > +		hdr->hop_limit = inner_hdr6->hop_limit;
> > > +	} else if (skb->protocol == htons(ETH_P_IP)) {
> > > +		ip6_flow_hdr(hdr, (unsigned int) inner_hdr4->tos, flowlabel);
> > > +		hdr->hop_limit = inner_hdr4->ttl;
> > > +	}
> > > +
> > 
> > Don't IPv4 and IPv6 cover all possible cases?
> 
> In fact while case SEG6_IPTUN_MODE_ENCAP in seg6_do_srh() does
> preliminary check of protocol value, case SEG6_IPTUN_MODE_L2ENCAP does
> not. So potentially skb->protocol can be of any value. Although
> additional check brings extra impact on performance, sure.
> 

If you fill the pushed IPv6 header in the same way as is done in
seg6_do_srh_encap(), I don't think you need to provide other checks
on skb->protocol (thus avoiding the *if else if* in favor of only *if else*).

In your solution, if the protocol is neither IP6 nor IP4, then the pushed
IPv6 header is partially initialized. In particular, it seems to me that the
traffic class, flow label, and hop limit are not set anywhere else.

> > 
> > > +	skb->protocol = htons(ETH_P_IPV6);
> > > +

This seems to be redundant, since in seg6_do_srh() the protocol is set after
calling the seg6_do_srh_encap{_red}() functions.

> > > +	hdr->daddr = osrh->segments[osrh->first_segment];
> > > +	hdr->version = 6;
> > > +
> > > +	if (osrh->first_segment > 0) {
> > > +		hdr->nexthdr = NEXTHDR_ROUTING;
> > > +
> > > +		isrh = (void *)hdr + sizeof(struct ipv6hdr);
> > > +		memcpy(isrh, osrh, hdrlen);
> > > +
> > > +		isrh->nexthdr = proto;
> > > +		isrh->first_segment--;
> > > +		isrh->hdrlen -= 2;
> > > +	} else {
> > > +		hdr->nexthdr = proto;
> > > +	}
> > > +
> > > +	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
> > > +
> > > +#ifdef CONFIG_IPV6_SEG6_HMAC
> > > +	if (osrh->first_segment > 0 && sr_has_hmac(isrh)) {
> > > +		err = seg6_push_hmac(net, &hdr->saddr, isrh);
> > > +		if (unlikely(err))
> > > +			return err;
> > > +	}
> > > +#endif
> > > +
> > 
> > When there is only one SID and HMAC is configured, the SRH is not kept.
> > Aren't we losing information this way?
> 
> Yes, but HMAC is just an optional part of SRH. RFC 8986 allows us to
> omit entire SRH in reduced encapsulation when the SRv6 Policy only
> contains one segment. 
> And it seems to be the most usefull approach as
> far as:
> 1) About all hardware implementations do not procede HMAC at all
> 2) Too many networking guys have a great concern about huge overhead of
> SRv6 in compare with MPLS, so they are not happy to get extra 256 bits
> 3) If one consider HMAC mandatory then there is still basic (not
> reduced) encapsulation option
> 
> What do you think about it?
> 

Thanks for the explanation.

However, considering the RFC 8986 Section 5.2:
  "The push of the SRH MAY be omitted when the SRv6 Policy only contains one
   segment and there is no need to use any flag, tag, or TLV."

Hence, if a user needs to use HMAC (rare case) or any other type of supported
flags, tags and TLVs, then the SRH should not be removed, even if there is
only one SID.

Andrea
Anton Makarov June 13, 2022, 12:42 p.m. UTC | #4
Hi Andrea,
Please look inline. Many thanks!

On Sat, 11 Jun 2022 20:23:35 +0200
Andrea Mayer <andrea.mayer@uniroma2.it> wrote:

> Hi Anton,
> please consider my answers below. Thanks.
> 
> On Fri, 10 Jun 2022 21:21:08 +0300
> Anton Makarov <antonmakarov11235@gmail.com> wrote:
> 
> > Hi Andrea,
> > Thank you very much for your feedback! Plese look at my response inline
> > and let me know what you think about that. Many thanks!
> > 
> 
> you are welcome!
> 
> > 
> > > 
> > > > +	if (osrh->first_segment > 0)
> > > > +		hdrlen = (osrh->hdrlen - 1) << 3;
> > > > +
> > > > +	tot_len = hdrlen + sizeof(struct ipv6hdr);
> > > > +
> > > > +	err = skb_cow_head(skb, tot_len + skb->mac_len);
> > > > +	if (unlikely(err))
> > > > +		return err;
> > > > +
> > > > +	inner_hdr6 = ipv6_hdr(skb);
> > > > +	inner_hdr4 = ip_hdr(skb);
> > > 
> > > inner_hdr4 is only used in the *if* block that follows later on.
> > 
> > Do you mean it has to be defined inside *if* block and assigned via
> > inner_ip_hdr()?
> > 
> 
> I think it is correct to define inner_hdr4 as you have already done, but
> initialize and use it only within the *else if* block (the IPv4 one, of
> course), as it is no further accessed outside.

Agree.

> 
> > > > +	IP6CB(skb)->iif = skb->skb_iif;
> > > > +
> > > > +	if (skb->protocol == htons(ETH_P_IPV6)) {
> > > > +		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr6)),
> > > > +			     flowlabel);
> > > > +		hdr->hop_limit = inner_hdr6->hop_limit;
> > > > +	} else if (skb->protocol == htons(ETH_P_IP)) {
> > > > +		ip6_flow_hdr(hdr, (unsigned int) inner_hdr4->tos, flowlabel);
> > > > +		hdr->hop_limit = inner_hdr4->ttl;
> > > > +	}
> > > > +
> > > 
> > > Don't IPv4 and IPv6 cover all possible cases?
> > 
> > In fact while case SEG6_IPTUN_MODE_ENCAP in seg6_do_srh() does
> > preliminary check of protocol value, case SEG6_IPTUN_MODE_L2ENCAP does
> > not. So potentially skb->protocol can be of any value. Although
> > additional check brings extra impact on performance, sure.
> > 
> 
> If you fill the pushed IPv6 header in the same way as is done in
> seg6_do_srh_encap(), I don't think you need to provide other checks
> on skb->protocol (thus avoiding the *if else if* in favor of only *if else*).
> 
> In your solution, if the protocol is neither IP6 nor IP4, then the pushed
> IPv6 header is partially initialized. In particular, it seems to me that the
> traffic class, flow label, and hop limit are not set anywhere else.

Seems there is a more global inconsistence in entire seg6 implementation
because L2.Encaps has to inherit some parameters not from IPv6/IPv4
headers, but from inner Ethernet and VLAN headers. No?

> 
> > > 
> > > > +	skb->protocol = htons(ETH_P_IPV6);
> > > > +
> 
> This seems to be redundant, since in seg6_do_srh() the protocol is set after
> calling the seg6_do_srh_encap{_red}() functions.

> 
> > > > +	hdr->daddr = osrh->segments[osrh->first_segment];
> > > > +	hdr->version = 6;
> > > > +
> > > > +	if (osrh->first_segment > 0) {
> > > > +		hdr->nexthdr = NEXTHDR_ROUTING;
> > > > +
> > > > +		isrh = (void *)hdr + sizeof(struct ipv6hdr);
> > > > +		memcpy(isrh, osrh, hdrlen);
> > > > +
> > > > +		isrh->nexthdr = proto;
> > > > +		isrh->first_segment--;
> > > > +		isrh->hdrlen -= 2;
> > > > +	} else {
> > > > +		hdr->nexthdr = proto;
> > > > +	}
> > > > +
> > > > +	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
> > > > +
> > > > +#ifdef CONFIG_IPV6_SEG6_HMAC
> > > > +	if (osrh->first_segment > 0 && sr_has_hmac(isrh)) {
> > > > +		err = seg6_push_hmac(net, &hdr->saddr, isrh);
> > > > +		if (unlikely(err))
> > > > +			return err;
> > > > +	}
> > > > +#endif
> > > > +
> > > 
> > > When there is only one SID and HMAC is configured, the SRH is not kept.
> > > Aren't we losing information this way?
> > 
> > Yes, but HMAC is just an optional part of SRH. RFC 8986 allows us to
> > omit entire SRH in reduced encapsulation when the SRv6 Policy only
> > contains one segment. 
> > And it seems to be the most usefull approach as
> > far as:
> > 1) About all hardware implementations do not procede HMAC at all
> > 2) Too many networking guys have a great concern about huge overhead of
> > SRv6 in compare with MPLS, so they are not happy to get extra 256 bits
> > 3) If one consider HMAC mandatory then there is still basic (not
> > reduced) encapsulation option
> > 
> > What do you think about it?
> > 
> 
> Thanks for the explanation.

You are welcome.

> 
> However, considering the RFC 8986 Section 5.2:
>   "The push of the SRH MAY be omitted when the SRv6 Policy only contains one
>    segment and there is no need to use any flag, tag, or TLV."
> 
> Hence, if a user needs to use HMAC (rare case) or any other type of supported
> flags, tags and TLVs, then the SRH should not be removed, even if there is
> only one SID.

Well noted - rare case. Seems to be academic research only, perhaps
some synthetic PoC. For many years since SRv6 introduction IANA
registered no more SHR TLV besides stillborn HMAC.
Processing some kind of security keys in hardware on multi-Tbps speeds
is unacceptably expensive for production network.

> 
> Andrea

Dear Andrea,
I see you are passionately keeping SRv6 part of kernel code under your
umbrella and this is great! Thus please offer your implementation of
SRv6 reduced encapsulation with your favorite coding style and so on
and so forth. I would be happy to get it and withdraw my version. I
just need encap.red to exist in the kernel, no more. Anyway we ignore
all unneeded parameters while offloading it in our hardware :)

Please note that if you are passionate of strictly followig standards
then entire SRv6 endpoint behaviors' implementation has to be reworked
hardly - lwtunnel is not a correct place for it. I will designate just
a couple of unacceptable points:

1) SRv6 Endpoint behaviors cannot require dev as mandatory parameter
like current implementation does. While End.DT4, End.DT6, End.DT46 have
some workaround- dev of type vrf, basic End behavior assumess egress dev
can be defined only in the second IP lookup _after_ processing of End
behavior itself. Thus dev parameter cannot be assigned during endpoint
behavior configuration. Please refer RFC 8986 section 4.1:

"S01. When an SRH is processed {
S02.   If (Segments Left == 0) {
S03.      Stop processing the SRH, and proceed to process the next
             header in the packet, whose type is identified by
             the Next Header field in the routing header.
S04.   }
S05.   If (IPv6 Hop Limit <= 1) {
S06.      Send an ICMP Time Exceeded message to the Source Address
             with Code 0 (Hop limit exceeded in transit),
             interrupt packet processing, and discard the packet.
S07.   }
S08.   max_LE = (Hdr Ext Len / 2) - 1
S09.   If ((Last Entry > max_LE) or (Segments Left > Last Entry+1)) {
S10.      Send an ICMP Parameter Problem to the Source Address
             with Code 0 (Erroneous header field encountered)
             and Pointer set to the Segments Left field,
             interrupt packet processing, and discard the packet.
S11.   }
S12.   Decrement IPv6 Hop Limit by 1
S13.   Decrement Segments Left by 1
S14.   Update IPv6 DA with Segment List[Segments Left]
S15.   Submit the packet to the egress IPv6 FIB lookup for
          transmission to the new destination
S16. }"

Of course, current implementation of End behavior has a hack - using dev
of type dummy and then next IP lookup. But I am afraid it impacts
networking stack performance and memory consuming much harder that just
extra variable. MPLS part of stack is a good reference how it has to be
implemented accordingly to RFC 8986.

2) H.Encaps.L2 cannot be processed _after_ IP lookup. Accordingly to RFC
8986 section 5.3 "The H.Encaps.L2 behavior encapsulates a received
Ethernet [IEEE.802.3_2018] frame and its attached VLAN header, if
present, in an IPv6 packet with an SRH. The Ethernet frame becomes the
payload of the new IPv6 packet".

IP lookup before H.Encaps.L2 in lwtunnel drops all broadcast and non-IP
frames and this is not acceptable for L2 tunnel behavior. Moreover
current H.Encaps.L2 implementation matches IP packets for tunneling by
IP address while it has to match Ethernet frames by MAC addresses. Dev
of type vxlan is a good reference how it has to be done.

With best regards,
Anton
diff mbox series

Patch

diff --git a/include/net/seg6.h b/include/net/seg6.h
index af668f17b398..8d0ce782f830 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -62,6 +62,8 @@  extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags);
 extern void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt);
 extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 			     int proto);
+extern int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
+			     int proto);
 extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh);
 extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 			       u32 tbl_id);
diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
index eb815e0d0ac3..a9fa777f16de 100644
--- a/include/uapi/linux/seg6_iptunnel.h
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -35,6 +35,8 @@  enum {
 	SEG6_IPTUN_MODE_INLINE,
 	SEG6_IPTUN_MODE_ENCAP,
 	SEG6_IPTUN_MODE_L2ENCAP,
+	SEG6_IPTUN_MODE_ENCAP_RED,
+	SEG6_IPTUN_MODE_L2ENCAP_RED,
 };
 
 #endif
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index d64855010948..e70c0401715e 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -36,9 +36,11 @@  static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
 	case SEG6_IPTUN_MODE_INLINE:
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
+	case SEG6_IPTUN_MODE_ENCAP_RED:
 		head = sizeof(struct ipv6hdr);
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
 		return 0;
 	}
 
@@ -195,6 +197,80 @@  int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 }
 EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
 
+/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */
+int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct net *net = dev_net(dst->dev);
+	struct ipv6hdr *hdr, *inner_hdr6;
+	struct iphdr *inner_hdr4;
+	struct ipv6_sr_hdr *isrh;
+	int hdrlen = 0, tot_len, err;
+	__be32 flowlabel = 0;
+
+	if (osrh->first_segment > 0)
+		hdrlen = (osrh->hdrlen - 1) << 3;
+
+	tot_len = hdrlen + sizeof(struct ipv6hdr);
+
+	err = skb_cow_head(skb, tot_len + skb->mac_len);
+	if (unlikely(err))
+		return err;
+
+	inner_hdr6 = ipv6_hdr(skb);
+	inner_hdr4 = ip_hdr(skb);
+	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr6);
+
+	skb_push(skb, tot_len);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+	hdr = ipv6_hdr(skb);
+
+	memset(skb->cb, 0, sizeof(skb->cb));
+	IP6CB(skb)->iif = skb->skb_iif;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr6)),
+			     flowlabel);
+		hdr->hop_limit = inner_hdr6->hop_limit;
+	} else if (skb->protocol == htons(ETH_P_IP)) {
+		ip6_flow_hdr(hdr, (unsigned int) inner_hdr4->tos, flowlabel);
+		hdr->hop_limit = inner_hdr4->ttl;
+	}
+
+	skb->protocol = htons(ETH_P_IPV6);
+
+	hdr->daddr = osrh->segments[osrh->first_segment];
+	hdr->version = 6;
+
+	if (osrh->first_segment > 0) {
+		hdr->nexthdr = NEXTHDR_ROUTING;
+
+		isrh = (void *)hdr + sizeof(struct ipv6hdr);
+		memcpy(isrh, osrh, hdrlen);
+
+		isrh->nexthdr = proto;
+		isrh->first_segment--;
+		isrh->hdrlen -= 2;
+	} else {
+		hdr->nexthdr = proto;
+	}
+
+	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	if (osrh->first_segment > 0 && sr_has_hmac(isrh)) {
+		err = seg6_push_hmac(net, &hdr->saddr, isrh);
+		if (unlikely(err))
+			return err;
+	}
+#endif
+
+	skb_postpush_rcsum(skb, hdr, tot_len);
+
+	return 0;
+}
+
 /* insert an SRH within an IPv6 packet, just after the IPv6 header */
 int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 {
@@ -265,6 +341,7 @@  static int seg6_do_srh(struct sk_buff *skb)
 			return err;
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
+	case SEG6_IPTUN_MODE_ENCAP_RED:
 		err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
 		if (err)
 			return err;
@@ -276,7 +353,11 @@  static int seg6_do_srh(struct sk_buff *skb)
 		else
 			return -EINVAL;
 
-		err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+		if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP)
+			err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+		else
+			err = seg6_do_srh_encap_red(skb, tinfo->srh, proto);
+
 		if (err)
 			return err;
 
@@ -285,6 +366,7 @@  static int seg6_do_srh(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IPV6);
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
 		if (!skb_mac_header_was_set(skb))
 			return -EINVAL;
 
@@ -294,7 +376,11 @@  static int seg6_do_srh(struct sk_buff *skb)
 		skb_mac_header_rebuild(skb);
 		skb_push(skb, skb->mac_len);
 
-		err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET);
+		if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP)
+			err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET);
+		else
+			err = seg6_do_srh_encap_red(skb, tinfo->srh, IPPROTO_ETHERNET);
+
 		if (err)
 			return err;
 
@@ -514,6 +600,10 @@  static int seg6_build_state(struct net *net, struct nlattr *nla,
 		break;
 	case SEG6_IPTUN_MODE_L2ENCAP:
 		break;
+	case SEG6_IPTUN_MODE_ENCAP_RED:
+		break;
+	case SEG6_IPTUN_MODE_L2ENCAP_RED:
+		break;
 	default:
 		return -EINVAL;
 	}