diff mbox series

[net-next,3/3] openvswitch: set IPS_CONFIRMED in tmpl status only when commit is set in conntrack

Message ID cf477f4a26579e752465a5951c1d28ba109346e3.1689541664.git.lucien.xin@gmail.com (mailing list archive)
State Accepted
Commit 8c8b733208058702da451b7d60a12c0ff90b6879
Delegated to: Netdev Maintainers
Headers show
Series net: handle the exp removal problem with ovs upcall properly | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1342 this patch: 1342
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 1365 this patch: 1365
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1365 this patch: 1365
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 98 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Xin Long July 16, 2023, 9:09 p.m. UTC
By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
from the hashtable when lookup, we can simplify the exp processing code a
lot in openvswitch conntrack.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/openvswitch/conntrack.c | 78 +++++--------------------------------
 1 file changed, 10 insertions(+), 68 deletions(-)

Comments

Aaron Conole July 19, 2023, 4:08 p.m. UTC | #1
Xin Long <lucien.xin@gmail.com> writes:

> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
> from the hashtable when lookup, we can simplify the exp processing code a
> lot in openvswitch conntrack.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---

Acked-by: Aaron Conole <aconole@redhat.com>
Ilya Maximets June 17, 2024, 8:10 p.m. UTC | #2
On 7/16/23 23:09, Xin Long wrote:
> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
> from the hashtable when lookup, we can simplify the exp processing code a
> lot in openvswitch conntrack.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
>  1 file changed, 10 insertions(+), 68 deletions(-)
> 
> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index 331730fd3580..fa955e892210 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
>  	return 0;
>  }
>  
> -static struct nf_conntrack_expect *
> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
> -		   u16 proto, const struct sk_buff *skb)
> -{
> -	struct nf_conntrack_tuple tuple;
> -	struct nf_conntrack_expect *exp;
> -
> -	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
> -		return NULL;
> -
> -	exp = __nf_ct_expect_find(net, zone, &tuple);
> -	if (exp) {
> -		struct nf_conntrack_tuple_hash *h;
> -
> -		/* Delete existing conntrack entry, if it clashes with the
> -		 * expectation.  This can happen since conntrack ALGs do not
> -		 * check for clashes between (new) expectations and existing
> -		 * conntrack entries.  nf_conntrack_in() will check the
> -		 * expectations only if a conntrack entry can not be found,
> -		 * which can lead to OVS finding the expectation (here) in the
> -		 * init direction, but which will not be removed by the
> -		 * nf_conntrack_in() call, if a matching conntrack entry is
> -		 * found instead.  In this case all init direction packets
> -		 * would be reported as new related packets, while reply
> -		 * direction packets would be reported as un-related
> -		 * established packets.
> -		 */
> -		h = nf_conntrack_find_get(net, zone, &tuple);
> -		if (h) {
> -			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> -
> -			nf_ct_delete(ct, 0, 0);
> -			nf_ct_put(ct);
> -		}
> -	}
> -
> -	return exp;
> -}
> -
>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
>  static enum ip_conntrack_info
>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>  			 const struct ovs_conntrack_info *info,
>  			 struct sk_buff *skb)
>  {
> -	struct nf_conntrack_expect *exp;
> -
> -	/* If we pass an expected packet through nf_conntrack_in() the
> -	 * expectation is typically removed, but the packet could still be
> -	 * lost in upcall processing.  To prevent this from happening we
> -	 * perform an explicit expectation lookup.  Expected connections are
> -	 * always new, and will be passed through conntrack only when they are
> -	 * committed, as it is OK to remove the expectation at that time.
> -	 */
> -	exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
> -	if (exp) {
> -		u8 state;
> -
> -		/* NOTE: New connections are NATted and Helped only when
> -		 * committed, so we are not calling into NAT here.
> -		 */
> -		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
> -		__ovs_ct_update_key(key, state, &info->zone, exp->master);

Hi, Xin, others.

Unfortunately, it seems like removal of this code broke the expected behavior.
OVS in userspace expects that SYN packet of a new related FTP connection will
get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
new.  This is a problem because we need to commit this connection with the label
and we do that for +new packets.  If we can't get +new packet we'll have to commit
every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
significant behavior change regardless.

Could you, please, take a look?

The issue can be reproduced by running check-kernel tests in OVS repo.
'FTP SNAT orig tuple' tests fail 100% of the time.

Best regards, Ilya Maximets.
Ilya Maximets June 18, 2024, 11:34 a.m. UTC | #3
On 6/17/24 22:10, Ilya Maximets wrote:
> On 7/16/23 23:09, Xin Long wrote:
>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
>> from the hashtable when lookup, we can simplify the exp processing code a
>> lot in openvswitch conntrack.
>>
>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
>> ---
>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
>>  1 file changed, 10 insertions(+), 68 deletions(-)
>>
>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>> index 331730fd3580..fa955e892210 100644
>> --- a/net/openvswitch/conntrack.c
>> +++ b/net/openvswitch/conntrack.c
>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
>>  	return 0;
>>  }
>>  
>> -static struct nf_conntrack_expect *
>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
>> -		   u16 proto, const struct sk_buff *skb)
>> -{
>> -	struct nf_conntrack_tuple tuple;
>> -	struct nf_conntrack_expect *exp;
>> -
>> -	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
>> -		return NULL;
>> -
>> -	exp = __nf_ct_expect_find(net, zone, &tuple);
>> -	if (exp) {
>> -		struct nf_conntrack_tuple_hash *h;
>> -
>> -		/* Delete existing conntrack entry, if it clashes with the
>> -		 * expectation.  This can happen since conntrack ALGs do not
>> -		 * check for clashes between (new) expectations and existing
>> -		 * conntrack entries.  nf_conntrack_in() will check the
>> -		 * expectations only if a conntrack entry can not be found,
>> -		 * which can lead to OVS finding the expectation (here) in the
>> -		 * init direction, but which will not be removed by the
>> -		 * nf_conntrack_in() call, if a matching conntrack entry is
>> -		 * found instead.  In this case all init direction packets
>> -		 * would be reported as new related packets, while reply
>> -		 * direction packets would be reported as un-related
>> -		 * established packets.
>> -		 */
>> -		h = nf_conntrack_find_get(net, zone, &tuple);
>> -		if (h) {
>> -			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>> -
>> -			nf_ct_delete(ct, 0, 0);
>> -			nf_ct_put(ct);
>> -		}
>> -	}
>> -
>> -	return exp;
>> -}
>> -
>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
>>  static enum ip_conntrack_info
>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>>  			 const struct ovs_conntrack_info *info,
>>  			 struct sk_buff *skb)
>>  {
>> -	struct nf_conntrack_expect *exp;
>> -
>> -	/* If we pass an expected packet through nf_conntrack_in() the
>> -	 * expectation is typically removed, but the packet could still be
>> -	 * lost in upcall processing.  To prevent this from happening we
>> -	 * perform an explicit expectation lookup.  Expected connections are
>> -	 * always new, and will be passed through conntrack only when they are
>> -	 * committed, as it is OK to remove the expectation at that time.
>> -	 */
>> -	exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
>> -	if (exp) {
>> -		u8 state;
>> -
>> -		/* NOTE: New connections are NATted and Helped only when
>> -		 * committed, so we are not calling into NAT here.
>> -		 */
>> -		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
>> -		__ovs_ct_update_key(key, state, &info->zone, exp->master);
> 
> Hi, Xin, others.
> 
> Unfortunately, it seems like removal of this code broke the expected behavior.
> OVS in userspace expects that SYN packet of a new related FTP connection will
> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
> new.  This is a problem because we need to commit this connection with the label
> and we do that for +new packets.  If we can't get +new packet we'll have to commit
> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
> significant behavior change regardless.

Interestingly enough I see +new+rel+trk packets in cases without SNAT,
but we can only get +rel+trk in cases with SNAT.  So, this may be just
a generic conntrack bug somewhere.  At least the behavior seems fairly
inconsistent.

> 
> Could you, please, take a look?
> 
> The issue can be reproduced by running check-kernel tests in OVS repo.
> 'FTP SNAT orig tuple' tests fail 100% of the time.
> 
> Best regards, Ilya Maximets.
Xin Long June 18, 2024, 2:58 p.m. UTC | #4
On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 6/17/24 22:10, Ilya Maximets wrote:
> > On 7/16/23 23:09, Xin Long wrote:
> >> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
> >> from the hashtable when lookup, we can simplify the exp processing code a
> >> lot in openvswitch conntrack.
> >>
> >> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> >> ---
> >>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
> >>  1 file changed, 10 insertions(+), 68 deletions(-)
> >>
> >> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> >> index 331730fd3580..fa955e892210 100644
> >> --- a/net/openvswitch/conntrack.c
> >> +++ b/net/openvswitch/conntrack.c
> >> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
> >>      return 0;
> >>  }
> >>
> >> -static struct nf_conntrack_expect *
> >> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
> >> -               u16 proto, const struct sk_buff *skb)
> >> -{
> >> -    struct nf_conntrack_tuple tuple;
> >> -    struct nf_conntrack_expect *exp;
> >> -
> >> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
> >> -            return NULL;
> >> -
> >> -    exp = __nf_ct_expect_find(net, zone, &tuple);
> >> -    if (exp) {
> >> -            struct nf_conntrack_tuple_hash *h;
> >> -
> >> -            /* Delete existing conntrack entry, if it clashes with the
> >> -             * expectation.  This can happen since conntrack ALGs do not
> >> -             * check for clashes between (new) expectations and existing
> >> -             * conntrack entries.  nf_conntrack_in() will check the
> >> -             * expectations only if a conntrack entry can not be found,
> >> -             * which can lead to OVS finding the expectation (here) in the
> >> -             * init direction, but which will not be removed by the
> >> -             * nf_conntrack_in() call, if a matching conntrack entry is
> >> -             * found instead.  In this case all init direction packets
> >> -             * would be reported as new related packets, while reply
> >> -             * direction packets would be reported as un-related
> >> -             * established packets.
> >> -             */
> >> -            h = nf_conntrack_find_get(net, zone, &tuple);
> >> -            if (h) {
> >> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> >> -
> >> -                    nf_ct_delete(ct, 0, 0);
> >> -                    nf_ct_put(ct);
> >> -            }
> >> -    }
> >> -
> >> -    return exp;
> >> -}
> >> -
> >>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
> >>  static enum ip_conntrack_info
> >>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> >> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
> >>                       const struct ovs_conntrack_info *info,
> >>                       struct sk_buff *skb)
> >>  {
> >> -    struct nf_conntrack_expect *exp;
> >> -
> >> -    /* If we pass an expected packet through nf_conntrack_in() the
> >> -     * expectation is typically removed, but the packet could still be
> >> -     * lost in upcall processing.  To prevent this from happening we
> >> -     * perform an explicit expectation lookup.  Expected connections are
> >> -     * always new, and will be passed through conntrack only when they are
> >> -     * committed, as it is OK to remove the expectation at that time.
> >> -     */
> >> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
> >> -    if (exp) {
> >> -            u8 state;
> >> -
> >> -            /* NOTE: New connections are NATted and Helped only when
> >> -             * committed, so we are not calling into NAT here.
> >> -             */
> >> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
> >> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
> >
> > Hi, Xin, others.
> >
> > Unfortunately, it seems like removal of this code broke the expected behavior.
> > OVS in userspace expects that SYN packet of a new related FTP connection will
> > get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
> > new.  This is a problem because we need to commit this connection with the label
> > and we do that for +new packets.  If we can't get +new packet we'll have to commit
> > every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
> > significant behavior change regardless.
>
> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
> but we can only get +rel+trk in cases with SNAT.  So, this may be just
> a generic conntrack bug somewhere.  At least the behavior seems fairly
> inconsistent.
>
In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
are set at the same time by ovs_ct_update_key() when this related ct
is not confirmed.

The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:

# make check-kernel
144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)

Any idea why? or do you know any other testcase that expects +new+rel+trk
but returns +rel+trk only?

Thanks.
> >
> > Could you, please, take a look?
> >
> > The issue can be reproduced by running check-kernel tests in OVS repo.
> > 'FTP SNAT orig tuple' tests fail 100% of the time.
> >
> > Best regards, Ilya Maximets.
>
Ilya Maximets June 18, 2024, 3:50 p.m. UTC | #5
On 6/18/24 16:58, Xin Long wrote:
> On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 6/17/24 22:10, Ilya Maximets wrote:
>>> On 7/16/23 23:09, Xin Long wrote:
>>>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
>>>> from the hashtable when lookup, we can simplify the exp processing code a
>>>> lot in openvswitch conntrack.
>>>>
>>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
>>>> ---
>>>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
>>>>  1 file changed, 10 insertions(+), 68 deletions(-)
>>>>
>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>>> index 331730fd3580..fa955e892210 100644
>>>> --- a/net/openvswitch/conntrack.c
>>>> +++ b/net/openvswitch/conntrack.c
>>>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
>>>>      return 0;
>>>>  }
>>>>
>>>> -static struct nf_conntrack_expect *
>>>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
>>>> -               u16 proto, const struct sk_buff *skb)
>>>> -{
>>>> -    struct nf_conntrack_tuple tuple;
>>>> -    struct nf_conntrack_expect *exp;
>>>> -
>>>> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
>>>> -            return NULL;
>>>> -
>>>> -    exp = __nf_ct_expect_find(net, zone, &tuple);
>>>> -    if (exp) {
>>>> -            struct nf_conntrack_tuple_hash *h;
>>>> -
>>>> -            /* Delete existing conntrack entry, if it clashes with the
>>>> -             * expectation.  This can happen since conntrack ALGs do not
>>>> -             * check for clashes between (new) expectations and existing
>>>> -             * conntrack entries.  nf_conntrack_in() will check the
>>>> -             * expectations only if a conntrack entry can not be found,
>>>> -             * which can lead to OVS finding the expectation (here) in the
>>>> -             * init direction, but which will not be removed by the
>>>> -             * nf_conntrack_in() call, if a matching conntrack entry is
>>>> -             * found instead.  In this case all init direction packets
>>>> -             * would be reported as new related packets, while reply
>>>> -             * direction packets would be reported as un-related
>>>> -             * established packets.
>>>> -             */
>>>> -            h = nf_conntrack_find_get(net, zone, &tuple);
>>>> -            if (h) {
>>>> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>>> -
>>>> -                    nf_ct_delete(ct, 0, 0);
>>>> -                    nf_ct_put(ct);
>>>> -            }
>>>> -    }
>>>> -
>>>> -    return exp;
>>>> -}
>>>> -
>>>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
>>>>  static enum ip_conntrack_info
>>>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>>>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>>>>                       const struct ovs_conntrack_info *info,
>>>>                       struct sk_buff *skb)
>>>>  {
>>>> -    struct nf_conntrack_expect *exp;
>>>> -
>>>> -    /* If we pass an expected packet through nf_conntrack_in() the
>>>> -     * expectation is typically removed, but the packet could still be
>>>> -     * lost in upcall processing.  To prevent this from happening we
>>>> -     * perform an explicit expectation lookup.  Expected connections are
>>>> -     * always new, and will be passed through conntrack only when they are
>>>> -     * committed, as it is OK to remove the expectation at that time.
>>>> -     */
>>>> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
>>>> -    if (exp) {
>>>> -            u8 state;
>>>> -
>>>> -            /* NOTE: New connections are NATted and Helped only when
>>>> -             * committed, so we are not calling into NAT here.
>>>> -             */
>>>> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
>>>> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
>>>
>>> Hi, Xin, others.
>>>
>>> Unfortunately, it seems like removal of this code broke the expected behavior.
>>> OVS in userspace expects that SYN packet of a new related FTP connection will
>>> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
>>> new.  This is a problem because we need to commit this connection with the label
>>> and we do that for +new packets.  If we can't get +new packet we'll have to commit
>>> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
>>> significant behavior change regardless.
>>
>> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
>> but we can only get +rel+trk in cases with SNAT.  So, this may be just
>> a generic conntrack bug somewhere.  At least the behavior seems fairly
>> inconsistent.
>>
> In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
> time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
> are set at the same time by ovs_ct_update_key() when this related ct
> is not confirmed.
> 
> The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:
> 
> # make check-kernel
> 144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)
> 
> Any idea why? or do you know any other testcase that expects +new+rel+trk
> but returns +rel+trk only?

You need to install lftp and pyftpdlib.  The pyftpdlib may only be available
via pip on some systems.

> 
> Thanks.
>>>
>>> Could you, please, take a look?
>>>
>>> The issue can be reproduced by running check-kernel tests in OVS repo.
>>> 'FTP SNAT orig tuple' tests fail 100% of the time.
>>>
>>> Best regards, Ilya Maximets.
>>
Ilya Maximets June 19, 2024, 12:58 p.m. UTC | #6
On 6/18/24 17:50, Ilya Maximets wrote:
> On 6/18/24 16:58, Xin Long wrote:
>> On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>>
>>> On 6/17/24 22:10, Ilya Maximets wrote:
>>>> On 7/16/23 23:09, Xin Long wrote:
>>>>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
>>>>> from the hashtable when lookup, we can simplify the exp processing code a
>>>>> lot in openvswitch conntrack.
>>>>>
>>>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
>>>>> ---
>>>>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
>>>>>  1 file changed, 10 insertions(+), 68 deletions(-)
>>>>>
>>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>>>> index 331730fd3580..fa955e892210 100644
>>>>> --- a/net/openvswitch/conntrack.c
>>>>> +++ b/net/openvswitch/conntrack.c
>>>>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
>>>>>      return 0;
>>>>>  }
>>>>>
>>>>> -static struct nf_conntrack_expect *
>>>>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
>>>>> -               u16 proto, const struct sk_buff *skb)
>>>>> -{
>>>>> -    struct nf_conntrack_tuple tuple;
>>>>> -    struct nf_conntrack_expect *exp;
>>>>> -
>>>>> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
>>>>> -            return NULL;
>>>>> -
>>>>> -    exp = __nf_ct_expect_find(net, zone, &tuple);
>>>>> -    if (exp) {
>>>>> -            struct nf_conntrack_tuple_hash *h;
>>>>> -
>>>>> -            /* Delete existing conntrack entry, if it clashes with the
>>>>> -             * expectation.  This can happen since conntrack ALGs do not
>>>>> -             * check for clashes between (new) expectations and existing
>>>>> -             * conntrack entries.  nf_conntrack_in() will check the
>>>>> -             * expectations only if a conntrack entry can not be found,
>>>>> -             * which can lead to OVS finding the expectation (here) in the
>>>>> -             * init direction, but which will not be removed by the
>>>>> -             * nf_conntrack_in() call, if a matching conntrack entry is
>>>>> -             * found instead.  In this case all init direction packets
>>>>> -             * would be reported as new related packets, while reply
>>>>> -             * direction packets would be reported as un-related
>>>>> -             * established packets.
>>>>> -             */
>>>>> -            h = nf_conntrack_find_get(net, zone, &tuple);
>>>>> -            if (h) {
>>>>> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>>>> -
>>>>> -                    nf_ct_delete(ct, 0, 0);
>>>>> -                    nf_ct_put(ct);
>>>>> -            }
>>>>> -    }
>>>>> -
>>>>> -    return exp;
>>>>> -}
>>>>> -
>>>>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
>>>>>  static enum ip_conntrack_info
>>>>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>>>>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>>>>>                       const struct ovs_conntrack_info *info,
>>>>>                       struct sk_buff *skb)
>>>>>  {
>>>>> -    struct nf_conntrack_expect *exp;
>>>>> -
>>>>> -    /* If we pass an expected packet through nf_conntrack_in() the
>>>>> -     * expectation is typically removed, but the packet could still be
>>>>> -     * lost in upcall processing.  To prevent this from happening we
>>>>> -     * perform an explicit expectation lookup.  Expected connections are
>>>>> -     * always new, and will be passed through conntrack only when they are
>>>>> -     * committed, as it is OK to remove the expectation at that time.
>>>>> -     */
>>>>> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
>>>>> -    if (exp) {
>>>>> -            u8 state;
>>>>> -
>>>>> -            /* NOTE: New connections are NATted and Helped only when
>>>>> -             * committed, so we are not calling into NAT here.
>>>>> -             */
>>>>> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
>>>>> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
>>>>
>>>> Hi, Xin, others.
>>>>
>>>> Unfortunately, it seems like removal of this code broke the expected behavior.
>>>> OVS in userspace expects that SYN packet of a new related FTP connection will
>>>> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
>>>> new.  This is a problem because we need to commit this connection with the label
>>>> and we do that for +new packets.  If we can't get +new packet we'll have to commit
>>>> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
>>>> significant behavior change regardless.
>>>
>>> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
>>> but we can only get +rel+trk in cases with SNAT.  So, this may be just
>>> a generic conntrack bug somewhere.  At least the behavior seems fairly
>>> inconsistent.
>>>
>> In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
>> time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
>> are set at the same time by ovs_ct_update_key() when this related ct
>> is not confirmed.
>>
>> The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:
>>
>> # make check-kernel
>> 144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)
>>
>> Any idea why? or do you know any other testcase that expects +new+rel+trk
>> but returns +rel+trk only?
> 
> You need to install lftp and pyftpdlib.  The pyftpdlib may only be available
> via pip on some systems.
> 
>>
>> Thanks.
>>>>
>>>> Could you, please, take a look?
>>>>
>>>> The issue can be reproduced by running check-kernel tests in OVS repo.
>>>> 'FTP SNAT orig tuple' tests fail 100% of the time.
>>>>
>>>> Best regards, Ilya Maximets.
>>>
> 

Hmm.  After further investigation, it seems that the issue is not about ct state,
but the ct label.  Before this commit we had information about both the original
tuple of the parent connection and the mark/label of the parent connection:

system@ovs-system: miss upcall:
recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
ct_zone(0x1),ct_mark(0),ct_label(0x4d2000000000000000000000001),
ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=50648,tp_dst=21),
eth(src=de:d9:f3:c8:5a:3a,dst=80:88:88:88:88:88),eth_type(0x0800),
ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
tcp(src=57027,dst=38153),tcp_flags(syn)

But after this change, we still have the original tuple of the parent connection,
but the label is no longer in the flow key:

system@ovs-system: miss upcall:
recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
ct_zone(0x1),ct_mark(0),ct_label(0),
ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=34668,tp_dst=21),
eth(src=66:eb:74:c6:79:24,dst=80:88:88:88:88:88),eth_type(0x0800),
ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
tcp(src=49529,dst=35459),tcp_flags(syn)

ct_state(0x25) == +new+rel+trk

Best regards, Ilya Maximets.
Xin Long June 19, 2024, 2:07 p.m. UTC | #7
On Wed, Jun 19, 2024 at 8:58 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 6/18/24 17:50, Ilya Maximets wrote:
> > On 6/18/24 16:58, Xin Long wrote:
> >> On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
> >>>
> >>> On 6/17/24 22:10, Ilya Maximets wrote:
> >>>> On 7/16/23 23:09, Xin Long wrote:
> >>>>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
> >>>>> from the hashtable when lookup, we can simplify the exp processing code a
> >>>>> lot in openvswitch conntrack.
> >>>>>
> >>>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> >>>>> ---
> >>>>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
> >>>>>  1 file changed, 10 insertions(+), 68 deletions(-)
> >>>>>
> >>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> >>>>> index 331730fd3580..fa955e892210 100644
> >>>>> --- a/net/openvswitch/conntrack.c
> >>>>> +++ b/net/openvswitch/conntrack.c
> >>>>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
> >>>>>      return 0;
> >>>>>  }
> >>>>>
> >>>>> -static struct nf_conntrack_expect *
> >>>>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
> >>>>> -               u16 proto, const struct sk_buff *skb)
> >>>>> -{
> >>>>> -    struct nf_conntrack_tuple tuple;
> >>>>> -    struct nf_conntrack_expect *exp;
> >>>>> -
> >>>>> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
> >>>>> -            return NULL;
> >>>>> -
> >>>>> -    exp = __nf_ct_expect_find(net, zone, &tuple);
> >>>>> -    if (exp) {
> >>>>> -            struct nf_conntrack_tuple_hash *h;
> >>>>> -
> >>>>> -            /* Delete existing conntrack entry, if it clashes with the
> >>>>> -             * expectation.  This can happen since conntrack ALGs do not
> >>>>> -             * check for clashes between (new) expectations and existing
> >>>>> -             * conntrack entries.  nf_conntrack_in() will check the
> >>>>> -             * expectations only if a conntrack entry can not be found,
> >>>>> -             * which can lead to OVS finding the expectation (here) in the
> >>>>> -             * init direction, but which will not be removed by the
> >>>>> -             * nf_conntrack_in() call, if a matching conntrack entry is
> >>>>> -             * found instead.  In this case all init direction packets
> >>>>> -             * would be reported as new related packets, while reply
> >>>>> -             * direction packets would be reported as un-related
> >>>>> -             * established packets.
> >>>>> -             */
> >>>>> -            h = nf_conntrack_find_get(net, zone, &tuple);
> >>>>> -            if (h) {
> >>>>> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> >>>>> -
> >>>>> -                    nf_ct_delete(ct, 0, 0);
> >>>>> -                    nf_ct_put(ct);
> >>>>> -            }
> >>>>> -    }
> >>>>> -
> >>>>> -    return exp;
> >>>>> -}
> >>>>> -
> >>>>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
> >>>>>  static enum ip_conntrack_info
> >>>>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> >>>>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
> >>>>>                       const struct ovs_conntrack_info *info,
> >>>>>                       struct sk_buff *skb)
> >>>>>  {
> >>>>> -    struct nf_conntrack_expect *exp;
> >>>>> -
> >>>>> -    /* If we pass an expected packet through nf_conntrack_in() the
> >>>>> -     * expectation is typically removed, but the packet could still be
> >>>>> -     * lost in upcall processing.  To prevent this from happening we
> >>>>> -     * perform an explicit expectation lookup.  Expected connections are
> >>>>> -     * always new, and will be passed through conntrack only when they are
> >>>>> -     * committed, as it is OK to remove the expectation at that time.
> >>>>> -     */
> >>>>> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
> >>>>> -    if (exp) {
> >>>>> -            u8 state;
> >>>>> -
> >>>>> -            /* NOTE: New connections are NATted and Helped only when
> >>>>> -             * committed, so we are not calling into NAT here.
> >>>>> -             */
> >>>>> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
> >>>>> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
> >>>>
> >>>> Hi, Xin, others.
> >>>>
> >>>> Unfortunately, it seems like removal of this code broke the expected behavior.
> >>>> OVS in userspace expects that SYN packet of a new related FTP connection will
> >>>> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
> >>>> new.  This is a problem because we need to commit this connection with the label
> >>>> and we do that for +new packets.  If we can't get +new packet we'll have to commit
> >>>> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
> >>>> significant behavior change regardless.
> >>>
> >>> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
> >>> but we can only get +rel+trk in cases with SNAT.  So, this may be just
> >>> a generic conntrack bug somewhere.  At least the behavior seems fairly
> >>> inconsistent.
> >>>
> >> In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
> >> time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
> >> are set at the same time by ovs_ct_update_key() when this related ct
> >> is not confirmed.
> >>
> >> The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:
> >>
> >> # make check-kernel
> >> 144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)
> >>
> >> Any idea why? or do you know any other testcase that expects +new+rel+trk
> >> but returns +rel+trk only?
> >
> > You need to install lftp and pyftpdlib.  The pyftpdlib may only be available
> > via pip on some systems.
> >
> >>
> >> Thanks.
> >>>>
> >>>> Could you, please, take a look?
> >>>>
> >>>> The issue can be reproduced by running check-kernel tests in OVS repo.
> >>>> 'FTP SNAT orig tuple' tests fail 100% of the time.
> >>>>
> >>>> Best regards, Ilya Maximets.
> >>>
> >
>
> Hmm.  After further investigation, it seems that the issue is not about ct state,
> but the ct label.  Before this commit we had information about both the original
> tuple of the parent connection and the mark/label of the parent connection:
>
Make senses. Now I can see the difference after this commit.
We will need a fix in __ovs_ct_update_key() to copy mark & label from
ct->master for exp ct.

@@ -196,8 +196,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
*key, u8 state,
 {
        key->ct_state = state;
        key->ct_zone = zone->id;
-       key->ct.mark = ovs_ct_get_mark(ct);
-       ovs_ct_get_labels(ct, &key->ct.labels);
+       key->ct.mark = 0;
+       memset(&key->ct.labels, 0, OVS_CT_LABELS_LEN);

        if (ct) {
                const struct nf_conntrack_tuple *orig;
@@ -205,6 +205,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
*key, u8 state,
                /* Use the master if we have one. */
                if (ct->master)
                        ct = ct->master;
+               key->ct.mark = ovs_ct_get_mark(ct);
+               ovs_ct_get_labels(ct, &key->ct.labels);
                orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

                /* IP version must match with the master connection. */

We may need to run some regression tests for such a change.

Thanks.

> system@ovs-system: miss upcall:
> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
> ct_zone(0x1),ct_mark(0),ct_label(0x4d2000000000000000000000001),
> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=50648,tp_dst=21),
> eth(src=de:d9:f3:c8:5a:3a,dst=80:88:88:88:88:88),eth_type(0x0800),
> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
> tcp(src=57027,dst=38153),tcp_flags(syn)
>
> But after this change, we still have the original tuple of the parent connection,
> but the label is no longer in the flow key:
>
> system@ovs-system: miss upcall:
> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
> ct_zone(0x1),ct_mark(0),ct_label(0),
> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=34668,tp_dst=21),
> eth(src=66:eb:74:c6:79:24,dst=80:88:88:88:88:88),eth_type(0x0800),
> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
> tcp(src=49529,dst=35459),tcp_flags(syn)
>
> ct_state(0x25) == +new+rel+trk
>
> Best regards, Ilya Maximets.
Ilya Maximets June 19, 2024, 5:30 p.m. UTC | #8
On 6/19/24 16:07, Xin Long wrote:
> On Wed, Jun 19, 2024 at 8:58 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 6/18/24 17:50, Ilya Maximets wrote:
>>> On 6/18/24 16:58, Xin Long wrote:
>>>> On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>>>>
>>>>> On 6/17/24 22:10, Ilya Maximets wrote:
>>>>>> On 7/16/23 23:09, Xin Long wrote:
>>>>>>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
>>>>>>> from the hashtable when lookup, we can simplify the exp processing code a
>>>>>>> lot in openvswitch conntrack.
>>>>>>>
>>>>>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
>>>>>>> ---
>>>>>>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
>>>>>>>  1 file changed, 10 insertions(+), 68 deletions(-)
>>>>>>>
>>>>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>>>>>>> index 331730fd3580..fa955e892210 100644
>>>>>>> --- a/net/openvswitch/conntrack.c
>>>>>>> +++ b/net/openvswitch/conntrack.c
>>>>>>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
>>>>>>>      return 0;
>>>>>>>  }
>>>>>>>
>>>>>>> -static struct nf_conntrack_expect *
>>>>>>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
>>>>>>> -               u16 proto, const struct sk_buff *skb)
>>>>>>> -{
>>>>>>> -    struct nf_conntrack_tuple tuple;
>>>>>>> -    struct nf_conntrack_expect *exp;
>>>>>>> -
>>>>>>> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
>>>>>>> -            return NULL;
>>>>>>> -
>>>>>>> -    exp = __nf_ct_expect_find(net, zone, &tuple);
>>>>>>> -    if (exp) {
>>>>>>> -            struct nf_conntrack_tuple_hash *h;
>>>>>>> -
>>>>>>> -            /* Delete existing conntrack entry, if it clashes with the
>>>>>>> -             * expectation.  This can happen since conntrack ALGs do not
>>>>>>> -             * check for clashes between (new) expectations and existing
>>>>>>> -             * conntrack entries.  nf_conntrack_in() will check the
>>>>>>> -             * expectations only if a conntrack entry can not be found,
>>>>>>> -             * which can lead to OVS finding the expectation (here) in the
>>>>>>> -             * init direction, but which will not be removed by the
>>>>>>> -             * nf_conntrack_in() call, if a matching conntrack entry is
>>>>>>> -             * found instead.  In this case all init direction packets
>>>>>>> -             * would be reported as new related packets, while reply
>>>>>>> -             * direction packets would be reported as un-related
>>>>>>> -             * established packets.
>>>>>>> -             */
>>>>>>> -            h = nf_conntrack_find_get(net, zone, &tuple);
>>>>>>> -            if (h) {
>>>>>>> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
>>>>>>> -
>>>>>>> -                    nf_ct_delete(ct, 0, 0);
>>>>>>> -                    nf_ct_put(ct);
>>>>>>> -            }
>>>>>>> -    }
>>>>>>> -
>>>>>>> -    return exp;
>>>>>>> -}
>>>>>>> -
>>>>>>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
>>>>>>>  static enum ip_conntrack_info
>>>>>>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
>>>>>>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>>>>>>>                       const struct ovs_conntrack_info *info,
>>>>>>>                       struct sk_buff *skb)
>>>>>>>  {
>>>>>>> -    struct nf_conntrack_expect *exp;
>>>>>>> -
>>>>>>> -    /* If we pass an expected packet through nf_conntrack_in() the
>>>>>>> -     * expectation is typically removed, but the packet could still be
>>>>>>> -     * lost in upcall processing.  To prevent this from happening we
>>>>>>> -     * perform an explicit expectation lookup.  Expected connections are
>>>>>>> -     * always new, and will be passed through conntrack only when they are
>>>>>>> -     * committed, as it is OK to remove the expectation at that time.
>>>>>>> -     */
>>>>>>> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
>>>>>>> -    if (exp) {
>>>>>>> -            u8 state;
>>>>>>> -
>>>>>>> -            /* NOTE: New connections are NATted and Helped only when
>>>>>>> -             * committed, so we are not calling into NAT here.
>>>>>>> -             */
>>>>>>> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
>>>>>>> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
>>>>>>
>>>>>> Hi, Xin, others.
>>>>>>
>>>>>> Unfortunately, it seems like removal of this code broke the expected behavior.
>>>>>> OVS in userspace expects that SYN packet of a new related FTP connection will
>>>>>> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
>>>>>> new.  This is a problem because we need to commit this connection with the label
>>>>>> and we do that for +new packets.  If we can't get +new packet we'll have to commit
>>>>>> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
>>>>>> significant behavior change regardless.
>>>>>
>>>>> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
>>>>> but we can only get +rel+trk in cases with SNAT.  So, this may be just
>>>>> a generic conntrack bug somewhere.  At least the behavior seems fairly
>>>>> inconsistent.
>>>>>
>>>> In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
>>>> time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
>>>> are set at the same time by ovs_ct_update_key() when this related ct
>>>> is not confirmed.
>>>>
>>>> The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:
>>>>
>>>> # make check-kernel
>>>> 144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)
>>>>
>>>> Any idea why? or do you know any other testcase that expects +new+rel+trk
>>>> but returns +rel+trk only?
>>>
>>> You need to install lftp and pyftpdlib.  The pyftpdlib may only be available
>>> via pip on some systems.
>>>
>>>>
>>>> Thanks.
>>>>>>
>>>>>> Could you, please, take a look?
>>>>>>
>>>>>> The issue can be reproduced by running check-kernel tests in OVS repo.
>>>>>> 'FTP SNAT orig tuple' tests fail 100% of the time.
>>>>>>
>>>>>> Best regards, Ilya Maximets.
>>>>>
>>>
>>
>> Hmm.  After further investigation, it seems that the issue is not about ct state,
>> but the ct label.  Before this commit we had information about both the original
>> tuple of the parent connection and the mark/label of the parent connection:
>>
> Make senses. Now I can see the difference after this commit.
> We will need a fix in __ovs_ct_update_key() to copy mark & label from
> ct->master for exp ct.
> 
> @@ -196,8 +196,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
> *key, u8 state,
>  {
>         key->ct_state = state;
>         key->ct_zone = zone->id;
> -       key->ct.mark = ovs_ct_get_mark(ct);
> -       ovs_ct_get_labels(ct, &key->ct.labels);
> +       key->ct.mark = 0;
> +       memset(&key->ct.labels, 0, OVS_CT_LABELS_LEN);
> 
>         if (ct) {
>                 const struct nf_conntrack_tuple *orig;
> @@ -205,6 +205,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
> *key, u8 state,
>                 /* Use the master if we have one. */
>                 if (ct->master)
>                         ct = ct->master;
> +               key->ct.mark = ovs_ct_get_mark(ct);
> +               ovs_ct_get_labels(ct, &key->ct.labels);
>                 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
> 
>                 /* IP version must match with the master connection. */
> 
> We may need to run some regression tests for such a change.

Thank, Xin!  This seems like a change in the right direction and it fixes
this particular test.  But, I guess, we should get mark/labels from the
master connection only if it is not yet confirmed.  Users may commit different
labels for the related connection.  This should be more in line with the
previous behavior.

What do you think?

Best regards, Ilya Maximets.

> 
> Thanks.
> 
>> system@ovs-system: miss upcall:
>> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
>> ct_zone(0x1),ct_mark(0),ct_label(0x4d2000000000000000000000001),
>> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=50648,tp_dst=21),
>> eth(src=de:d9:f3:c8:5a:3a,dst=80:88:88:88:88:88),eth_type(0x0800),
>> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
>> tcp(src=57027,dst=38153),tcp_flags(syn)
>>
>> But after this change, we still have the original tuple of the parent connection,
>> but the label is no longer in the flow key:
>>
>> system@ovs-system: miss upcall:
>> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
>> ct_zone(0x1),ct_mark(0),ct_label(0),
>> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=34668,tp_dst=21),
>> eth(src=66:eb:74:c6:79:24,dst=80:88:88:88:88:88),eth_type(0x0800),
>> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
>> tcp(src=49529,dst=35459),tcp_flags(syn)
>>
>> ct_state(0x25) == +new+rel+trk
>>
>> Best regards, Ilya Maximets.
Xin Long June 19, 2024, 8:11 p.m. UTC | #9
On Wed, Jun 19, 2024 at 1:30 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 6/19/24 16:07, Xin Long wrote:
> > On Wed, Jun 19, 2024 at 8:58 AM Ilya Maximets <i.maximets@ovn.org> wrote:
> >>
> >> On 6/18/24 17:50, Ilya Maximets wrote:
> >>> On 6/18/24 16:58, Xin Long wrote:
> >>>> On Tue, Jun 18, 2024 at 7:34 AM Ilya Maximets <i.maximets@ovn.org> wrote:
> >>>>>
> >>>>> On 6/17/24 22:10, Ilya Maximets wrote:
> >>>>>> On 7/16/23 23:09, Xin Long wrote:
> >>>>>>> By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed
> >>>>>>> from the hashtable when lookup, we can simplify the exp processing code a
> >>>>>>> lot in openvswitch conntrack.
> >>>>>>>
> >>>>>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> >>>>>>> ---
> >>>>>>>  net/openvswitch/conntrack.c | 78 +++++--------------------------------
> >>>>>>>  1 file changed, 10 insertions(+), 68 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> >>>>>>> index 331730fd3580..fa955e892210 100644
> >>>>>>> --- a/net/openvswitch/conntrack.c
> >>>>>>> +++ b/net/openvswitch/conntrack.c
> >>>>>>> @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
> >>>>>>>      return 0;
> >>>>>>>  }
> >>>>>>>
> >>>>>>> -static struct nf_conntrack_expect *
> >>>>>>> -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
> >>>>>>> -               u16 proto, const struct sk_buff *skb)
> >>>>>>> -{
> >>>>>>> -    struct nf_conntrack_tuple tuple;
> >>>>>>> -    struct nf_conntrack_expect *exp;
> >>>>>>> -
> >>>>>>> -    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
> >>>>>>> -            return NULL;
> >>>>>>> -
> >>>>>>> -    exp = __nf_ct_expect_find(net, zone, &tuple);
> >>>>>>> -    if (exp) {
> >>>>>>> -            struct nf_conntrack_tuple_hash *h;
> >>>>>>> -
> >>>>>>> -            /* Delete existing conntrack entry, if it clashes with the
> >>>>>>> -             * expectation.  This can happen since conntrack ALGs do not
> >>>>>>> -             * check for clashes between (new) expectations and existing
> >>>>>>> -             * conntrack entries.  nf_conntrack_in() will check the
> >>>>>>> -             * expectations only if a conntrack entry can not be found,
> >>>>>>> -             * which can lead to OVS finding the expectation (here) in the
> >>>>>>> -             * init direction, but which will not be removed by the
> >>>>>>> -             * nf_conntrack_in() call, if a matching conntrack entry is
> >>>>>>> -             * found instead.  In this case all init direction packets
> >>>>>>> -             * would be reported as new related packets, while reply
> >>>>>>> -             * direction packets would be reported as un-related
> >>>>>>> -             * established packets.
> >>>>>>> -             */
> >>>>>>> -            h = nf_conntrack_find_get(net, zone, &tuple);
> >>>>>>> -            if (h) {
> >>>>>>> -                    struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
> >>>>>>> -
> >>>>>>> -                    nf_ct_delete(ct, 0, 0);
> >>>>>>> -                    nf_ct_put(ct);
> >>>>>>> -            }
> >>>>>>> -    }
> >>>>>>> -
> >>>>>>> -    return exp;
> >>>>>>> -}
> >>>>>>> -
> >>>>>>>  /* This replicates logic from nf_conntrack_core.c that is not exported. */
> >>>>>>>  static enum ip_conntrack_info
> >>>>>>>  ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
> >>>>>>> @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
> >>>>>>>                       const struct ovs_conntrack_info *info,
> >>>>>>>                       struct sk_buff *skb)
> >>>>>>>  {
> >>>>>>> -    struct nf_conntrack_expect *exp;
> >>>>>>> -
> >>>>>>> -    /* If we pass an expected packet through nf_conntrack_in() the
> >>>>>>> -     * expectation is typically removed, but the packet could still be
> >>>>>>> -     * lost in upcall processing.  To prevent this from happening we
> >>>>>>> -     * perform an explicit expectation lookup.  Expected connections are
> >>>>>>> -     * always new, and will be passed through conntrack only when they are
> >>>>>>> -     * committed, as it is OK to remove the expectation at that time.
> >>>>>>> -     */
> >>>>>>> -    exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
> >>>>>>> -    if (exp) {
> >>>>>>> -            u8 state;
> >>>>>>> -
> >>>>>>> -            /* NOTE: New connections are NATted and Helped only when
> >>>>>>> -             * committed, so we are not calling into NAT here.
> >>>>>>> -             */
> >>>>>>> -            state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
> >>>>>>> -            __ovs_ct_update_key(key, state, &info->zone, exp->master);
> >>>>>>
> >>>>>> Hi, Xin, others.
> >>>>>>
> >>>>>> Unfortunately, it seems like removal of this code broke the expected behavior.
> >>>>>> OVS in userspace expects that SYN packet of a new related FTP connection will
> >>>>>> get +new+rel+trk flags, but after this patch we're only getting +rel+trk and not
> >>>>>> new.  This is a problem because we need to commit this connection with the label
> >>>>>> and we do that for +new packets.  If we can't get +new packet we'll have to commit
> >>>>>> every single +rel+trk packet, which doesn't make a lot of sense.  And it's a
> >>>>>> significant behavior change regardless.
> >>>>>
> >>>>> Interestingly enough I see +new+rel+trk packets in cases without SNAT,
> >>>>> but we can only get +rel+trk in cases with SNAT.  So, this may be just
> >>>>> a generic conntrack bug somewhere.  At least the behavior seems fairly
> >>>>> inconsistent.
> >>>>>
> >>>> In nf_conntrack, IP_CT_RELATED and IP_CT_NEW do not exist at the same
> >>>> time. With this patch, we expect OVS_CS_F_RELATED and OVS_CS_F_NEW
> >>>> are set at the same time by ovs_ct_update_key() when this related ct
> >>>> is not confirmed.
> >>>>
> >>>> The check-kernel test of "FTP SNAT orig tuple" skiped on my env somehow:
> >>>>
> >>>> # make check-kernel
> >>>> 144: conntrack - FTP SNAT orig tuple   skipped (system-traffic.at:7295)
> >>>>
> >>>> Any idea why? or do you know any other testcase that expects +new+rel+trk
> >>>> but returns +rel+trk only?
> >>>
> >>> You need to install lftp and pyftpdlib.  The pyftpdlib may only be available
> >>> via pip on some systems.
> >>>
> >>>>
> >>>> Thanks.
> >>>>>>
> >>>>>> Could you, please, take a look?
> >>>>>>
> >>>>>> The issue can be reproduced by running check-kernel tests in OVS repo.
> >>>>>> 'FTP SNAT orig tuple' tests fail 100% of the time.
> >>>>>>
> >>>>>> Best regards, Ilya Maximets.
> >>>>>
> >>>
> >>
> >> Hmm.  After further investigation, it seems that the issue is not about ct state,
> >> but the ct label.  Before this commit we had information about both the original
> >> tuple of the parent connection and the mark/label of the parent connection:
> >>
> > Make senses. Now I can see the difference after this commit.
> > We will need a fix in __ovs_ct_update_key() to copy mark & label from
> > ct->master for exp ct.
> >
> > @@ -196,8 +196,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
> > *key, u8 state,
> >  {
> >         key->ct_state = state;
> >         key->ct_zone = zone->id;
> > -       key->ct.mark = ovs_ct_get_mark(ct);
> > -       ovs_ct_get_labels(ct, &key->ct.labels);
> > +       key->ct.mark = 0;
> > +       memset(&key->ct.labels, 0, OVS_CT_LABELS_LEN);
> >
> >         if (ct) {
> >                 const struct nf_conntrack_tuple *orig;
> > @@ -205,6 +205,8 @@ static void __ovs_ct_update_key(struct sw_flow_key
> > *key, u8 state,
> >                 /* Use the master if we have one. */
> >                 if (ct->master)
> >                         ct = ct->master;
> > +               key->ct.mark = ovs_ct_get_mark(ct);
> > +               ovs_ct_get_labels(ct, &key->ct.labels);
> >                 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
> >
> >                 /* IP version must match with the master connection. */
> >
> > We may need to run some regression tests for such a change.
>
> Thank, Xin!  This seems like a change in the right direction and it fixes
> this particular test.  But, I guess, we should get mark/labels from the
> master connection only if it is not yet confirmed.  Users may commit different
> labels for the related connection.  This should be more in line with the
> previous behavior.
>
> What do you think?
>
You're right.
Also, I noticed the related ct->mark is set to master ct->mark in
init_conntrack() as well as secmark when creating the related ct.

Hi, Florian,

Any reason why the labels are not set to master ct's in there?

Thanks.

>
> >
> > Thanks.
> >
> >> system@ovs-system: miss upcall:
> >> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
> >> ct_zone(0x1),ct_mark(0),ct_label(0x4d2000000000000000000000001),
> >> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=50648,tp_dst=21),
> >> eth(src=de:d9:f3:c8:5a:3a,dst=80:88:88:88:88:88),eth_type(0x0800),
> >> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
> >> tcp(src=57027,dst=38153),tcp_flags(syn)
> >>
> >> But after this change, we still have the original tuple of the parent connection,
> >> but the label is no longer in the flow key:
> >>
> >> system@ovs-system: miss upcall:
> >> recirc_id(0x2),dp_hash(0),skb_priority(0),in_port(3),skb_mark(0),ct_state(0x25),
> >> ct_zone(0x1),ct_mark(0),ct_label(0),
> >> ct_tuple4(src=10.1.1.1,dst=10.1.1.2,proto=6,tp_src=34668,tp_dst=21),
> >> eth(src=66:eb:74:c6:79:24,dst=80:88:88:88:88:88),eth_type(0x0800),
> >> ipv4(src=10.1.1.2,dst=10.1.1.9,proto=6,tos=0,ttl=64,frag=no),
> >> tcp(src=49529,dst=35459),tcp_flags(syn)
> >>
> >> ct_state(0x25) == +new+rel+trk
> >>
> >> Best regards, Ilya Maximets.
>
Florian Westphal June 19, 2024, 8:19 p.m. UTC | #10
Xin Long <lucien.xin@gmail.com> wrote:
> > master connection only if it is not yet confirmed.  Users may commit different
> > labels for the related connection.  This should be more in line with the
> > previous behavior.
> >
> > What do you think?
> >
> You're right.
> Also, I noticed the related ct->mark is set to master ct->mark in
> init_conntrack() as well as secmark when creating the related ct.
> 
> Hi, Florian,
> 
> Any reason why the labels are not set to master ct's in there?

The intent was to have lables be set only via ctnetlink (userspace)
or ruleset.

The original use case was for tagging connections based on
observed behaviour/properties at a later time, not at start of flow.
Xin Long June 19, 2024, 8:50 p.m. UTC | #11
On Wed, Jun 19, 2024 at 4:20 PM Florian Westphal <fw@strlen.de> wrote:
>
> Xin Long <lucien.xin@gmail.com> wrote:
> > > master connection only if it is not yet confirmed.  Users may commit different
> > > labels for the related connection.  This should be more in line with the
> > > previous behavior.
> > >
> > > What do you think?
> > >
> > You're right.
> > Also, I noticed the related ct->mark is set to master ct->mark in
> > init_conntrack() as well as secmark when creating the related ct.
> >
> > Hi, Florian,
> >
> > Any reason why the labels are not set to master ct's in there?
>
> The intent was to have lables be set only via ctnetlink (userspace)
> or ruleset.
>
> The original use case was for tagging connections based on
> observed behaviour/properties at a later time, not at start of flow.
Got it, I will fix this in ovs.

Thanks.
Florian Westphal June 19, 2024, 9:20 p.m. UTC | #12
Xin Long <lucien.xin@gmail.com> wrote:
> Got it, I will fix this in ovs.

Thanks!

I don't want to throw more work at you, but since you are
already working on ovs+conntrack:

ovs_ct_init() is bad, as this runs unconditionally.

modprobe openvswitch -> conntrack becomes active in all
existing and future namespaces.

Conntrack is slow, we should avoid this behaviour.

ovs_ct_limit_init() should be called only when the feature is
configured (the problematic call is nf_conncount_init, as that
turns on connection tracking, the kmalloc etc is fine).

Likewise, nf_connlabels_get() should only be called when
labels are added/configured, not at the start.

I resolved this for tc in 70f06c115bcc but it seems i never
got around to fix it for ovs.
Xin Long June 19, 2024, 10:10 p.m. UTC | #13
On Wed, Jun 19, 2024 at 5:20 PM Florian Westphal <fw@strlen.de> wrote:
>
> Xin Long <lucien.xin@gmail.com> wrote:
> > Got it, I will fix this in ovs.
>
> Thanks!
>
> I don't want to throw more work at you, but since you are
> already working on ovs+conntrack:
>
> ovs_ct_init() is bad, as this runs unconditionally.
>
> modprobe openvswitch -> conntrack becomes active in all
> existing and future namespaces.
>
> Conntrack is slow, we should avoid this behaviour.
>
> ovs_ct_limit_init() should be called only when the feature is
> configured (the problematic call is nf_conncount_init, as that
> turns on connection tracking, the kmalloc etc is fine).
understand.

>
> Likewise, nf_connlabels_get() should only be called when
> labels are added/configured, not at the start.
>
> I resolved this for tc in 70f06c115bcc but it seems i never
> got around to fix it for ovs.
I will take a look.

Thanks.
Xin Long July 8, 2024, 10:03 p.m. UTC | #14
On Wed, Jun 19, 2024 at 6:10 PM Xin Long <lucien.xin@gmail.com> wrote:
>
> On Wed, Jun 19, 2024 at 5:20 PM Florian Westphal <fw@strlen.de> wrote:
> >
> > Xin Long <lucien.xin@gmail.com> wrote:
> > > Got it, I will fix this in ovs.
> >
> > Thanks!
> >
> > I don't want to throw more work at you, but since you are
> > already working on ovs+conntrack:
> >
> > ovs_ct_init() is bad, as this runs unconditionally.
> >
> > modprobe openvswitch -> conntrack becomes active in all
> > existing and future namespaces.
> >
> > Conntrack is slow, we should avoid this behaviour.
> >
> > ovs_ct_limit_init() should be called only when the feature is
> > configured (the problematic call is nf_conncount_init, as that
> > turns on connection tracking, the kmalloc etc is fine).
> understand.
>
> >
> > Likewise, nf_connlabels_get() should only be called when
> > labels are added/configured, not at the start.
> >
> > I resolved this for tc in 70f06c115bcc but it seems i never
> > got around to fix it for ovs.
Hi, Florian,

I noticed the warning in nf_ct_ext_add() while I'm making this change:

   WARN_ON(nf_ct_is_confirmed(ct));

It can be triggered by these ovs flows from ovs selftests:

  table=0,priority=30,in_port=1,ip,nw_dst=172.1.1.2,actions=ct(commit,nat(dst=10.1.1.2:80),table=1)
  table=1,ip,actions=ct(commit,nat(src=10.1.1.240),exec(set_field:0xac->ct_mark,set_field:0xac->ct_label),table=2)

The 1st flow creates the ct and commits/confirms it, then the 2nd flow is
setting ct_label on a confirmed ct. With this change, the connlabels ext
is not yet allocated at the time, and then the warning is triggered when
allocating it in nf_ct_ext_add().

tc act_ct action doesn't have this issue, as it returns an error if the
connlabels is not found in tcf_ct_act_set_labels(), instead of allocating it.

I can avoid this warning by not allocating ext for commit ct in ovs:

@@ -426,7 +426,7 @@ static int ovs_ct_set_labels(struct nf_conn *ct,
struct sw_flow_key *key,
        struct nf_conn_labels *cl;
        int err;

-       cl = ovs_ct_get_conn_labels(ct);
+       cl = nf_ct_labels_find(ct);
        if (!cl)
                return -ENOSPC;

However, the test case would fail, although the failure can be worked around
by setting ct_label in the 1st rule:

  table=0,priority=30,in_port=1,ip,nw_dst=172.1.1.2,actions=ct(commit,nat(dst=10.1.1.2:80),exec(set_field:0x01->ct_label),table=1)

So I'm worrying our change may break some existing OVS user cases.
Florian Westphal July 8, 2024, 10:38 p.m. UTC | #15
Xin Long <lucien.xin@gmail.com> wrote:
> I can avoid this warning by not allocating ext for commit ct in ovs:
> 
> @@ -426,7 +426,7 @@ static int ovs_ct_set_labels(struct nf_conn *ct,
> struct sw_flow_key *key,
>         struct nf_conn_labels *cl;
>         int err;
> 
> -       cl = ovs_ct_get_conn_labels(ct);
> +       cl = nf_ct_labels_find(ct);
>         if (!cl)
>                 return -ENOSPC;
> 
> However, the test case would fail, although the failure can be worked around
> by setting ct_label in the 1st rule:
> 
>   table=0,priority=30,in_port=1,ip,nw_dst=172.1.1.2,actions=ct(commit,nat(dst=10.1.1.2:80),exec(set_field:0x01->ct_label),table=1)
> 
> So I'm worrying our change may break some existing OVS user cases.

Then ovs_ct_limit_init() and nf_connlabels_get() need to be called
once on the first conntrack operatation, regardless if labels are asked
for or not.

Not nice but still better than current state.

ovs_ct_execute() perhaps?
Xin Long July 9, 2024, 1:49 a.m. UTC | #16
On Mon, Jul 8, 2024 at 6:38 PM Florian Westphal <fw@strlen.de> wrote:
>
> Xin Long <lucien.xin@gmail.com> wrote:
> > I can avoid this warning by not allocating ext for commit ct in ovs:
> >
> > @@ -426,7 +426,7 @@ static int ovs_ct_set_labels(struct nf_conn *ct,
> > struct sw_flow_key *key,
> >         struct nf_conn_labels *cl;
> >         int err;
> >
> > -       cl = ovs_ct_get_conn_labels(ct);
> > +       cl = nf_ct_labels_find(ct);
> >         if (!cl)
> >                 return -ENOSPC;
ovs_ct_get_conn_labels() must be replaced with nf_ct_labels_find() in here
anyway, thinking that the confirmed ct without labels was created in other
places (not by OVS conntrack), the warning may still be triggered when
trying to set labels in OVS after.

> >
> > However, the test case would fail, although the failure can be worked around
> > by setting ct_label in the 1st rule:
> >
> >   table=0,priority=30,in_port=1,ip,nw_dst=172.1.1.2,actions=ct(commit,nat(dst=10.1.1.2:80),exec(set_field:0x01->ct_label),table=1)
> >
> > So I'm worrying our change may break some existing OVS user cases.
>
> Then ovs_ct_limit_init() and nf_connlabels_get() need to be called
> once on the first conntrack operatation, regardless if labels are asked
> for or not.
>
> Not nice but still better than current state.
Right, not nice, it undermines the bits check against NF_CT_LABELS_MAX_SIZE.

>
> ovs_ct_execute() perhaps?
ovs_ct_execute() is in the hot path, and a better place would be
ovs_ct_copy_action() where the ct for an ovs flow is added.
Florian Westphal July 9, 2024, 5:49 a.m. UTC | #17
Xin Long <lucien.xin@gmail.com> wrote:
> > Xin Long <lucien.xin@gmail.com> wrote:
> > > I can avoid this warning by not allocating ext for commit ct in ovs:
> > >
> > > @@ -426,7 +426,7 @@ static int ovs_ct_set_labels(struct nf_conn *ct,
> > > struct sw_flow_key *key,
> > >         struct nf_conn_labels *cl;
> > >         int err;
> > >
> > > -       cl = ovs_ct_get_conn_labels(ct);
> > > +       cl = nf_ct_labels_find(ct);
> > >         if (!cl)
> > >                 return -ENOSPC;
> ovs_ct_get_conn_labels() must be replaced with nf_ct_labels_find() in here
> anyway, thinking that the confirmed ct without labels was created in other
> places (not by OVS conntrack), the warning may still be triggered when
> trying to set labels in OVS after.

Right.

> > Then ovs_ct_limit_init() and nf_connlabels_get() need to be called
> > once on the first conntrack operatation, regardless if labels are asked
> > for or not.
> >
> > Not nice but still better than current state.
> Right, not nice, it undermines the bits check against NF_CT_LABELS_MAX_SIZE.

Yes, but OTOH this bit check isn't used anymore, it comes from days when
label area was dynamically sized, today is hardcoded to 128 anyway.
diff mbox series

Patch

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 331730fd3580..fa955e892210 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -455,45 +455,6 @@  static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
 	return 0;
 }
 
-static struct nf_conntrack_expect *
-ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
-		   u16 proto, const struct sk_buff *skb)
-{
-	struct nf_conntrack_tuple tuple;
-	struct nf_conntrack_expect *exp;
-
-	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
-		return NULL;
-
-	exp = __nf_ct_expect_find(net, zone, &tuple);
-	if (exp) {
-		struct nf_conntrack_tuple_hash *h;
-
-		/* Delete existing conntrack entry, if it clashes with the
-		 * expectation.  This can happen since conntrack ALGs do not
-		 * check for clashes between (new) expectations and existing
-		 * conntrack entries.  nf_conntrack_in() will check the
-		 * expectations only if a conntrack entry can not be found,
-		 * which can lead to OVS finding the expectation (here) in the
-		 * init direction, but which will not be removed by the
-		 * nf_conntrack_in() call, if a matching conntrack entry is
-		 * found instead.  In this case all init direction packets
-		 * would be reported as new related packets, while reply
-		 * direction packets would be reported as un-related
-		 * established packets.
-		 */
-		h = nf_conntrack_find_get(net, zone, &tuple);
-		if (h) {
-			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
-			nf_ct_delete(ct, 0, 0);
-			nf_ct_put(ct);
-		}
-	}
-
-	return exp;
-}
-
 /* This replicates logic from nf_conntrack_core.c that is not exported. */
 static enum ip_conntrack_info
 ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
@@ -852,36 +813,16 @@  static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
 			 struct sk_buff *skb)
 {
-	struct nf_conntrack_expect *exp;
-
-	/* If we pass an expected packet through nf_conntrack_in() the
-	 * expectation is typically removed, but the packet could still be
-	 * lost in upcall processing.  To prevent this from happening we
-	 * perform an explicit expectation lookup.  Expected connections are
-	 * always new, and will be passed through conntrack only when they are
-	 * committed, as it is OK to remove the expectation at that time.
-	 */
-	exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
-	if (exp) {
-		u8 state;
-
-		/* NOTE: New connections are NATted and Helped only when
-		 * committed, so we are not calling into NAT here.
-		 */
-		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
-		__ovs_ct_update_key(key, state, &info->zone, exp->master);
-	} else {
-		struct nf_conn *ct;
-		int err;
+	struct nf_conn *ct;
+	int err;
 
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
+	err = __ovs_ct_lookup(net, key, info, skb);
+	if (err)
+		return err;
 
-		ct = (struct nf_conn *)skb_nfct(skb);
-		if (ct)
-			nf_ct_deliver_cached_events(ct);
-	}
+	ct = (struct nf_conn *)skb_nfct(skb);
+	if (ct)
+		nf_ct_deliver_cached_events(ct);
 
 	return 0;
 }
@@ -1460,7 +1401,8 @@  int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
 	if (err)
 		goto err_free_ct;
 
-	__set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
+	if (ct_info.commit)
+		__set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
 	return 0;
 err_free_ct:
 	__ovs_ct_free_action(&ct_info);