diff mbox series

[V12,01/19] filter-rewriter: Add TCP state machine and fix memory leak in connection_track_table

Message ID 20180903043900.28592-2-zhangckid@gmail.com (mailing list archive)
State New, archived
Headers show
Series COLO: integrate colo frame with block replication and COLO proxy | expand

Commit Message

Zhang Chen Sept. 3, 2018, 4:38 a.m. UTC
We add an almost complete TCP state machine to filter-rewriter; the exceptions
are TCPS_LISTEN and some simplifications in the VM active-close FIN states.

After a network connection is closed, we did not clear its related resources
in connection_track_table, which leads to a memory leak.

Let's track the state of each network connection; once it is closed, its
related resources are cleaned up.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
---
 net/colo.c            |   2 +-
 net/colo.h            |   9 ++--
 net/filter-rewriter.c | 105 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 100 insertions(+), 16 deletions(-)

Comments

Jason Wang Sept. 12, 2018, 7:36 a.m. UTC | #1
On 2018年09月03日 12:38, Zhang Chen wrote:
> We add almost full TCP state machine in filter-rewriter, except
> TCPS_LISTEN and some simplify in VM active close FIN states.

Need to explain why it can be simplified.

>
> After a net connection is closed, we didn't clear its releated resources
> in connection_track_table, which will lead to memory leak.
>
> Let't track the state of net connection, if it is closed, its related
> resources will be cleared up.
>
> Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
> Signed-off-by: Zhang Chen <zhangckid@gmail.com>
> Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> ---
>   net/colo.c            |   2 +-
>   net/colo.h            |   9 ++--
>   net/filter-rewriter.c | 105 ++++++++++++++++++++++++++++++++++++++----
>   3 files changed, 100 insertions(+), 16 deletions(-)
>
> diff --git a/net/colo.c b/net/colo.c
> index 6dda4ed66e..97c8fc928f 100644
> --- a/net/colo.c
> +++ b/net/colo.c
> @@ -137,7 +137,7 @@ Connection *connection_new(ConnectionKey *key)
>       conn->ip_proto = key->ip_proto;
>       conn->processing = false;
>       conn->offset = 0;
> -    conn->syn_flag = 0;
> +    conn->tcp_state = TCPS_CLOSED;
>       conn->pack = 0;
>       conn->sack = 0;
>       g_queue_init(&conn->primary_list);
> diff --git a/net/colo.h b/net/colo.h
> index da6c36dcf7..0277e0e9ba 100644
> --- a/net/colo.h
> +++ b/net/colo.h
> @@ -18,6 +18,7 @@
>   #include "slirp/slirp.h"
>   #include "qemu/jhash.h"
>   #include "qemu/timer.h"
> +#include "slirp/tcp.h"
>   
>   #define HASHTABLE_MAX_SIZE 16384
>   
> @@ -81,11 +82,9 @@ typedef struct Connection {
>       uint32_t sack;
>       /* offset = secondary_seq - primary_seq */
>       tcp_seq  offset;
> -    /*
> -     * we use this flag update offset func
> -     * run once in independent tcp connection
> -     */
> -    int syn_flag;
> +
> +    int tcp_state; /* TCP FSM state */
> +    tcp_seq fin_ack_seq; /* the seq of 'fin=1,ack=1' */
>   } Connection;
>   
>   uint32_t connection_key_hash(const void *opaque);
> diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
> index f584e4eba4..f18a71bf2e 100644
> --- a/net/filter-rewriter.c
> +++ b/net/filter-rewriter.c
> @@ -59,9 +59,9 @@ static int is_tcp_packet(Packet *pkt)
>   }
>   
>   /* handle tcp packet from primary guest */
> -static int handle_primary_tcp_pkt(NetFilterState *nf,
> +static int handle_primary_tcp_pkt(RewriterState *rf,
>                                     Connection *conn,
> -                                  Packet *pkt)
> +                                  Packet *pkt, ConnectionKey *key)
>   {
>       struct tcphdr *tcp_pkt;
>   
> @@ -74,23 +74,28 @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
>           trace_colo_filter_rewriter_conn_offset(conn->offset);
>       }
>   
> +    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
> +        conn->tcp_state == TCPS_SYN_SENT) {
> +        conn->tcp_state = TCPS_ESTABLISHED;
> +    }
> +
>       if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
>           /*
>            * we use this flag update offset func
>            * run once in independent tcp connection
>            */
> -        conn->syn_flag = 1;
> +        conn->tcp_state = TCPS_SYN_RECEIVED;
>       }
>   
>       if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
> -        if (conn->syn_flag) {
> +        if (conn->tcp_state == TCPS_SYN_RECEIVED) {
>               /*
>                * offset = secondary_seq - primary seq
>                * ack packet sent by guest from primary node,
>                * so we use th_ack - 1 get primary_seq
>                */
>               conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
> -            conn->syn_flag = 0;
> +            conn->tcp_state = TCPS_ESTABLISHED;
>           }
>           if (conn->offset) {
>               /* handle packets to the secondary from the primary */
> @@ -99,15 +104,63 @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
>               net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
>                                      pkt->size - pkt->vnet_hdr_len);
>           }
> +        /*
> +         * Case 1:
> +         * Step 3:

Better to have one gathered comment instead of several scattered ones. This
will make the code much easier to read.

> +         * The *server* side of this connect is VM, *client* tries to close
> +         * the connection.

Better to use "passive close" here since:

- A server can also actively close the connection
- TCP allows distributed computing, which does not differ much between
server and client.

> +         *
> +         * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
> +         * packet from server side. From this point, we can ensure that there
> +         * will be no packets in the connection, except that, some errors
> +         * happen between the path of 'filter object' and vNIC, if this rare
> +         * case really happen, we can still create a new connection,
> +         * So it is safe to remove the connection from connection_track_table.
> +         *
> +         */
> +        if ((conn->tcp_state == TCPS_LAST_ACK) &&
> +            (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
> +            conn->tcp_state = TCPS_CLOSED;
> +            g_hash_table_remove(rf->connection_track_table, key);
> +        }
> +    }
> +
> +    if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
> +        /*
> +         * Case 1:
> +         * Step 1:
> +         * The *server* side of this connect is VM, *client* tries to close
> +         * the connection. We will into CLOSE_WAIT status.
> +         */
> +        if (conn->tcp_state == TCPS_ESTABLISHED) {
> +            conn->tcp_state = TCPS_CLOSE_WAIT;
> +        }
> +
> +        /*
> +         * Case 2:
> +         * Step 2:
> +         * The *server* side of this connect is VM, *server* tries to close
> +         * the connection. We will into CLOSE_WAIT status.
> +         * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and CLOSING status.
> +         */

Better explain why you can do the simplification. E.g what happens if 
packets

> +        if (conn->tcp_state == TCPS_FIN_WAIT_1) {
> +            conn->tcp_state = TCPS_TIME_WAIT;
> +            /*
> +             * For simplify implementation, we needn't wait 2MSL time
> +             * in filter rewriter.
> +             */
> +            conn->tcp_state = TCPS_CLOSED;
> +            g_hash_table_remove(rf->connection_track_table, key);
> +        }
>       }
>   
>       return 0;
>   }
>   
>   /* handle tcp packet from secondary guest */
> -static int handle_secondary_tcp_pkt(NetFilterState *nf,
> +static int handle_secondary_tcp_pkt(RewriterState *rf,
>                                       Connection *conn,
> -                                    Packet *pkt)
> +                                    Packet *pkt, ConnectionKey *key)
>   {
>       struct tcphdr *tcp_pkt;
>   
> @@ -121,7 +174,8 @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
>           trace_colo_filter_rewriter_conn_offset(conn->offset);
>       }
>   
> -    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
> +    if (conn->tcp_state == TCPS_SYN_RECEIVED &&
> +        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
>           /*
>            * save offset = secondary_seq and then
>            * in handle_primary_tcp_pkt make offset
> @@ -130,6 +184,12 @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
>           conn->offset = ntohl(tcp_pkt->th_seq);
>       }
>   
> +    /* VM active connect */
> +    if (conn->tcp_state == TCPS_CLOSED &&
> +        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
> +        conn->tcp_state = TCPS_SYN_SENT;
> +    }
> +
>       if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
>           /* Only need to adjust seq while offset is Non-zero */
>           if (conn->offset) {
> @@ -140,6 +200,31 @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
>                                      pkt->size - pkt->vnet_hdr_len);
>           }
>       }
> +    /*
> +     * Case 1:
> +     * Step 2:
> +     * The *server* side of this connect is VM, *client* tries to close
> +     * the connection. In this step we will into LAST_ACK status.
> +     *
> +     * We got 'fin=1, ack=1' packet from server side, we need to
> +     * record the seq of 'fin=1, ack=1' packet.
> +     */
> +    if (conn->tcp_state == TCPS_CLOSE_WAIT &&
> +        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
> +        conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
> +        conn->tcp_state = TCPS_LAST_ACK;
> +    }
> +
> +    /*
> +     * Case 2:
> +     * Step 1:
> +     * The *server* side of this connect is VM, *server* tries to close
> +     * the connection.
> +     */
> +    if (conn->tcp_state == TCPS_ESTABLISHED &&
> +        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
> +        conn->tcp_state = TCPS_FIN_WAIT_1;
> +    }
>   
>       return 0;
>   }
> @@ -190,7 +275,7 @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
>   
>           if (sender == nf->netdev) {
>               /* NET_FILTER_DIRECTION_TX */
> -            if (!handle_primary_tcp_pkt(nf, conn, pkt)) {
> +            if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
>                   qemu_net_queue_send(s->incoming_queue, sender, 0,
>                   (const uint8_t *)pkt->data, pkt->size, NULL);
>                   packet_destroy(pkt, NULL);
> @@ -203,7 +288,7 @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
>               }
>           } else {
>               /* NET_FILTER_DIRECTION_RX */
> -            if (!handle_secondary_tcp_pkt(nf, conn, pkt)) {
> +            if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
>                   qemu_net_queue_send(s->incoming_queue, sender, 0,
>                   (const uint8_t *)pkt->data, pkt->size, NULL);
>                   packet_destroy(pkt, NULL);
Zhang Chen Sept. 13, 2018, 3:12 a.m. UTC | #2
On Wed, Sep 12, 2018 at 3:36 PM Jason Wang <jasowang@redhat.com> wrote:

>
>
> On 2018年09月03日 12:38, Zhang Chen wrote:
> > We add almost full TCP state machine in filter-rewriter, except
> > TCPS_LISTEN and some simplify in VM active close FIN states.
>
> Need to explain why it can be simplified.
>

OK, I will add a note here like this: "We can make this simplification because
the guest kernel tracks the TCP state and waits for the 2MSL time itself; if the
client resends the FIN packet, the guest will resend the last ACK too, so we
needn't wait for the 2MSL time in filter-rewriter."


>
> >
> > After a net connection is closed, we didn't clear its releated resources
> > in connection_track_table, which will lead to memory leak.
> >
> > Let't track the state of net connection, if it is closed, its related
> > resources will be cleared up.
> >
> > Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
> > Signed-off-by: Zhang Chen <zhangckid@gmail.com>
> > Signed-off-by: Zhang Chen <chen.zhang@intel.com>
> > ---
> >   net/colo.c            |   2 +-
> >   net/colo.h            |   9 ++--
> >   net/filter-rewriter.c | 105 ++++++++++++++++++++++++++++++++++++++----
> >   3 files changed, 100 insertions(+), 16 deletions(-)
> >
> > diff --git a/net/colo.c b/net/colo.c
> > index 6dda4ed66e..97c8fc928f 100644
> > --- a/net/colo.c
> > +++ b/net/colo.c
> > @@ -137,7 +137,7 @@ Connection *connection_new(ConnectionKey *key)
> >       conn->ip_proto = key->ip_proto;
> >       conn->processing = false;
> >       conn->offset = 0;
> > -    conn->syn_flag = 0;
> > +    conn->tcp_state = TCPS_CLOSED;
> >       conn->pack = 0;
> >       conn->sack = 0;
> >       g_queue_init(&conn->primary_list);
> > diff --git a/net/colo.h b/net/colo.h
> > index da6c36dcf7..0277e0e9ba 100644
> > --- a/net/colo.h
> > +++ b/net/colo.h
> > @@ -18,6 +18,7 @@
> >   #include "slirp/slirp.h"
> >   #include "qemu/jhash.h"
> >   #include "qemu/timer.h"
> > +#include "slirp/tcp.h"
> >
> >   #define HASHTABLE_MAX_SIZE 16384
> >
> > @@ -81,11 +82,9 @@ typedef struct Connection {
> >       uint32_t sack;
> >       /* offset = secondary_seq - primary_seq */
> >       tcp_seq  offset;
> > -    /*
> > -     * we use this flag update offset func
> > -     * run once in independent tcp connection
> > -     */
> > -    int syn_flag;
> > +
> > +    int tcp_state; /* TCP FSM state */
> > +    tcp_seq fin_ack_seq; /* the seq of 'fin=1,ack=1' */
> >   } Connection;
> >
> >   uint32_t connection_key_hash(const void *opaque);
> > diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
> > index f584e4eba4..f18a71bf2e 100644
> > --- a/net/filter-rewriter.c
> > +++ b/net/filter-rewriter.c
> > @@ -59,9 +59,9 @@ static int is_tcp_packet(Packet *pkt)
> >   }
> >
> >   /* handle tcp packet from primary guest */
> > -static int handle_primary_tcp_pkt(NetFilterState *nf,
> > +static int handle_primary_tcp_pkt(RewriterState *rf,
> >                                     Connection *conn,
> > -                                  Packet *pkt)
> > +                                  Packet *pkt, ConnectionKey *key)
> >   {
> >       struct tcphdr *tcp_pkt;
> >
> > @@ -74,23 +74,28 @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
> >           trace_colo_filter_rewriter_conn_offset(conn->offset);
> >       }
> >
> > +    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))
> &&
> > +        conn->tcp_state == TCPS_SYN_SENT) {
> > +        conn->tcp_state = TCPS_ESTABLISHED;
> > +    }
> > +
> >       if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
> >           /*
> >            * we use this flag update offset func
> >            * run once in independent tcp connection
> >            */
> > -        conn->syn_flag = 1;
> > +        conn->tcp_state = TCPS_SYN_RECEIVED;
> >       }
> >
> >       if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
> > -        if (conn->syn_flag) {
> > +        if (conn->tcp_state == TCPS_SYN_RECEIVED) {
> >               /*
> >                * offset = secondary_seq - primary seq
> >                * ack packet sent by guest from primary node,
> >                * so we use th_ack - 1 get primary_seq
> >                */
> >               conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
> > -            conn->syn_flag = 0;
> > +            conn->tcp_state = TCPS_ESTABLISHED;
> >           }
> >           if (conn->offset) {
> >               /* handle packets to the secondary from the primary */
> > @@ -99,15 +104,63 @@ static int handle_primary_tcp_pkt(NetFilterState
> *nf,
> >               net_checksum_calculate((uint8_t *)pkt->data +
> pkt->vnet_hdr_len,
> >                                      pkt->size - pkt->vnet_hdr_len);
> >           }
> > +        /*
> > +         * Case 1:
> > +         * Step 3:
>
> Better have gathered comment instead of several scattered ones. This
> will make the code much more easier to be read.
>

OK, I will fix it and send one patch instead of this one.


>
> > +         * The *server* side of this connect is VM, *client* tries to
> close
> > +         * the connection.
>
> Better use "passive close" here since:
>
> - Server can active close the connection
> - TCP allows distributed computing which does not differ much between
> server and client.
>

Yes. I got it.


>
> > +         *
> > +         * We got 'ack=1' packets from client side, it acks 'fin=1,
> ack=1'
> > +         * packet from server side. From this point, we can ensure that
> there
> > +         * will be no packets in the connection, except that, some
> errors
> > +         * happen between the path of 'filter object' and vNIC, if this
> rare
> > +         * case really happen, we can still create a new connection,
> > +         * So it is safe to remove the connection from
> connection_track_table.
> > +         *
> > +         */
> > +        if ((conn->tcp_state == TCPS_LAST_ACK) &&
> > +            (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
> > +            conn->tcp_state = TCPS_CLOSED;
> > +            g_hash_table_remove(rf->connection_track_table, key);
> > +        }
> > +    }
> > +
> > +    if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
> > +        /*
> > +         * Case 1:
> > +         * Step 1:
> > +         * The *server* side of this connect is VM, *client* tries to
> close
> > +         * the connection. We will into CLOSE_WAIT status.
> > +         */
> > +        if (conn->tcp_state == TCPS_ESTABLISHED) {
> > +            conn->tcp_state = TCPS_CLOSE_WAIT;
> > +        }
> > +
> > +        /*
> > +         * Case 2:
> > +         * Step 2:
> > +         * The *server* side of this connect is VM, *server* tries to
> close
> > +         * the connection. We will into CLOSE_WAIT status.
> > +         * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and CLOSING
> status.
> > +         */
>
> Better explain why you can do the simplification. E.g what happens if
> packets
>

Yes, have been explained above.

Thanks
Zhang Chen


>
> > +        if (conn->tcp_state == TCPS_FIN_WAIT_1) {
> > +            conn->tcp_state = TCPS_TIME_WAIT;
> > +            /*
> > +             * For simplify implementation, we needn't wait 2MSL time
> > +             * in filter rewriter.
> > +             */
> > +            conn->tcp_state = TCPS_CLOSED;
> > +            g_hash_table_remove(rf->connection_track_table, key);
> > +        }
> >       }
> >
> >       return 0;
> >   }
> >
> >   /* handle tcp packet from secondary guest */
> > -static int handle_secondary_tcp_pkt(NetFilterState *nf,
> > +static int handle_secondary_tcp_pkt(RewriterState *rf,
> >                                       Connection *conn,
> > -                                    Packet *pkt)
> > +                                    Packet *pkt, ConnectionKey *key)
> >   {
> >       struct tcphdr *tcp_pkt;
> >
> > @@ -121,7 +174,8 @@ static int handle_secondary_tcp_pkt(NetFilterState
> *nf,
> >           trace_colo_filter_rewriter_conn_offset(conn->offset);
> >       }
> >
> > -    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)))
> {
> > +    if (conn->tcp_state == TCPS_SYN_RECEIVED &&
> > +        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)))
> {
> >           /*
> >            * save offset = secondary_seq and then
> >            * in handle_primary_tcp_pkt make offset
> > @@ -130,6 +184,12 @@ static int handle_secondary_tcp_pkt(NetFilterState
> *nf,
> >           conn->offset = ntohl(tcp_pkt->th_seq);
> >       }
> >
> > +    /* VM active connect */
> > +    if (conn->tcp_state == TCPS_CLOSED &&
> > +        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
> > +        conn->tcp_state = TCPS_SYN_SENT;
> > +    }
> > +
> >       if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
> >           /* Only need to adjust seq while offset is Non-zero */
> >           if (conn->offset) {
> > @@ -140,6 +200,31 @@ static int handle_secondary_tcp_pkt(NetFilterState
> *nf,
> >                                      pkt->size - pkt->vnet_hdr_len);
> >           }
> >       }
> > +    /*
> > +     * Case 1:
> > +     * Step 2:
> > +     * The *server* side of this connect is VM, *client* tries to close
> > +     * the connection. In this step we will into LAST_ACK status.
> > +     *
> > +     * We got 'fin=1, ack=1' packet from server side, we need to
> > +     * record the seq of 'fin=1, ack=1' packet.
> > +     */
> > +    if (conn->tcp_state == TCPS_CLOSE_WAIT &&
> > +        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
> > +        conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
> > +        conn->tcp_state = TCPS_LAST_ACK;
> > +    }
> > +
> > +    /*
> > +     * Case 2:
> > +     * Step 1:
> > +     * The *server* side of this connect is VM, *server* tries to close
> > +     * the connection.
> > +     */
> > +    if (conn->tcp_state == TCPS_ESTABLISHED &&
> > +        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
> > +        conn->tcp_state = TCPS_FIN_WAIT_1;
> > +    }
> >
> >       return 0;
> >   }
> > @@ -190,7 +275,7 @@ static ssize_t
> colo_rewriter_receive_iov(NetFilterState *nf,
> >
> >           if (sender == nf->netdev) {
> >               /* NET_FILTER_DIRECTION_TX */
> > -            if (!handle_primary_tcp_pkt(nf, conn, pkt)) {
> > +            if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
> >                   qemu_net_queue_send(s->incoming_queue, sender, 0,
> >                   (const uint8_t *)pkt->data, pkt->size, NULL);
> >                   packet_destroy(pkt, NULL);
> > @@ -203,7 +288,7 @@ static ssize_t
> colo_rewriter_receive_iov(NetFilterState *nf,
> >               }
> >           } else {
> >               /* NET_FILTER_DIRECTION_RX */
> > -            if (!handle_secondary_tcp_pkt(nf, conn, pkt)) {
> > +            if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
> >                   qemu_net_queue_send(s->incoming_queue, sender, 0,
> >                   (const uint8_t *)pkt->data, pkt->size, NULL);
> >                   packet_destroy(pkt, NULL);
>
>
diff mbox series

Patch

diff --git a/net/colo.c b/net/colo.c
index 6dda4ed66e..97c8fc928f 100644
--- a/net/colo.c
+++ b/net/colo.c
@@ -137,7 +137,7 @@  Connection *connection_new(ConnectionKey *key)
     conn->ip_proto = key->ip_proto;
     conn->processing = false;
     conn->offset = 0;
-    conn->syn_flag = 0;
+    conn->tcp_state = TCPS_CLOSED;
     conn->pack = 0;
     conn->sack = 0;
     g_queue_init(&conn->primary_list);
diff --git a/net/colo.h b/net/colo.h
index da6c36dcf7..0277e0e9ba 100644
--- a/net/colo.h
+++ b/net/colo.h
@@ -18,6 +18,7 @@ 
 #include "slirp/slirp.h"
 #include "qemu/jhash.h"
 #include "qemu/timer.h"
+#include "slirp/tcp.h"
 
 #define HASHTABLE_MAX_SIZE 16384
 
@@ -81,11 +82,9 @@  typedef struct Connection {
     uint32_t sack;
     /* offset = secondary_seq - primary_seq */
     tcp_seq  offset;
-    /*
-     * we use this flag update offset func
-     * run once in independent tcp connection
-     */
-    int syn_flag;
+
+    int tcp_state; /* TCP FSM state */
+    tcp_seq fin_ack_seq; /* the seq of 'fin=1,ack=1' */
 } Connection;
 
 uint32_t connection_key_hash(const void *opaque);
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
index f584e4eba4..f18a71bf2e 100644
--- a/net/filter-rewriter.c
+++ b/net/filter-rewriter.c
@@ -59,9 +59,9 @@  static int is_tcp_packet(Packet *pkt)
 }
 
 /* handle tcp packet from primary guest */
-static int handle_primary_tcp_pkt(NetFilterState *nf,
+static int handle_primary_tcp_pkt(RewriterState *rf,
                                   Connection *conn,
-                                  Packet *pkt)
+                                  Packet *pkt, ConnectionKey *key)
 {
     struct tcphdr *tcp_pkt;
 
@@ -74,23 +74,28 @@  static int handle_primary_tcp_pkt(NetFilterState *nf,
         trace_colo_filter_rewriter_conn_offset(conn->offset);
     }
 
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
+        conn->tcp_state == TCPS_SYN_SENT) {
+        conn->tcp_state = TCPS_ESTABLISHED;
+    }
+
     if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
         /*
          * we use this flag update offset func
          * run once in independent tcp connection
          */
-        conn->syn_flag = 1;
+        conn->tcp_state = TCPS_SYN_RECEIVED;
     }
 
     if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
-        if (conn->syn_flag) {
+        if (conn->tcp_state == TCPS_SYN_RECEIVED) {
             /*
              * offset = secondary_seq - primary seq
              * ack packet sent by guest from primary node,
              * so we use th_ack - 1 get primary_seq
              */
             conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
-            conn->syn_flag = 0;
+            conn->tcp_state = TCPS_ESTABLISHED;
         }
         if (conn->offset) {
             /* handle packets to the secondary from the primary */
@@ -99,15 +104,63 @@  static int handle_primary_tcp_pkt(NetFilterState *nf,
             net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
                                    pkt->size - pkt->vnet_hdr_len);
         }
+        /*
+         * Case 1:
+         * Step 3:
+         * The *server* side of this connect is VM, *client* tries to close
+         * the connection.
+         *
+         * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
+         * packet from server side. From this point, we can ensure that there
+         * will be no packets in the connection, except that, some errors
+         * happen between the path of 'filter object' and vNIC, if this rare
+         * case really happen, we can still create a new connection,
+         * So it is safe to remove the connection from connection_track_table.
+         *
+         */
+        if ((conn->tcp_state == TCPS_LAST_ACK) &&
+            (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
+            conn->tcp_state = TCPS_CLOSED;
+            g_hash_table_remove(rf->connection_track_table, key);
+        }
+    }
+
+    if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
+        /*
+         * Case 1:
+         * Step 1:
+         * The *server* side of this connect is VM, *client* tries to close
+         * the connection. We will into CLOSE_WAIT status.
+         */
+        if (conn->tcp_state == TCPS_ESTABLISHED) {
+            conn->tcp_state = TCPS_CLOSE_WAIT;
+        }
+
+        /*
+         * Case 2:
+         * Step 2:
+         * The *server* side of this connection is the VM, and the *server*
+         * closes the connection. We go straight to CLOSED here, i.e.
+         * we skip the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and CLOSING states.
+         */
+        if (conn->tcp_state == TCPS_FIN_WAIT_1) {
+            conn->tcp_state = TCPS_TIME_WAIT;
+            /*
+             * For simplify implementation, we needn't wait 2MSL time
+             * in filter rewriter.
+             */
+            conn->tcp_state = TCPS_CLOSED;
+            g_hash_table_remove(rf->connection_track_table, key);
+        }
     }
 
     return 0;
 }
 
 /* handle tcp packet from secondary guest */
-static int handle_secondary_tcp_pkt(NetFilterState *nf,
+static int handle_secondary_tcp_pkt(RewriterState *rf,
                                     Connection *conn,
-                                    Packet *pkt)
+                                    Packet *pkt, ConnectionKey *key)
 {
     struct tcphdr *tcp_pkt;
 
@@ -121,7 +174,8 @@  static int handle_secondary_tcp_pkt(NetFilterState *nf,
         trace_colo_filter_rewriter_conn_offset(conn->offset);
     }
 
-    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
+    if (conn->tcp_state == TCPS_SYN_RECEIVED &&
+        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
         /*
          * save offset = secondary_seq and then
          * in handle_primary_tcp_pkt make offset
@@ -130,6 +184,12 @@  static int handle_secondary_tcp_pkt(NetFilterState *nf,
         conn->offset = ntohl(tcp_pkt->th_seq);
     }
 
+    /* VM active connect */
+    if (conn->tcp_state == TCPS_CLOSED &&
+        ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
+        conn->tcp_state = TCPS_SYN_SENT;
+    }
+
     if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
         /* Only need to adjust seq while offset is Non-zero */
         if (conn->offset) {
@@ -140,6 +200,31 @@  static int handle_secondary_tcp_pkt(NetFilterState *nf,
                                    pkt->size - pkt->vnet_hdr_len);
         }
     }
+    /*
+     * Case 1:
+     * Step 2:
+     * The *server* side of this connect is VM, *client* tries to close
+     * the connection. In this step we will into LAST_ACK status.
+     *
+     * We got 'fin=1, ack=1' packet from server side, we need to
+     * record the seq of 'fin=1, ack=1' packet.
+     */
+    if (conn->tcp_state == TCPS_CLOSE_WAIT &&
+        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
+        conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
+        conn->tcp_state = TCPS_LAST_ACK;
+    }
+
+    /*
+     * Case 2:
+     * Step 1:
+     * The *server* side of this connect is VM, *server* tries to close
+     * the connection.
+     */
+    if (conn->tcp_state == TCPS_ESTABLISHED &&
+        (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
+        conn->tcp_state = TCPS_FIN_WAIT_1;
+    }
 
     return 0;
 }
@@ -190,7 +275,7 @@  static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
 
         if (sender == nf->netdev) {
             /* NET_FILTER_DIRECTION_TX */
-            if (!handle_primary_tcp_pkt(nf, conn, pkt)) {
+            if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
                 qemu_net_queue_send(s->incoming_queue, sender, 0,
                 (const uint8_t *)pkt->data, pkt->size, NULL);
                 packet_destroy(pkt, NULL);
@@ -203,7 +288,7 @@  static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
             }
         } else {
             /* NET_FILTER_DIRECTION_RX */
-            if (!handle_secondary_tcp_pkt(nf, conn, pkt)) {
+            if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
                 qemu_net_queue_send(s->incoming_queue, sender, 0,
                 (const uint8_t *)pkt->data, pkt->size, NULL);
                 packet_destroy(pkt, NULL);