diff mbox

[RFC,V3,2/4] colo-compare: track connection and enqueue packet

Message ID 1460977906-25218-3-git-send-email-zhangchen.fnst@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhang Chen April 18, 2016, 11:11 a.m. UTC
In this patch we use a hash table keyed with the kernel's jhash
to track connections, and then enqueue network packets like this:

+ CompareState ++
|               |
+---------------+   +---------------+         +---------------+
|conn list      +--->conn           +--------->conn           |
+---------------+   +---------------+         +---------------+
|               |     |           |             |          |
+---------------+ +---v----+  +---v----+    +---v----+ +---v----+
                  |primary |  |secondary    |primary | |secondary
                  |packet  |  |packet  +    |packet  | |packet  +
                  +--------+  +--------+    +--------+ +--------+
                      |           |             |          |
                  +---v----+  +---v----+    +---v----+ +---v----+
                  |primary |  |secondary    |primary | |secondary
                  |packet  |  |packet  +    |packet  | |packet  +
                  +--------+  +--------+    +--------+ +--------+
                      |           |             |          |
                  +---v----+  +---v----+    +---v----+ +---v----+
                  |primary |  |secondary    |primary | |secondary
                  |packet  |  |packet  +    |packet  | |packet  +
                  +--------+  +--------+    +--------+ +--------+

Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 include/qemu/jhash.h |  59 ++++++++++
 net/colo-compare.c   | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 trace-events         |   3 +
 3 files changed, 360 insertions(+), 5 deletions(-)
 create mode 100644 include/qemu/jhash.h

Comments

Jason Wang April 28, 2016, 7:47 a.m. UTC | #1
On 04/18/2016 07:11 PM, Zhang Chen wrote:
> In this patch we use kernel jhash table to track
> connection, and then enqueue net packet like this:
>
> + CompareState ++
> |               |
> +---------------+   +---------------+         +---------------+
> |conn list      +--->conn           +--------->conn           |
> +---------------+   +---------------+         +---------------+
> |               |     |           |             |          |
> +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
>                   |primary |  |secondary    |primary | |secondary
>                   |packet  |  |packet  +    |packet  | |packet  +
>                   +--------+  +--------+    +--------+ +--------+
>                       |           |             |          |
>                   +---v----+  +---v----+    +---v----+ +---v----+
>                   |primary |  |secondary    |primary | |secondary
>                   |packet  |  |packet  +    |packet  | |packet  +
>                   +--------+  +--------+    +--------+ +--------+
>                       |           |             |          |
>                   +---v----+  +---v----+    +---v----+ +---v----+
>                   |primary |  |secondary    |primary | |secondary
>                   |packet  |  |packet  +    |packet  | |packet  +
>                   +--------+  +--------+    +--------+ +--------+
>
> Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  include/qemu/jhash.h |  59 ++++++++++
>  net/colo-compare.c   | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  trace-events         |   3 +
>  3 files changed, 360 insertions(+), 5 deletions(-)
>  create mode 100644 include/qemu/jhash.h
>
> diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h
> new file mode 100644
> index 0000000..8a8ff0f
> --- /dev/null
> +++ b/include/qemu/jhash.h
> @@ -0,0 +1,59 @@
> +/* jhash.h: Jenkins hash support.
> +  *
> +  * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
> +  *
> +  * http://burtleburtle.net/bob/hash/
> +  *
> +  * These are the credits from Bob's sources:
> +  *
> +  * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> +  *
> +  * These are functions for producing 32-bit hashes for hash table lookup.
> +  * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> +  * are externally useful functions.  Routines to test the hash are included
> +  * if SELF_TEST is defined.  You can use this free for any purpose.It's in
> +  * the public domain.  It has no warranty.
> +  *
> +  * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
> +  *
> +  * I've modified Bob's hash to be useful in the Linux kernel, and
> +  * any bugs present are my fault.
> +  * Jozsef
> +  */
> +
> +#ifndef QEMU_JHASH_H__
> +#define QEMU_JHASH_H__
> +
> +#include "qemu/bitops.h"
> +
> +/*
> + * hashtable related is copied from linux kernel jhash
> + */
> +
> +/* __jhash_mix -- mix 3 32-bit values reversibly. */
> +#define __jhash_mix(a, b, c)                \
> +{                                           \
> +    a -= c;  a ^= rol32(c, 4);  c += b;     \
> +    b -= a;  b ^= rol32(a, 6);  a += c;     \
> +    c -= b;  c ^= rol32(b, 8);  b += a;     \
> +    a -= c;  a ^= rol32(c, 16); c += b;     \
> +    b -= a;  b ^= rol32(a, 19); a += c;     \
> +    c -= b;  c ^= rol32(b, 4);  b += a;     \
> +}
> +
> +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
> +#define __jhash_final(a, b, c)  \
> +{                               \
> +    c ^= b; c -= rol32(b, 14);  \
> +    a ^= c; a -= rol32(c, 11);  \
> +    b ^= a; b -= rol32(a, 25);  \
> +    c ^= b; c -= rol32(b, 16);  \
> +    a ^= c; a -= rol32(c, 4);   \
> +    b ^= a; b -= rol32(a, 14);  \
> +    c ^= b; c -= rol32(b, 24);  \
> +}
> +
> +/* An arbitrary initial parameter */
> +#define JHASH_INITVAL           0xdeadbeef
> +
> +#endif /* QEMU_JHASH_H__ */
> diff --git a/net/colo-compare.c b/net/colo-compare.c
> index c45b132..dc57eac 100644
> --- a/net/colo-compare.c
> +++ b/net/colo-compare.c
> @@ -22,12 +22,16 @@
>  #include "qemu/sockets.h"
>  #include "qapi-visit.h"
>  #include "trace.h"
> +#include "slirp/slirp.h"
> +#include "qemu/jhash.h"
> +#include "net/eth.h"
>  
>  #define TYPE_COLO_COMPARE "colo-compare"
>  #define COLO_COMPARE(obj) \
>      OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
>  
>  #define COMPARE_READ_LEN_MAX NET_BUFSIZE
> +#define HASHTABLE_MAX_SIZE 16384
>  
>  static QTAILQ_HEAD(, CompareState) net_compares =
>         QTAILQ_HEAD_INITIALIZER(net_compares);
> @@ -39,6 +43,28 @@ typedef struct ReadState {
>      uint8_t buf[COMPARE_READ_LEN_MAX];
>  } ReadState;
>  
> +/*
> +  + CompareState ++
> +  |               |
> +  +---------------+   +---------------+         +---------------+
> +  |conn list      +--->conn           +--------->conn           |
> +  +---------------+   +---------------+         +---------------+
> +  |               |     |           |             |          |
> +  +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
> +                    |primary |  |secondary    |primary | |secondary
> +                    |packet  |  |packet  +    |packet  | |packet  +
> +                    +--------+  +--------+    +--------+ +--------+
> +                        |           |             |          |
> +                    +---v----+  +---v----+    +---v----+ +---v----+
> +                    |primary |  |secondary    |primary | |secondary
> +                    |packet  |  |packet  +    |packet  | |packet  +
> +                    +--------+  +--------+    +--------+ +--------+
> +                        |           |             |          |
> +                    +---v----+  +---v----+    +---v----+ +---v----+
> +                    |primary |  |secondary    |primary | |secondary
> +                    |packet  |  |packet  +    |packet  | |packet  +
> +                    +--------+  +--------+    +--------+ +--------+
> +*/
>  typedef struct CompareState {
>      Object parent;
>  
> @@ -51,12 +77,265 @@ typedef struct CompareState {
>      QTAILQ_ENTRY(CompareState) next;
>      ReadState pri_rs;
>      ReadState sec_rs;
> +
> +    /* connection list: the connections belonged to this NIC could be found
> +     * in this list.
> +     * element type: Connection
> +     */
> +    GQueue conn_list;
> +    QemuMutex conn_list_lock; /* to protect conn_list */
> +    /* hashtable to save connection */
> +    GHashTable *connection_track_table;
> +    /* to save unprocessed_connections */
> +    GQueue unprocessed_connections;
> +    /* proxy current hash size */
> +    uint32_t hashtable_size;
>  } CompareState;
>  
>  typedef struct CompareClass {
>      ObjectClass parent_class;
>  } CompareClass;
>  
> +typedef struct Packet {
> +    void *data;
> +    union {
> +        uint8_t *network_layer;
> +        struct ip *ip;

Does this mean IPv6 is not supported?

> +    };
> +    uint8_t *transport_layer;
> +    int size;
> +    CompareState *s;
> +} Packet;
> +
> +typedef struct ConnectionKey {
> +    /* (src, dst) must be grouped, in the same way than in IP header */
> +    struct in_addr src;
> +    struct in_addr dst;
> +    uint16_t src_port;
> +    uint16_t dst_port;
> +    uint8_t ip_proto;
> +} QEMU_PACKED ConnectionKey;
> +
> +typedef struct Connection {
> +    QemuMutex list_lock;
> +    /* connection primary send queue: element type: Packet */
> +    GQueue primary_list;
> +    /* connection secondary send queue: element type: Packet */
> +    GQueue secondary_list;
> +    /* flag to enqueue unprocessed_connections */
> +    bool processing;
> +    uint8_t ip_proto;
> +} Connection;
> +
> +enum {
> +    PRIMARY_IN = 0,
> +    SECONDARY_IN,
> +};
> +
> +static void packet_destroy(void *opaque, void *user_data);
> +static int compare_chr_send(CharDriverState *out,
> +                            const uint8_t *buf,
> +                            uint32_t size);
> +
> +static uint32_t connection_key_hash(const void *opaque)
> +{
> +    const ConnectionKey *key = opaque;
> +    uint32_t a, b, c;
> +
> +    /* Jenkins hash */
> +    a = b = c = JHASH_INITVAL + sizeof(*key);
> +    a += key->src.s_addr;
> +    b += key->dst.s_addr;
> +    c += (key->src_port | key->dst_port << 16);
> +    __jhash_mix(a, b, c);
> +
> +    a += key->ip_proto;
> +    __jhash_final(a, b, c);
> +
> +    return c;
> +}
> +
> +static int connection_key_equal(const void *opaque1, const void *opaque2)
> +{
> +    return memcmp(opaque1, opaque2, sizeof(ConnectionKey)) == 0;

So why not use ConnectionKey * here, considering we're sure of the type?

> +}
> +
> +/*
> + *  initialize connecon_key for packet
> + *  Return 0 on success, if return 1 the pkt will be sent later
> + */
> +static int connection_key_init(Packet *pkt, ConnectionKey *key)
> +{
> +    int network_length;
> +    uint8_t *data = pkt->data;
> +    uint16_t l3_proto;
> +    uint32_t tmp_ports;
> +    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
> +
> +    pkt->network_layer = data + ETH_HLEN;

Can the length of data be shorter than ETH_HLEN?

> +    l3_proto = eth_get_l3_proto(data, l2hdr_len);
> +    if (l3_proto != ETH_P_IP) {
> +        return 1;
> +    }
> +
> +    network_length = pkt->ip->ip_hl * 4;
> +    pkt->transport_layer = pkt->network_layer + network_length;

Do we need a sanity check to make sure there's no evil network_length here?

> +    if (!pkt->transport_layer) {
> +        error_report("pkt->transport_layer is valid");

Should this say "invalid"? And if this is caused by the bad packet itself,
there's no need for an error_report.

> +        return 1;
> +    }
> +    key->ip_proto = pkt->ip->ip_p;
> +    key->src = pkt->ip->ip_src;
> +    key->dst = pkt->ip->ip_dst;
> +
> +    switch (key->ip_proto) {
> +    case IPPROTO_TCP:
> +    case IPPROTO_UDP:
> +    case IPPROTO_DCCP:
> +    case IPPROTO_ESP:
> +    case IPPROTO_SCTP:
> +    case IPPROTO_UDPLITE:
> +        tmp_ports = *(uint32_t *)(pkt->transport_layer);
> +        key->src_port = ntohs(tmp_ports & 0xffff);
> +        key->dst_port = ntohs(tmp_ports >> 16);
> +        break;
> +    case IPPROTO_AH:
> +        tmp_ports = *(uint32_t *)(pkt->transport_layer + 4);
> +        key->src_port = ntohs(tmp_ports & 0xffff);
> +        key->dst_port = ntohs(tmp_ports >> 16);
> +        break;
> +    default:
> +        key->src_port = 0;
> +        key->dst_port = 0;
> +        break;
> +    }
> +
> +    return 0;
> +}
> +
> +static Connection *connection_new(ConnectionKey *key)
> +{
> +    Connection *conn = g_slice_new(Connection);
> +
> +    qemu_mutex_init(&conn->list_lock);
> +    conn->ip_proto = key->ip_proto;
> +    conn->processing = false;
> +    g_queue_init(&conn->primary_list);
> +    g_queue_init(&conn->secondary_list);
> +
> +    return conn;
> +}
> +
> +/*
> + * Clear hashtable, stop this hash growing really huge
> + */
> +static void connection_hashtable_reset(CompareState *s)
> +{
> +    s->hashtable_size = 0;
> +    g_hash_table_remove_all(s->connection_track_table);
> +}
> +
> +/* if not found, create a new connection and add to hash table */
> +static Connection *connection_get(CompareState *s, ConnectionKey *key)
> +{
> +    /* FIXME: protect connection_track_table */
> +    Connection *conn = g_hash_table_lookup(s->connection_track_table, key);
> +
> +    if (conn == NULL) {
> +        ConnectionKey *new_key = g_memdup(key, sizeof(*key));
> +
> +        conn = connection_new(key);
> +
> +        s->hashtable_size++;
> +        if (s->hashtable_size > HASHTABLE_MAX_SIZE) {
> +            error_report("colo proxy connection hashtable full, clear it");
> +            connection_hashtable_reset(s);
> +            /* TODO:clear conn_list */
> +        }
> +
> +        g_hash_table_insert(s->connection_track_table, new_key, conn);
> +    }
> +
> +     return conn;
> +}
> +
> +static void connection_destroy(void *opaque)
> +{
> +    Connection *conn = opaque;
> +
> +    qemu_mutex_lock(&conn->list_lock);

As I said on the previous patch, if you do all the processing in the colo
compare thread, you can avoid almost all synchronization (e.g. mutexes).

> +    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
> +    g_queue_free(&conn->primary_list);
> +    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
> +    g_queue_free(&conn->secondary_list);
> +    qemu_mutex_unlock(&conn->list_lock);
> +    qemu_mutex_destroy(&conn->list_lock);
> +    g_slice_free(Connection, conn);
> +}
> +
> +static Packet *packet_new(CompareState *s, const void *data,
> +                              int size, ConnectionKey *key)
> +{
> +    Packet *pkt = g_slice_new(Packet);
> +
> +    pkt->data = g_memdup(data, size);
> +    pkt->size = size;
> +    pkt->s = s;
> +
> +    if (connection_key_init(pkt, key)) {
> +        packet_destroy(pkt, NULL);
> +        pkt = NULL;
> +    }

Can we do connection_key_init() first? That would avoid the
packet_destroy() call if it fails.

> +
> +    return pkt;
> +}
> +
> +/*
> + * Return 0 on success, if return -1 means the pkt
> + * is unsupported(arp and ipv6) and will be sent later
> + */
> +static int packet_enqueue(CompareState *s, int mode)
> +{
> +    ConnectionKey key = {{ 0 } };
> +    Packet *pkt = NULL;
> +    Connection *conn;
> +
> +    if (mode == PRIMARY_IN) {
> +        pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, &key);
> +    } else {
> +        pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, &key);
> +    }
> +    if (!pkt) {
> +        return -1;
> +    }
> +
> +    conn = connection_get(s, &key);
> +    if (!conn->processing) {
> +        qemu_mutex_lock(&s->conn_list_lock);
> +        g_queue_push_tail(&s->conn_list, conn);
> +        qemu_mutex_unlock(&s->conn_list_lock);
> +        conn->processing = true;
> +    }
> +
> +    qemu_mutex_lock(&conn->list_lock);
> +    if (mode == PRIMARY_IN) {
> +        g_queue_push_tail(&conn->primary_list, pkt);
> +    } else {
> +        g_queue_push_tail(&conn->secondary_list, pkt);
> +    }
> +    qemu_mutex_unlock(&conn->list_lock);
> +
> +    return 0;
> +}
> +
> +static void packet_destroy(void *opaque, void *user_data)
> +{
> +    Packet *pkt = opaque;
> +
> +    g_free(pkt->data);
> +    g_slice_free(Packet, pkt);
> +}
> +
>  static int compare_chr_send(CharDriverState *out,
>                              const uint8_t *buf,
>                              uint32_t size)
> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
>  
>      ret = compare_chr_fill_rstate(&s->pri_rs, buf, size);
>      if (ret == 1) {
> -        /* FIXME: enqueue to primary packet list */
> -        compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);
> +        if (packet_enqueue(s, PRIMARY_IN)) {
> +            trace_colo_compare_main("primary: unsupported packet in");
> +            compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);

It looks like a packet that is not recognized by connection_key_init()
will be sent directly, without being compared with the packet sent from
the secondary. Is this expected?

> +        }
>      } else if (ret == -1) {
>          qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL);
>      }
> @@ -176,9 +457,11 @@ static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
>  
>      ret = compare_chr_fill_rstate(&s->sec_rs, buf, size);
>      if (ret == 1) {
> -        /* TODO: enqueue to secondary packet list*/
> -        /* should we send sec arp pkt? */
> -        compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
> +        if (packet_enqueue(s, SECONDARY_IN)) {
> +            trace_colo_compare_main("secondary: unsupported packet in");
> +            /* should we send sec arp pkt? */
> +            compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
> +        }
>      } else if (ret == -1) {
>          qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL);
>      }
> @@ -280,6 +563,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
>      qemu_chr_fe_claim_no_fail(s->chr_out);
>      QTAILQ_INSERT_TAIL(&net_compares, s, next);
>  
> +    g_queue_init(&s->conn_list);
> +    qemu_mutex_init(&s->conn_list_lock);
> +    s->hashtable_size = 0;
> +
> +    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
> +                                                      connection_key_equal,
> +                                                      g_free,
> +                                                      connection_destroy);
> +
>      return;
>  }
>  
> @@ -314,6 +606,7 @@ static void colo_compare_class_finalize(ObjectClass *oc, void *data)
>      if (!QTAILQ_EMPTY(&net_compares)) {
>          QTAILQ_REMOVE(&net_compares, s, next);
>      }
> +    qemu_mutex_destroy(&s->conn_list_lock);
>  }
>  
>  static void colo_compare_init(Object *obj)
> diff --git a/trace-events b/trace-events
> index ca7211b..8862288 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d"
>  aspeed_vic_update_irq(int flags) "Raising IRQ: %d"
>  aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32
>  aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32
> +
> +# net/colo-compare.c
> +colo_compare_main(const char *chr) "chr: %s"
Zhang Chen April 28, 2016, 10:25 a.m. UTC | #2
On 04/28/2016 03:47 PM, Jason Wang wrote:
>
> On 04/18/2016 07:11 PM, Zhang Chen wrote:
>> In this patch we use kernel jhash table to track
>> connection, and then enqueue net packet like this:
>>
>> + CompareState ++
>> |               |
>> +---------------+   +---------------+         +---------------+
>> |conn list      +--->conn           +--------->conn           |
>> +---------------+   +---------------+         +---------------+
>> |               |     |           |             |          |
>> +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
>>                    |primary |  |secondary    |primary | |secondary
>>                    |packet  |  |packet  +    |packet  | |packet  +
>>                    +--------+  +--------+    +--------+ +--------+
>>                        |           |             |          |
>>                    +---v----+  +---v----+    +---v----+ +---v----+
>>                    |primary |  |secondary    |primary | |secondary
>>                    |packet  |  |packet  +    |packet  | |packet  +
>>                    +--------+  +--------+    +--------+ +--------+
>>                        |           |             |          |
>>                    +---v----+  +---v----+    +---v----+ +---v----+
>>                    |primary |  |secondary    |primary | |secondary
>>                    |packet  |  |packet  +    |packet  | |packet  +
>>                    +--------+  +--------+    +--------+ +--------+
>>
>> Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> ---
>>   include/qemu/jhash.h |  59 ++++++++++
>>   net/colo-compare.c   | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   trace-events         |   3 +
>>   3 files changed, 360 insertions(+), 5 deletions(-)
>>   create mode 100644 include/qemu/jhash.h
>>
>> diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h
>> new file mode 100644
>> index 0000000..8a8ff0f
>> --- /dev/null
>> +++ b/include/qemu/jhash.h
>> @@ -0,0 +1,59 @@
>> +/* jhash.h: Jenkins hash support.
>> +  *
>> +  * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
>> +  *
>> +  * http://burtleburtle.net/bob/hash/
>> +  *
>> +  * These are the credits from Bob's sources:
>> +  *
>> +  * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
>> +  *
>> +  * These are functions for producing 32-bit hashes for hash table lookup.
>> +  * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
>> +  * are externally useful functions.  Routines to test the hash are included
>> +  * if SELF_TEST is defined.  You can use this free for any purpose.It's in
>> +  * the public domain.  It has no warranty.
>> +  *
>> +  * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
>> +  *
>> +  * I've modified Bob's hash to be useful in the Linux kernel, and
>> +  * any bugs present are my fault.
>> +  * Jozsef
>> +  */
>> +
>> +#ifndef QEMU_JHASH_H__
>> +#define QEMU_JHASH_H__
>> +
>> +#include "qemu/bitops.h"
>> +
>> +/*
>> + * hashtable related is copied from linux kernel jhash
>> + */
>> +
>> +/* __jhash_mix -- mix 3 32-bit values reversibly. */
>> +#define __jhash_mix(a, b, c)                \
>> +{                                           \
>> +    a -= c;  a ^= rol32(c, 4);  c += b;     \
>> +    b -= a;  b ^= rol32(a, 6);  a += c;     \
>> +    c -= b;  c ^= rol32(b, 8);  b += a;     \
>> +    a -= c;  a ^= rol32(c, 16); c += b;     \
>> +    b -= a;  b ^= rol32(a, 19); a += c;     \
>> +    c -= b;  c ^= rol32(b, 4);  b += a;     \
>> +}
>> +
>> +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
>> +#define __jhash_final(a, b, c)  \
>> +{                               \
>> +    c ^= b; c -= rol32(b, 14);  \
>> +    a ^= c; a -= rol32(c, 11);  \
>> +    b ^= a; b -= rol32(a, 25);  \
>> +    c ^= b; c -= rol32(b, 16);  \
>> +    a ^= c; a -= rol32(c, 4);   \
>> +    b ^= a; b -= rol32(a, 14);  \
>> +    c ^= b; c -= rol32(b, 24);  \
>> +}
>> +
>> +/* An arbitrary initial parameter */
>> +#define JHASH_INITVAL           0xdeadbeef
>> +
>> +#endif /* QEMU_JHASH_H__ */
>> diff --git a/net/colo-compare.c b/net/colo-compare.c
>> index c45b132..dc57eac 100644
>> --- a/net/colo-compare.c
>> +++ b/net/colo-compare.c
>> @@ -22,12 +22,16 @@
>>   #include "qemu/sockets.h"
>>   #include "qapi-visit.h"
>>   #include "trace.h"
>> +#include "slirp/slirp.h"
>> +#include "qemu/jhash.h"
>> +#include "net/eth.h"
>>   
>>   #define TYPE_COLO_COMPARE "colo-compare"
>>   #define COLO_COMPARE(obj) \
>>       OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
>>   
>>   #define COMPARE_READ_LEN_MAX NET_BUFSIZE
>> +#define HASHTABLE_MAX_SIZE 16384
>>   
>>   static QTAILQ_HEAD(, CompareState) net_compares =
>>          QTAILQ_HEAD_INITIALIZER(net_compares);
>> @@ -39,6 +43,28 @@ typedef struct ReadState {
>>       uint8_t buf[COMPARE_READ_LEN_MAX];
>>   } ReadState;
>>   
>> +/*
>> +  + CompareState ++
>> +  |               |
>> +  +---------------+   +---------------+         +---------------+
>> +  |conn list      +--->conn           +--------->conn           |
>> +  +---------------+   +---------------+         +---------------+
>> +  |               |     |           |             |          |
>> +  +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
>> +                    |primary |  |secondary    |primary | |secondary
>> +                    |packet  |  |packet  +    |packet  | |packet  +
>> +                    +--------+  +--------+    +--------+ +--------+
>> +                        |           |             |          |
>> +                    +---v----+  +---v----+    +---v----+ +---v----+
>> +                    |primary |  |secondary    |primary | |secondary
>> +                    |packet  |  |packet  +    |packet  | |packet  +
>> +                    +--------+  +--------+    +--------+ +--------+
>> +                        |           |             |          |
>> +                    +---v----+  +---v----+    +---v----+ +---v----+
>> +                    |primary |  |secondary    |primary | |secondary
>> +                    |packet  |  |packet  +    |packet  | |packet  +
>> +                    +--------+  +--------+    +--------+ +--------+
>> +*/
>>   typedef struct CompareState {
>>       Object parent;
>>   
>> @@ -51,12 +77,265 @@ typedef struct CompareState {
>>       QTAILQ_ENTRY(CompareState) next;
>>       ReadState pri_rs;
>>       ReadState sec_rs;
>> +
>> +    /* connection list: the connections belonged to this NIC could be found
>> +     * in this list.
>> +     * element type: Connection
>> +     */
>> +    GQueue conn_list;
>> +    QemuMutex conn_list_lock; /* to protect conn_list */
>> +    /* hashtable to save connection */
>> +    GHashTable *connection_track_table;
>> +    /* to save unprocessed_connections */
>> +    GQueue unprocessed_connections;
>> +    /* proxy current hash size */
>> +    uint32_t hashtable_size;
>>   } CompareState;
>>   
>>   typedef struct CompareClass {
>>       ObjectClass parent_class;
>>   } CompareClass;
>>   
>> +typedef struct Packet {
>> +    void *data;
>> +    union {
>> +        uint8_t *network_layer;
>> +        struct ip *ip;
> Does this mean ipv6 is not supported?

Yes, IPv6 is currently not supported.

>
>> +    };
>> +    uint8_t *transport_layer;
>> +    int size;
>> +    CompareState *s;
>> +} Packet;
>> +
>> +typedef struct ConnectionKey {
>> +    /* (src, dst) must be grouped, in the same way than in IP header */
>> +    struct in_addr src;
>> +    struct in_addr dst;
>> +    uint16_t src_port;
>> +    uint16_t dst_port;
>> +    uint8_t ip_proto;
>> +} QEMU_PACKED ConnectionKey;
>> +
>> +typedef struct Connection {
>> +    QemuMutex list_lock;
>> +    /* connection primary send queue: element type: Packet */
>> +    GQueue primary_list;
>> +    /* connection secondary send queue: element type: Packet */
>> +    GQueue secondary_list;
>> +    /* flag to enqueue unprocessed_connections */
>> +    bool processing;
>> +    uint8_t ip_proto;
>> +} Connection;
>> +
>> +enum {
>> +    PRIMARY_IN = 0,
>> +    SECONDARY_IN,
>> +};
>> +
>> +static void packet_destroy(void *opaque, void *user_data);
>> +static int compare_chr_send(CharDriverState *out,
>> +                            const uint8_t *buf,
>> +                            uint32_t size);
>> +
>> +static uint32_t connection_key_hash(const void *opaque)
>> +{
>> +    const ConnectionKey *key = opaque;
>> +    uint32_t a, b, c;
>> +
>> +    /* Jenkins hash */
>> +    a = b = c = JHASH_INITVAL + sizeof(*key);
>> +    a += key->src.s_addr;
>> +    b += key->dst.s_addr;
>> +    c += (key->src_port | key->dst_port << 16);
>> +    __jhash_mix(a, b, c);
>> +
>> +    a += key->ip_proto;
>> +    __jhash_final(a, b, c);
>> +
>> +    return c;
>> +}
>> +
>> +static int connection_key_equal(const void *opaque1, const void *opaque2)
>> +{
>> +    return memcmp(opaque1, opaque2, sizeof(ConnectionKey)) == 0;
> So why not useing ConnectionKey * consider we're sure of the type?

OK, will fix it in next version.

>
>> +}
>> +
>> +/*
>> + *  initialize connecon_key for packet
>> + *  Return 0 on success, if return 1 the pkt will be sent later
>> + */
>> +static int connection_key_init(Packet *pkt, ConnectionKey *key)
>> +{
>> +    int network_length;
>> +    uint8_t *data = pkt->data;
>> +    uint16_t l3_proto;
>> +    uint32_t tmp_ports;
>> +    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
>> +
>> +    pkt->network_layer = data + ETH_HLEN;
> Can the length of data be shorter than ETH_HELN?

Thanks, I will check pkt->size first.

>
>> +    l3_proto = eth_get_l3_proto(data, l2hdr_len);
>> +    if (l3_proto != ETH_P_IP) {
>> +        return 1;
>> +    }
>> +
>> +    network_length = pkt->ip->ip_hl * 4;
>> +    pkt->transport_layer = pkt->network_layer + network_length;
> Do we need sanity check to make sure there's no evil network_length here?

Yes, I will fix it.

>
>> +    if (!pkt->transport_layer) {
>> +        error_report("pkt->transport_layer is valid");
> invalid? And if this is caused by the bad packet it self, there's no
> need for a error_report.

OK

>
>> +        return 1;
>> +    }
>> +    key->ip_proto = pkt->ip->ip_p;
>> +    key->src = pkt->ip->ip_src;
>> +    key->dst = pkt->ip->ip_dst;
>> +
>> +    switch (key->ip_proto) {
>> +    case IPPROTO_TCP:
>> +    case IPPROTO_UDP:
>> +    case IPPROTO_DCCP:
>> +    case IPPROTO_ESP:
>> +    case IPPROTO_SCTP:
>> +    case IPPROTO_UDPLITE:
>> +        tmp_ports = *(uint32_t *)(pkt->transport_layer);
>> +        key->src_port = ntohs(tmp_ports & 0xffff);
>> +        key->dst_port = ntohs(tmp_ports >> 16);
>> +        break;
>> +    case IPPROTO_AH:
>> +        tmp_ports = *(uint32_t *)(pkt->transport_layer + 4);
>> +        key->src_port = ntohs(tmp_ports & 0xffff);
>> +        key->dst_port = ntohs(tmp_ports >> 16);
>> +        break;
>> +    default:
>> +        key->src_port = 0;
>> +        key->dst_port = 0;
>> +        break;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static Connection *connection_new(ConnectionKey *key)
>> +{
>> +    Connection *conn = g_slice_new(Connection);
>> +
>> +    qemu_mutex_init(&conn->list_lock);
>> +    conn->ip_proto = key->ip_proto;
>> +    conn->processing = false;
>> +    g_queue_init(&conn->primary_list);
>> +    g_queue_init(&conn->secondary_list);
>> +
>> +    return conn;
>> +}
>> +
>> +/*
>> + * Clear hashtable, stop this hash growing really huge
>> + */
>> +static void connection_hashtable_reset(CompareState *s)
>> +{
>> +    s->hashtable_size = 0;
>> +    g_hash_table_remove_all(s->connection_track_table);
>> +}
>> +
>> +/* if not found, create a new connection and add to hash table */
>> +static Connection *connection_get(CompareState *s, ConnectionKey *key)
>> +{
>> +    /* FIXME: protect connection_track_table */
>> +    Connection *conn = g_hash_table_lookup(s->connection_track_table, key);
>> +
>> +    if (conn == NULL) {
>> +        ConnectionKey *new_key = g_memdup(key, sizeof(*key));
>> +
>> +        conn = connection_new(key);
>> +
>> +        s->hashtable_size++;
>> +        if (s->hashtable_size > HASHTABLE_MAX_SIZE) {
>> +            error_report("colo proxy connection hashtable full, clear it");
>> +            connection_hashtable_reset(s);
>> +            /* TODO:clear conn_list */
>> +        }
>> +
>> +        g_hash_table_insert(s->connection_track_table, new_key, conn);
>> +    }
>> +
>> +     return conn;
>> +}
>> +
>> +static void connection_destroy(void *opaque)
>> +{
>> +    Connection *conn = opaque;
>> +
>> +    qemu_mutex_lock(&conn->list_lock);
> Like I said in previous patch, if you do all the processing in colo
> compare thread, you can avoid almost all synchronization (e.g mutex).
>
>> +    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
>> +    g_queue_free(&conn->primary_list);
>> +    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
>> +    g_queue_free(&conn->secondary_list);
>> +    qemu_mutex_unlock(&conn->list_lock);
>> +    qemu_mutex_destroy(&conn->list_lock);
>> +    g_slice_free(Connection, conn);
>> +}
>> +
>> +static Packet *packet_new(CompareState *s, const void *data,
>> +                              int size, ConnectionKey *key)
>> +{
>> +    Packet *pkt = g_slice_new(Packet);
>> +
>> +    pkt->data = g_memdup(data, size);
>> +    pkt->size = size;
>> +    pkt->s = s;
>> +
>> +    if (connection_key_init(pkt, key)) {
>> +        packet_destroy(pkt, NULL);
>> +        pkt = NULL;
>> +    }
> Can we do connection_key_init() first? This avoids the packet_destroy()
> call if it fails.

Do you mean we should call connection_key_init() first
and then call packet_new()?


>
>> +
>> +    return pkt;
>> +}
>> +
>> +/*
>> + * Return 0 on success, if return -1 means the pkt
>> + * is unsupported(arp and ipv6) and will be sent later
>> + */
>> +static int packet_enqueue(CompareState *s, int mode)
>> +{
>> +    ConnectionKey key = {{ 0 } };
>> +    Packet *pkt = NULL;
>> +    Connection *conn;
>> +
>> +    if (mode == PRIMARY_IN) {
>> +        pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, &key);
>> +    } else {
>> +        pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, &key);
>> +    }
>> +    if (!pkt) {
>> +        return -1;
>> +    }
>> +
>> +    conn = connection_get(s, &key);
>> +    if (!conn->processing) {
>> +        qemu_mutex_lock(&s->conn_list_lock);
>> +        g_queue_push_tail(&s->conn_list, conn);
>> +        qemu_mutex_unlock(&s->conn_list_lock);
>> +        conn->processing = true;
>> +    }
>> +
>> +    qemu_mutex_lock(&conn->list_lock);
>> +    if (mode == PRIMARY_IN) {
>> +        g_queue_push_tail(&conn->primary_list, pkt);
>> +    } else {
>> +        g_queue_push_tail(&conn->secondary_list, pkt);
>> +    }
>> +    qemu_mutex_unlock(&conn->list_lock);
>> +
>> +    return 0;
>> +}
>> +
>> +static void packet_destroy(void *opaque, void *user_data)
>> +{
>> +    Packet *pkt = opaque;
>> +
>> +    g_free(pkt->data);
>> +    g_slice_free(Packet, pkt);
>> +}
>> +
>>   static int compare_chr_send(CharDriverState *out,
>>                               const uint8_t *buf,
>>                               uint32_t size)
>> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
>>   
>>       ret = compare_chr_fill_rstate(&s->pri_rs, buf, size);
>>       if (ret == 1) {
>> -        /* FIXME: enqueue to primary packet list */
>> -        compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);
>> +        if (packet_enqueue(s, PRIMARY_IN)) {
>> +            trace_colo_compare_main("primary: unsupported packet in");
>> +            compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);
> Looks like if a packet was not recognized by connection_key_init(), it
> will be sent directly without comparing it with the packet sent from
> secondary? Is this expected?

Yes, we will send the primary's ARP packet first to get the MAC address.

Thanks
zhangchen

>
>> +        }
>>       } else if (ret == -1) {
>>           qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL);
>>       }
>> @@ -176,9 +457,11 @@ static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
>>   
>>       ret = compare_chr_fill_rstate(&s->sec_rs, buf, size);
>>       if (ret == 1) {
>> -        /* TODO: enqueue to secondary packet list*/
>> -        /* should we send sec arp pkt? */
>> -        compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
>> +        if (packet_enqueue(s, SECONDARY_IN)) {
>> +            trace_colo_compare_main("secondary: unsupported packet in");
>> +            /* should we send sec arp pkt? */
>> +            compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
>> +        }
>>       } else if (ret == -1) {
>>           qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL);
>>       }
>> @@ -280,6 +563,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
>>       qemu_chr_fe_claim_no_fail(s->chr_out);
>>       QTAILQ_INSERT_TAIL(&net_compares, s, next);
>>   
>> +    g_queue_init(&s->conn_list);
>> +    qemu_mutex_init(&s->conn_list_lock);
>> +    s->hashtable_size = 0;
>> +
>> +    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
>> +                                                      connection_key_equal,
>> +                                                      g_free,
>> +                                                      connection_destroy);
>> +
>>       return;
>>   }
>>   
>> @@ -314,6 +606,7 @@ static void colo_compare_class_finalize(ObjectClass *oc, void *data)
>>       if (!QTAILQ_EMPTY(&net_compares)) {
>>           QTAILQ_REMOVE(&net_compares, s, next);
>>       }
>> +    qemu_mutex_destroy(&s->conn_list_lock);
>>   }
>>   
>>   static void colo_compare_init(Object *obj)
>> diff --git a/trace-events b/trace-events
>> index ca7211b..8862288 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d"
>>   aspeed_vic_update_irq(int flags) "Raising IRQ: %d"
>>   aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32
>>   aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32
>> +
>> +# net/colo-compare.c
>> +colo_compare_main(const char *chr) "chr: %s"
>
>
> .
>
Jason Wang April 29, 2016, 2:05 a.m. UTC | #3
On 04/28/2016 06:25 PM, Zhang Chen wrote:
>>> +static Packet *packet_new(CompareState *s, const void *data,
>>> +                              int size, ConnectionKey *key)
>>> +{
>>> +    Packet *pkt = g_slice_new(Packet);
>>> +
>>> +    pkt->data = g_memdup(data, size);
>>> +    pkt->size = size;
>>> +    pkt->s = s;
>>> +
>>> +    if (connection_key_init(pkt, key)) {
>>> +        packet_destroy(pkt, NULL);
>>> +        pkt = NULL;
>>> +    }
>> Can we do connection_key_init() first? This avoids the packet_destroy()
>> call if it fails.
>
> Do you mean we should call connection_key_init() first
> and then call packet_new()?

Yes, only when connection_key_init() succeeds.

>
>
>>
>>> +
>>> +    return pkt;
>>> +}
>>> +
>>> +/*
>>> + * Return 0 on success, if return -1 means the pkt
>>> + * is unsupported(arp and ipv6) and will be sent later
>>> + */
>>> +static int packet_enqueue(CompareState *s, int mode)
>>> +{
>>> +    ConnectionKey key = {{ 0 } };
>>> +    Packet *pkt = NULL;
>>> +    Connection *conn;
>>> +
>>> +    if (mode == PRIMARY_IN) {
>>> +        pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len,
>>> &key);
>>> +    } else {
>>> +        pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len,
>>> &key);
>>> +    }
>>> +    if (!pkt) {
>>> +        return -1;
>>> +    }
>>> +
>>> +    conn = connection_get(s, &key);
>>> +    if (!conn->processing) {
>>> +        qemu_mutex_lock(&s->conn_list_lock);
>>> +        g_queue_push_tail(&s->conn_list, conn);
>>> +        qemu_mutex_unlock(&s->conn_list_lock);
>>> +        conn->processing = true;
>>> +    }
>>> +
>>> +    qemu_mutex_lock(&conn->list_lock);
>>> +    if (mode == PRIMARY_IN) {
>>> +        g_queue_push_tail(&conn->primary_list, pkt);
>>> +    } else {
>>> +        g_queue_push_tail(&conn->secondary_list, pkt);
>>> +    }
>>> +    qemu_mutex_unlock(&conn->list_lock);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void packet_destroy(void *opaque, void *user_data)
>>> +{
>>> +    Packet *pkt = opaque;
>>> +
>>> +    g_free(pkt->data);
>>> +    g_slice_free(Packet, pkt);
>>> +}
>>> +
>>>   static int compare_chr_send(CharDriverState *out,
>>>                               const uint8_t *buf,
>>>                               uint32_t size)
>>> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque,
>>> const uint8_t *buf, int size)
>>>         ret = compare_chr_fill_rstate(&s->pri_rs, buf, size);
>>>       if (ret == 1) {
>>> -        /* FIXME: enqueue to primary packet list */
>>> -        compare_chr_send(s->chr_out, s->pri_rs.buf,
>>> s->pri_rs.packet_len);
>>> +        if (packet_enqueue(s, PRIMARY_IN)) {
>>> +            trace_colo_compare_main("primary: unsupported packet in");
>>> +            compare_chr_send(s->chr_out, s->pri_rs.buf,
>>> s->pri_rs.packet_len);
>> Looks like if a packet was not recognized by connection_key_init(), it
>> will be sent directly without comparing it with the packet sent from
>> secondary? Is this expected?
>
> Yes,we will send primary's arp packet to get mac first.
>
> Thanks
> zhangchen

But what if the packet was not arp?
Zhang Chen April 29, 2016, 7:24 a.m. UTC | #4
On 04/29/2016 10:05 AM, Jason Wang wrote:
> On 04/28/2016 06:25 PM, Zhang Chen wrote:
>>>> +static Packet *packet_new(CompareState *s, const void *data,
>>>> +                              int size, ConnectionKey *key)
>>>> +{
>>>> +    Packet *pkt = g_slice_new(Packet);
>>>> +
>>>> +    pkt->data = g_memdup(data, size);
>>>> +    pkt->size = size;
>>>> +    pkt->s = s;
>>>> +
>>>> +    if (connection_key_init(pkt, key)) {
>>>> +        packet_destroy(pkt, NULL);
>>>> +        pkt = NULL;
>>>> +    }
>>> Can we do connection_key_init() first? This avoids the packet_destroy()
>>> call if it fails.
>> Do you mean we should call connection_key_init() first
>> and then call packet_new()?
> Yes, only when connection_key_init() succeed.

OK, I will fix this in the next version.

>>>> +
>>>> +    return pkt;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Return 0 on success, if return -1 means the pkt
>>>> + * is unsupported(arp and ipv6) and will be sent later
>>>> + */
>>>> +static int packet_enqueue(CompareState *s, int mode)
>>>> +{
>>>> +    ConnectionKey key = {{ 0 } };
>>>> +    Packet *pkt = NULL;
>>>> +    Connection *conn;
>>>> +
>>>> +    if (mode == PRIMARY_IN) {
>>>> +        pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len,
>>>> &key);
>>>> +    } else {
>>>> +        pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len,
>>>> &key);
>>>> +    }
>>>> +    if (!pkt) {
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    conn = connection_get(s, &key);
>>>> +    if (!conn->processing) {
>>>> +        qemu_mutex_lock(&s->conn_list_lock);
>>>> +        g_queue_push_tail(&s->conn_list, conn);
>>>> +        qemu_mutex_unlock(&s->conn_list_lock);
>>>> +        conn->processing = true;
>>>> +    }
>>>> +
>>>> +    qemu_mutex_lock(&conn->list_lock);
>>>> +    if (mode == PRIMARY_IN) {
>>>> +        g_queue_push_tail(&conn->primary_list, pkt);
>>>> +    } else {
>>>> +        g_queue_push_tail(&conn->secondary_list, pkt);
>>>> +    }
>>>> +    qemu_mutex_unlock(&conn->list_lock);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void packet_destroy(void *opaque, void *user_data)
>>>> +{
>>>> +    Packet *pkt = opaque;
>>>> +
>>>> +    g_free(pkt->data);
>>>> +    g_slice_free(Packet, pkt);
>>>> +}
>>>> +
>>>>    static int compare_chr_send(CharDriverState *out,
>>>>                                const uint8_t *buf,
>>>>                                uint32_t size)
>>>> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque,
>>>> const uint8_t *buf, int size)
>>>>          ret = compare_chr_fill_rstate(&s->pri_rs, buf, size);
>>>>        if (ret == 1) {
>>>> -        /* FIXME: enqueue to primary packet list */
>>>> -        compare_chr_send(s->chr_out, s->pri_rs.buf,
>>>> s->pri_rs.packet_len);
>>>> +        if (packet_enqueue(s, PRIMARY_IN)) {
>>>> +            trace_colo_compare_main("primary: unsupported packet in");
>>>> +            compare_chr_send(s->chr_out, s->pri_rs.buf,
>>>> s->pri_rs.packet_len);
>>> Looks like if a packet was not recognized by connection_key_init(), it
>>> will be sent directly without comparing it with the packet sent from
>>> secondary? Is this expected?
>> Yes,we will send primary's arp packet to get mac first.
>>
>> Thanks
>> zhangchen
> But what if the packet was not arp?
>
>
> .

RARP packets will be sent directly; IP packets will be enqueued.
diff mbox

Patch

diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h
new file mode 100644
index 0000000..8a8ff0f
--- /dev/null
+++ b/include/qemu/jhash.h
@@ -0,0 +1,59 @@ 
+/* jhash.h: Jenkins hash support.
+  *
+  * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
+  *
+  * http://burtleburtle.net/bob/hash/
+  *
+  * These are the credits from Bob's sources:
+  *
+  * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+  *
+  * These are functions for producing 32-bit hashes for hash table lookup.
+  * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+  * are externally useful functions.  Routines to test the hash are included
+  * if SELF_TEST is defined.  You can use this free for any purpose.It's in
+  * the public domain.  It has no warranty.
+  *
+  * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+  *
+  * I've modified Bob's hash to be useful in the Linux kernel, and
+  * any bugs present are my fault.
+  * Jozsef
+  */
+
+#ifndef QEMU_JHASH_H__
+#define QEMU_JHASH_H__
+
+#include "qemu/bitops.h"
+
+/*
+ * The hash helpers below are copied from the Linux kernel's jhash.
+ */
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c)                \
+{                                           \
+    a -= c;  a ^= rol32(c, 4);  c += b;     \
+    b -= a;  b ^= rol32(a, 6);  a += c;     \
+    c -= b;  c ^= rol32(b, 8);  b += a;     \
+    a -= c;  a ^= rol32(c, 16); c += b;     \
+    b -= a;  b ^= rol32(a, 19); a += c;     \
+    c -= b;  c ^= rol32(b, 4);  b += a;     \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c)  \
+{                               \
+    c ^= b; c -= rol32(b, 14);  \
+    a ^= c; a -= rol32(c, 11);  \
+    b ^= a; b -= rol32(a, 25);  \
+    c ^= b; c -= rol32(b, 16);  \
+    a ^= c; a -= rol32(c, 4);   \
+    b ^= a; b -= rol32(a, 14);  \
+    c ^= b; c -= rol32(b, 24);  \
+}
+
+/* An arbitrary initial parameter */
+#define JHASH_INITVAL           0xdeadbeef
+
+#endif /* QEMU_JHASH_H__ */
diff --git a/net/colo-compare.c b/net/colo-compare.c
index c45b132..dc57eac 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -22,12 +22,16 @@ 
 #include "qemu/sockets.h"
 #include "qapi-visit.h"
 #include "trace.h"
+#include "slirp/slirp.h"
+#include "qemu/jhash.h"
+#include "net/eth.h"
 
 #define TYPE_COLO_COMPARE "colo-compare"
 #define COLO_COMPARE(obj) \
     OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
 
 #define COMPARE_READ_LEN_MAX NET_BUFSIZE
+#define HASHTABLE_MAX_SIZE 16384
 
 static QTAILQ_HEAD(, CompareState) net_compares =
        QTAILQ_HEAD_INITIALIZER(net_compares);
@@ -39,6 +43,28 @@  typedef struct ReadState {
     uint8_t buf[COMPARE_READ_LEN_MAX];
 } ReadState;
 
+/*
+  + CompareState ++
+  |               |
+  +---------------+   +---------------+         +---------------+
+  |conn list      +--->conn           +--------->conn           |
+  +---------------+   +---------------+         +---------------+
+  |               |     |           |             |          |
+  +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             |          |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             |          |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+*/
 typedef struct CompareState {
     Object parent;
 
@@ -51,12 +77,265 @@  typedef struct CompareState {
     QTAILQ_ENTRY(CompareState) next;
     ReadState pri_rs;
     ReadState sec_rs;
+
+    /* connection list: the connections belonging to this NIC can be found
+     * in this list.
+     * element type: Connection
+     */
+    GQueue conn_list;
+    QemuMutex conn_list_lock; /* to protect conn_list */
+    /* hashtable to save connection */
+    GHashTable *connection_track_table;
+    /* to save unprocessed_connections */
+    GQueue unprocessed_connections;
+    /* proxy current hash size */
+    uint32_t hashtable_size;
 } CompareState;
 
 typedef struct CompareClass {
     ObjectClass parent_class;
 } CompareClass;
 
+typedef struct Packet { /* one buffered frame and pointers into its headers */
+    void *data; /* frame bytes: g_memdup() copy made by packet_new() */
+    union {
+        uint8_t *network_layer; /* start of the L3 header inside data */
+        struct ip *ip; /* the same pointer viewed as an IPv4 header */
+    };
+    uint8_t *transport_layer; /* network_layer + IPv4 header length */
+    int size; /* length of data in bytes, as passed to packet_new() */
+    CompareState *s; /* CompareState that was passed to packet_new() */
+} Packet;
+
+typedef struct ConnectionKey { /* hash-table key identifying a connection */
+    /*
+     * (src, dst) must be grouped, in the same way as in the IP header.
+     * connection_key_equal() compares keys with memcmp() over the whole
+     * struct, so every byte of a key takes part in the comparison.
+     */
+    struct in_addr src; /* copied from the packet's ip_src */
+    struct in_addr dst; /* copied from the packet's ip_dst */
+    uint16_t src_port; /* ntohs() result; 0 for other IP protocols */
+    uint16_t dst_port; /* ntohs() result; 0 for other IP protocols */
+    uint8_t ip_proto; /* copied from the packet's ip_p */
+} QEMU_PACKED ConnectionKey;
+
+typedef struct Connection { /* per-connection pair of packet queues */
+    QemuMutex list_lock; /* taken around primary_list/secondary_list use */
+    /* connection primary send queue: element type: Packet */
+    GQueue primary_list;
+    /* connection secondary send queue: element type: Packet */
+    GQueue secondary_list;
+    /* set by packet_enqueue() once the connection was pushed to conn_list */
+    bool processing;
+    uint8_t ip_proto; /* copied from ConnectionKey.ip_proto */
+} Connection;
+
+enum { /* values for the mode argument of packet_enqueue() */
+    PRIMARY_IN = 0, /* packet read from pri_rs, queued on primary_list */
+    SECONDARY_IN, /* packet read from sec_rs, queued on secondary_list */
+};
+
+static void packet_destroy(void *opaque, void *user_data);
+static int compare_chr_send(CharDriverState *out,
+                            const uint8_t *buf,
+                            uint32_t size);
+
+/*
+ * Hash callback of connection_track_table: Jenkins hash over the fields
+ * of a ConnectionKey.
+ *
+ * @opaque: pointer to the ConnectionKey to hash
+ *
+ * Returns the 32-bit hash value.
+ */
+static uint32_t connection_key_hash(const void *opaque)
+{
+    const ConnectionKey *key = opaque;
+    uint32_t a, b, c;
+
+    /* Jenkins hash */
+    a = b = c = JHASH_INITVAL + sizeof(*key);
+    a += key->src.s_addr;
+    b += key->dst.s_addr;
+    /*
+     * Widen dst_port before shifting: a uint16_t is promoted to signed int,
+     * and shifting a value >= 0x8000 left by 16 would overflow it.
+     */
+    c += (key->src_port | ((uint32_t)key->dst_port << 16));
+    __jhash_mix(a, b, c);
+
+    a += key->ip_proto;
+    __jhash_final(a, b, c);
+
+    return c;
+}
+
+/*
+ * Equality callback of connection_track_table: two keys are equal when all
+ * sizeof(ConnectionKey) bytes match. Returns 1 if equal, 0 otherwise.
+ */
+static int connection_key_equal(const void *opaque1, const void *opaque2)
+{
+    const ConnectionKey *lhs = opaque1;
+    const ConnectionKey *rhs = opaque2;
+
+    return !memcmp(lhs, rhs, sizeof(*lhs));
+}
+
+/*
+ * Initialize the connection key for a packet.
+ *
+ * Parses the Ethernet and IPv4 headers in pkt->data, records where the
+ * network and transport headers start in @pkt, and fills @key with the
+ * addresses, the IP protocol and, for the protocols listed below, the two
+ * 16-bit port fields.
+ *
+ * @pkt: packet whose data and size fields are already set
+ * @key: key to fill in
+ *
+ * Returns 0 on success. Returns 1 if the frame is not IPv4 or is too short
+ * to hold its headers; the caller then sends the packet later instead of
+ * queueing it.
+ */
+static int connection_key_init(Packet *pkt, ConnectionKey *key)
+{
+    uint8_t *data = pkt->data;
+    ssize_t l2hdr_len;
+    size_t remaining, network_length;
+    int ports_offset;
+    uint16_t l3_proto;
+    uint16_t ports[2];
+
+    /* The Ethernet header must be present before it can be inspected. */
+    if (pkt->size < ETH_HLEN) {
+        return 1;
+    }
+
+    l2hdr_len = eth_get_l2_hdr_length(data);
+    if (l2hdr_len < ETH_HLEN || l2hdr_len > pkt->size) {
+        return 1;
+    }
+    remaining = pkt->size - l2hdr_len;
+
+    /*
+     * Use the reported L2 length rather than ETH_HLEN, so the network
+     * header is taken from the same offset eth_get_l3_proto() works with.
+     */
+    pkt->network_layer = data + l2hdr_len;
+    l3_proto = eth_get_l3_proto(data, l2hdr_len);
+    if (l3_proto != ETH_P_IP) {
+        return 1;
+    }
+
+    /* Never read IP header fields that lie beyond the end of the buffer. */
+    if (remaining < sizeof(struct ip)) {
+        return 1;
+    }
+    network_length = pkt->ip->ip_hl * 4;
+    if (network_length < sizeof(struct ip) || network_length > remaining) {
+        return 1;
+    }
+    remaining -= network_length;
+
+    pkt->transport_layer = pkt->network_layer + network_length;
+    key->ip_proto = pkt->ip->ip_p;
+    key->src = pkt->ip->ip_src;
+    key->dst = pkt->ip->ip_dst;
+
+    switch (key->ip_proto) {
+    case IPPROTO_TCP:
+    case IPPROTO_UDP:
+    case IPPROTO_DCCP:
+    case IPPROTO_ESP:
+    case IPPROTO_SCTP:
+    case IPPROTO_UDPLITE:
+        ports_offset = 0;
+        break;
+    case IPPROTO_AH:
+        ports_offset = 4;
+        break;
+    default:
+        ports_offset = -1;
+        break;
+    }
+
+    key->src_port = 0;
+    key->dst_port = 0;
+    if (ports_offset >= 0 && remaining >= ports_offset + sizeof(ports)) {
+        /*
+         * Copy the two 16-bit fields out with memcpy(): the transport
+         * header may be unaligned, and converting each field separately
+         * gives the same result on little- and big-endian hosts.
+         */
+        memcpy(ports, pkt->transport_layer + ports_offset, sizeof(ports));
+        key->src_port = ntohs(ports[0]);
+        key->dst_port = ntohs(ports[1]);
+    }
+
+    return 0;
+}
+
+/*
+ * Allocate a Connection for @key with empty primary and secondary packet
+ * queues. Only key->ip_proto is copied from the key; the new connection
+ * starts with processing == false.
+ */
+static Connection *connection_new(ConnectionKey *key)
+{
+    Connection *fresh = g_slice_new(Connection);
+
+    g_queue_init(&fresh->primary_list);
+    g_queue_init(&fresh->secondary_list);
+    fresh->processing = false;
+    fresh->ip_proto = key->ip_proto;
+    qemu_mutex_init(&fresh->list_lock);
+
+    return fresh;
+}
+
+/*
+ * Remove every entry from the connection hash table so that it cannot keep
+ * growing, and restart the entry counter at zero.
+ */
+static void connection_hashtable_reset(CompareState *s)
+{
+    g_hash_table_remove_all(s->connection_track_table);
+    s->hashtable_size = 0;
+}
+
+/*
+ * Look up the connection for @key; if not found, create a new connection
+ * and add it to the hash table.
+ *
+ * @s: compare state holding connection_track_table
+ * @key: key to look up; a heap copy of it is stored on insertion
+ *
+ * When the table already holds HASHTABLE_MAX_SIZE entries it is emptied,
+ * which destroys every tracked Connection and its queued packets, before
+ * the new entry is inserted.
+ *
+ * Returns the existing or the newly created Connection.
+ */
+static Connection *connection_get(CompareState *s, ConnectionKey *key)
+{
+    /* FIXME: protect connection_track_table */
+    Connection *conn = g_hash_table_lookup(s->connection_track_table, key);
+
+    if (conn == NULL) {
+        ConnectionKey *new_key = g_memdup(key, sizeof(*key));
+
+        conn = connection_new(key);
+
+        if (s->hashtable_size >= HASHTABLE_MAX_SIZE) {
+            error_report("colo proxy connection hashtable full, clear it");
+            /*
+             * The reset below frees every Connection, so first drop the
+             * pointers that conn_list still holds to them.
+             */
+            qemu_mutex_lock(&s->conn_list_lock);
+            g_queue_clear(&s->conn_list);
+            qemu_mutex_unlock(&s->conn_list_lock);
+            connection_hashtable_reset(s);
+        }
+
+        g_hash_table_insert(s->connection_track_table, new_key, conn);
+        /* Count after any reset so the counter matches the table content. */
+        s->hashtable_size++;
+    }
+
+    return conn;
+}
+
+/*
+ * Value destructor of connection_track_table: frees every packet still
+ * queued on the connection, then the connection itself.
+ *
+ * @opaque: the Connection to destroy
+ */
+static void connection_destroy(void *opaque)
+{
+    Connection *conn = opaque;
+
+    qemu_mutex_lock(&conn->list_lock);
+    /*
+     * Both queues are embedded in Connection rather than allocated with
+     * g_queue_new(), so only their links are released with g_queue_clear();
+     * g_queue_free() would also try to free the GQueue struct itself.
+     */
+    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->primary_list);
+    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->secondary_list);
+    qemu_mutex_unlock(&conn->list_lock);
+    qemu_mutex_destroy(&conn->list_lock);
+    g_slice_free(Connection, conn);
+}
+
+/*
+ * Build a Packet holding a private copy of @size bytes from @data and fill
+ * @key from its headers.
+ *
+ * Returns the new Packet, or NULL (after releasing the copy) when
+ * connection_key_init() reports that the packet cannot be keyed.
+ */
+static Packet *packet_new(CompareState *s, const void *data,
+                              int size, ConnectionKey *key)
+{
+    Packet *fresh = g_slice_new(Packet);
+
+    fresh->s = s;
+    fresh->size = size;
+    fresh->data = g_memdup(data, size);
+
+    if (connection_key_init(fresh, key) == 0) {
+        return fresh;
+    }
+
+    packet_destroy(fresh, NULL);
+    return NULL;
+}
+
+/*
+ * Copy the packet currently held in the primary or secondary read state
+ * and append it to the matching queue of its connection.
+ *
+ * Returns 0 on success; -1 means the packet is unsupported (arp and ipv6)
+ * and will be sent later.
+ */
+static int packet_enqueue(CompareState *s, int mode)
+{
+    ReadState *rs = (mode == PRIMARY_IN) ? &s->pri_rs : &s->sec_rs;
+    ConnectionKey key = {{ 0 } };
+    Connection *conn;
+    GQueue *target;
+    Packet *pkt;
+
+    pkt = packet_new(s, rs->buf, rs->packet_len, &key);
+    if (pkt == NULL) {
+        return -1;
+    }
+
+    conn = connection_get(s, &key);
+    if (!conn->processing) {
+        /* first packet seen for this connection: publish it on conn_list */
+        qemu_mutex_lock(&s->conn_list_lock);
+        g_queue_push_tail(&s->conn_list, conn);
+        qemu_mutex_unlock(&s->conn_list_lock);
+        conn->processing = true;
+    }
+
+    target = (mode == PRIMARY_IN) ? &conn->primary_list
+                                  : &conn->secondary_list;
+
+    qemu_mutex_lock(&conn->list_lock);
+    g_queue_push_tail(target, pkt);
+    qemu_mutex_unlock(&conn->list_lock);
+
+    return 0;
+}
+
+/*
+ * Release a Packet and the data copy it holds. The signature matches what
+ * g_queue_foreach() expects; @user_data is ignored.
+ */
+static void packet_destroy(void *opaque, void *user_data)
+{
+    Packet *victim = opaque;
+    void *payload = victim->data;
+
+    g_slice_free(Packet, victim);
+    g_free(payload);
+}
+
 static int compare_chr_send(CharDriverState *out,
                             const uint8_t *buf,
                             uint32_t size)
@@ -158,8 +437,10 @@  static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
 
     ret = compare_chr_fill_rstate(&s->pri_rs, buf, size);
     if (ret == 1) {
-        /* FIXME: enqueue to primary packet list */
-        compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);
+        if (packet_enqueue(s, PRIMARY_IN)) {
+            trace_colo_compare_main("primary: unsupported packet in");
+            compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len);
+        }
     } else if (ret == -1) {
         qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL);
     }
@@ -176,9 +457,11 @@  static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
 
     ret = compare_chr_fill_rstate(&s->sec_rs, buf, size);
     if (ret == 1) {
-        /* TODO: enqueue to secondary packet list*/
-        /* should we send sec arp pkt? */
-        compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
+        if (packet_enqueue(s, SECONDARY_IN)) {
+            trace_colo_compare_main("secondary: unsupported packet in");
+            /* should we send sec arp pkt? */
+            compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len);
+        }
     } else if (ret == -1) {
         qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL);
     }
@@ -280,6 +563,15 @@  static void colo_compare_complete(UserCreatable *uc, Error **errp)
     qemu_chr_fe_claim_no_fail(s->chr_out);
     QTAILQ_INSERT_TAIL(&net_compares, s, next);
 
+    g_queue_init(&s->conn_list);
+    qemu_mutex_init(&s->conn_list_lock);
+    s->hashtable_size = 0;
+
+    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+                                                      connection_key_equal,
+                                                      g_free,
+                                                      connection_destroy);
+
     return;
 }
 
@@ -314,6 +606,7 @@  static void colo_compare_class_finalize(ObjectClass *oc, void *data)
     if (!QTAILQ_EMPTY(&net_compares)) {
         QTAILQ_REMOVE(&net_compares, s, next);
     }
+    qemu_mutex_destroy(&s->conn_list_lock);
 }
 
 static void colo_compare_init(Object *obj)
diff --git a/trace-events b/trace-events
index ca7211b..8862288 100644
--- a/trace-events
+++ b/trace-events
@@ -1916,3 +1916,6 @@  aspeed_vic_update_fiq(int flags) "Raising FIQ: %d"
 aspeed_vic_update_irq(int flags) "Raising IRQ: %d"
 aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32
 aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32
+
+# net/colo-compare.c
+colo_compare_main(const char *chr) "chr: %s"