Message ID | 1460977906-25218-3-git-send-email-zhangchen.fnst@cn.fujitsu.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 04/18/2016 07:11 PM, Zhang Chen wrote: > In this patch we use kernel jhash table to track > connection, and then enqueue net packet like this: > > + CompareState ++ > | | > +---------------+ +---------------+ +---------------+ > |conn list +--->conn +--------->conn | > +---------------+ +---------------+ +---------------+ > | | | | | | > +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ > |primary | |secondary |primary | |secondary > |packet | |packet + |packet | |packet + > +--------+ +--------+ +--------+ +--------+ > | | | | > +---v----+ +---v----+ +---v----+ +---v----+ > |primary | |secondary |primary | |secondary > |packet | |packet + |packet | |packet + > +--------+ +--------+ +--------+ +--------+ > | | | | > +---v----+ +---v----+ +---v----+ +---v----+ > |primary | |secondary |primary | |secondary > |packet | |packet + |packet | |packet + > +--------+ +--------+ +--------+ +--------+ > > Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> > Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> > Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> > --- > include/qemu/jhash.h | 59 ++++++++++ > net/colo-compare.c | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++- > trace-events | 3 + > 3 files changed, 360 insertions(+), 5 deletions(-) > create mode 100644 include/qemu/jhash.h > > diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h > new file mode 100644 > index 0000000..8a8ff0f > --- /dev/null > +++ b/include/qemu/jhash.h > @@ -0,0 +1,59 @@ > +/* jhash.h: Jenkins hash support. > + * > + * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) > + * > + * http://burtleburtle.net/bob/hash/ > + * > + * These are the credits from Bob's sources: > + * > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. > + * > + * These are functions for producing 32-bit hashes for hash table lookup. > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() > + * are externally useful functions. 
Routines to test the hash are included > + * if SELF_TEST is defined. You can use this free for any purpose.It's in > + * the public domain. It has no warranty. > + * > + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) > + * > + * I've modified Bob's hash to be useful in the Linux kernel, and > + * any bugs present are my fault. > + * Jozsef > + */ > + > +#ifndef QEMU_JHASH_H__ > +#define QEMU_JHASH_H__ > + > +#include "qemu/bitops.h" > + > +/* > + * hashtable related is copied from linux kernel jhash > + */ > + > +/* __jhash_mix -- mix 3 32-bit values reversibly. */ > +#define __jhash_mix(a, b, c) \ > +{ \ > + a -= c; a ^= rol32(c, 4); c += b; \ > + b -= a; b ^= rol32(a, 6); a += c; \ > + c -= b; c ^= rol32(b, 8); b += a; \ > + a -= c; a ^= rol32(c, 16); c += b; \ > + b -= a; b ^= rol32(a, 19); a += c; \ > + c -= b; c ^= rol32(b, 4); b += a; \ > +} > + > +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ > +#define __jhash_final(a, b, c) \ > +{ \ > + c ^= b; c -= rol32(b, 14); \ > + a ^= c; a -= rol32(c, 11); \ > + b ^= a; b -= rol32(a, 25); \ > + c ^= b; c -= rol32(b, 16); \ > + a ^= c; a -= rol32(c, 4); \ > + b ^= a; b -= rol32(a, 14); \ > + c ^= b; c -= rol32(b, 24); \ > +} > + > +/* An arbitrary initial parameter */ > +#define JHASH_INITVAL 0xdeadbeef > + > +#endif /* QEMU_JHASH_H__ */ > diff --git a/net/colo-compare.c b/net/colo-compare.c > index c45b132..dc57eac 100644 > --- a/net/colo-compare.c > +++ b/net/colo-compare.c > @@ -22,12 +22,16 @@ > #include "qemu/sockets.h" > #include "qapi-visit.h" > #include "trace.h" > +#include "slirp/slirp.h" > +#include "qemu/jhash.h" > +#include "net/eth.h" > > #define TYPE_COLO_COMPARE "colo-compare" > #define COLO_COMPARE(obj) \ > OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) > > #define COMPARE_READ_LEN_MAX NET_BUFSIZE > +#define HASHTABLE_MAX_SIZE 16384 > > static QTAILQ_HEAD(, CompareState) net_compares = > QTAILQ_HEAD_INITIALIZER(net_compares); > @@ -39,6 +43,28 @@ 
typedef struct ReadState { > uint8_t buf[COMPARE_READ_LEN_MAX]; > } ReadState; > > +/* > + + CompareState ++ > + | | > + +---------------+ +---------------+ +---------------+ > + |conn list +--->conn +--------->conn | > + +---------------+ +---------------+ +---------------+ > + | | | | | | > + +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ > + |primary | |secondary |primary | |secondary > + |packet | |packet + |packet | |packet + > + +--------+ +--------+ +--------+ +--------+ > + | | | | > + +---v----+ +---v----+ +---v----+ +---v----+ > + |primary | |secondary |primary | |secondary > + |packet | |packet + |packet | |packet + > + +--------+ +--------+ +--------+ +--------+ > + | | | | > + +---v----+ +---v----+ +---v----+ +---v----+ > + |primary | |secondary |primary | |secondary > + |packet | |packet + |packet | |packet + > + +--------+ +--------+ +--------+ +--------+ > +*/ > typedef struct CompareState { > Object parent; > > @@ -51,12 +77,265 @@ typedef struct CompareState { > QTAILQ_ENTRY(CompareState) next; > ReadState pri_rs; > ReadState sec_rs; > + > + /* connection list: the connections belonged to this NIC could be found > + * in this list. > + * element type: Connection > + */ > + GQueue conn_list; > + QemuMutex conn_list_lock; /* to protect conn_list */ > + /* hashtable to save connection */ > + GHashTable *connection_track_table; > + /* to save unprocessed_connections */ > + GQueue unprocessed_connections; > + /* proxy current hash size */ > + uint32_t hashtable_size; > } CompareState; > > typedef struct CompareClass { > ObjectClass parent_class; > } CompareClass; > > +typedef struct Packet { > + void *data; > + union { > + uint8_t *network_layer; > + struct ip *ip; Does this mean ipv6 is not supported? 
> + }; > + uint8_t *transport_layer; > + int size; > + CompareState *s; > +} Packet; > + > +typedef struct ConnectionKey { > + /* (src, dst) must be grouped, in the same way than in IP header */ > + struct in_addr src; > + struct in_addr dst; > + uint16_t src_port; > + uint16_t dst_port; > + uint8_t ip_proto; > +} QEMU_PACKED ConnectionKey; > + > +typedef struct Connection { > + QemuMutex list_lock; > + /* connection primary send queue: element type: Packet */ > + GQueue primary_list; > + /* connection secondary send queue: element type: Packet */ > + GQueue secondary_list; > + /* flag to enqueue unprocessed_connections */ > + bool processing; > + uint8_t ip_proto; > +} Connection; > + > +enum { > + PRIMARY_IN = 0, > + SECONDARY_IN, > +}; > + > +static void packet_destroy(void *opaque, void *user_data); > +static int compare_chr_send(CharDriverState *out, > + const uint8_t *buf, > + uint32_t size); > + > +static uint32_t connection_key_hash(const void *opaque) > +{ > + const ConnectionKey *key = opaque; > + uint32_t a, b, c; > + > + /* Jenkins hash */ > + a = b = c = JHASH_INITVAL + sizeof(*key); > + a += key->src.s_addr; > + b += key->dst.s_addr; > + c += (key->src_port | key->dst_port << 16); > + __jhash_mix(a, b, c); > + > + a += key->ip_proto; > + __jhash_final(a, b, c); > + > + return c; > +} > + > +static int connection_key_equal(const void *opaque1, const void *opaque2) > +{ > + return memcmp(opaque1, opaque2, sizeof(ConnectionKey)) == 0; So why not use ConnectionKey *, considering we're sure of the type? > +} > + > +/* > + * initialize connecon_key for packet > + * Return 0 on success, if return 1 the pkt will be sent later > + */ > +static int connection_key_init(Packet *pkt, ConnectionKey *key) > +{ > + int network_length; > + uint8_t *data = pkt->data; > + uint16_t l3_proto; > + uint32_t tmp_ports; > + ssize_t l2hdr_len = eth_get_l2_hdr_length(data); > + > + pkt->network_layer = data + ETH_HLEN; Can the length of data be shorter than ETH_HLEN?
> + l3_proto = eth_get_l3_proto(data, l2hdr_len); > + if (l3_proto != ETH_P_IP) { > + return 1; > + } > + > + network_length = pkt->ip->ip_hl * 4; > + pkt->transport_layer = pkt->network_layer + network_length; Do we need a sanity check to make sure there's no evil network_length here? > + if (!pkt->transport_layer) { > + error_report("pkt->transport_layer is valid"); invalid? And if this is caused by the bad packet itself, there's no need for an error_report. > + return 1; > + } > + key->ip_proto = pkt->ip->ip_p; > + key->src = pkt->ip->ip_src; > + key->dst = pkt->ip->ip_dst; > + > + switch (key->ip_proto) { > + case IPPROTO_TCP: > + case IPPROTO_UDP: > + case IPPROTO_DCCP: > + case IPPROTO_ESP: > + case IPPROTO_SCTP: > + case IPPROTO_UDPLITE: > + tmp_ports = *(uint32_t *)(pkt->transport_layer); > + key->src_port = ntohs(tmp_ports & 0xffff); > + key->dst_port = ntohs(tmp_ports >> 16); > + break; > + case IPPROTO_AH: > + tmp_ports = *(uint32_t *)(pkt->transport_layer + 4); > + key->src_port = ntohs(tmp_ports & 0xffff); > + key->dst_port = ntohs(tmp_ports >> 16); > + break; > + default: > + key->src_port = 0; > + key->dst_port = 0; > + break; > + } > + > + return 0; > +} > + > +static Connection *connection_new(ConnectionKey *key) > +{ > + Connection *conn = g_slice_new(Connection); > + > + qemu_mutex_init(&conn->list_lock); > + conn->ip_proto = key->ip_proto; > + conn->processing = false; > + g_queue_init(&conn->primary_list); > + g_queue_init(&conn->secondary_list); > + > + return conn; > +} > + > +/* > + * Clear hashtable, stop this hash growing really huge > + */ > +static void connection_hashtable_reset(CompareState *s) > +{ > + s->hashtable_size = 0; > + g_hash_table_remove_all(s->connection_track_table); > +} > + > +/* if not found, create a new connection and add to hash table */ > +static Connection *connection_get(CompareState *s, ConnectionKey *key) > +{ > + /* FIXME: protect connection_track_table */ > + Connection *conn =
g_hash_table_lookup(s->connection_track_table, key); > + > + if (conn == NULL) { > + ConnectionKey *new_key = g_memdup(key, sizeof(*key)); > + > + conn = connection_new(key); > + > + s->hashtable_size++; > + if (s->hashtable_size > HASHTABLE_MAX_SIZE) { > + error_report("colo proxy connection hashtable full, clear it"); > + connection_hashtable_reset(s); > + /* TODO:clear conn_list */ > + } > + > + g_hash_table_insert(s->connection_track_table, new_key, conn); > + } > + > + return conn; > +} > + > +static void connection_destroy(void *opaque) > +{ > + Connection *conn = opaque; > + > + qemu_mutex_lock(&conn->list_lock); Like I said in the previous patch, if you do all the processing in the colo compare thread, you can avoid almost all synchronization (e.g. mutex). > + g_queue_foreach(&conn->primary_list, packet_destroy, NULL); > + g_queue_free(&conn->primary_list); > + g_queue_foreach(&conn->secondary_list, packet_destroy, NULL); > + g_queue_free(&conn->secondary_list); > + qemu_mutex_unlock(&conn->list_lock); > + qemu_mutex_destroy(&conn->list_lock); > + g_slice_free(Connection, conn); > +} > + > +static Packet *packet_new(CompareState *s, const void *data, > + int size, ConnectionKey *key) > +{ > + Packet *pkt = g_slice_new(Packet); > + > + pkt->data = g_memdup(data, size); > + pkt->size = size; > + pkt->s = s; > + > + if (connection_key_init(pkt, key)) { > + packet_destroy(pkt, NULL); > + pkt = NULL; > + } Can we do connection_key_init() first? This would avoid packet_destroy() if it fails.
> + > + return pkt; > +} > + > +/* > + * Return 0 on success, if return -1 means the pkt > + * is unsupported(arp and ipv6) and will be sent later > + */ > +static int packet_enqueue(CompareState *s, int mode) > +{ > + ConnectionKey key = {{ 0 } }; > + Packet *pkt = NULL; > + Connection *conn; > + > + if (mode == PRIMARY_IN) { > + pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, &key); > + } else { > + pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, &key); > + } > + if (!pkt) { > + return -1; > + } > + > + conn = connection_get(s, &key); > + if (!conn->processing) { > + qemu_mutex_lock(&s->conn_list_lock); > + g_queue_push_tail(&s->conn_list, conn); > + qemu_mutex_unlock(&s->conn_list_lock); > + conn->processing = true; > + } > + > + qemu_mutex_lock(&conn->list_lock); > + if (mode == PRIMARY_IN) { > + g_queue_push_tail(&conn->primary_list, pkt); > + } else { > + g_queue_push_tail(&conn->secondary_list, pkt); > + } > + qemu_mutex_unlock(&conn->list_lock); > + > + return 0; > +} > + > +static void packet_destroy(void *opaque, void *user_data) > +{ > + Packet *pkt = opaque; > + > + g_free(pkt->data); > + g_slice_free(Packet, pkt); > +} > + > static int compare_chr_send(CharDriverState *out, > const uint8_t *buf, > uint32_t size) > @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size) > > ret = compare_chr_fill_rstate(&s->pri_rs, buf, size); > if (ret == 1) { > - /* FIXME: enqueue to primary packet list */ > - compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); > + if (packet_enqueue(s, PRIMARY_IN)) { > + trace_colo_compare_main("primary: unsupported packet in"); > + compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); Looks like if a packet was not recognized by connection_key_init(), it will be sent directly without comparing it with the packet sent from secondary? Is this expected? 
> + } > } else if (ret == -1) { > qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL); > } > @@ -176,9 +457,11 @@ static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size) > > ret = compare_chr_fill_rstate(&s->sec_rs, buf, size); > if (ret == 1) { > - /* TODO: enqueue to secondary packet list*/ > - /* should we send sec arp pkt? */ > - compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); > + if (packet_enqueue(s, SECONDARY_IN)) { > + trace_colo_compare_main("secondary: unsupported packet in"); > + /* should we send sec arp pkt? */ > + compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); > + } > } else if (ret == -1) { > qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL); > } > @@ -280,6 +563,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) > qemu_chr_fe_claim_no_fail(s->chr_out); > QTAILQ_INSERT_TAIL(&net_compares, s, next); > > + g_queue_init(&s->conn_list); > + qemu_mutex_init(&s->conn_list_lock); > + s->hashtable_size = 0; > + > + s->connection_track_table = g_hash_table_new_full(connection_key_hash, > + connection_key_equal, > + g_free, > + connection_destroy); > + > return; > } > > @@ -314,6 +606,7 @@ static void colo_compare_class_finalize(ObjectClass *oc, void *data) > if (!QTAILQ_EMPTY(&net_compares)) { > QTAILQ_REMOVE(&net_compares, s, next); > } > + qemu_mutex_destroy(&s->conn_list_lock); > } > > static void colo_compare_init(Object *obj) > diff --git a/trace-events b/trace-events > index ca7211b..8862288 100644 > --- a/trace-events > +++ b/trace-events > @@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d" > aspeed_vic_update_irq(int flags) "Raising IRQ: %d" > aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32 > aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32 > + > +# net/colo-compare.c > +colo_compare_main(const char *chr) "chr: %s"
On 04/28/2016 03:47 PM, Jason Wang wrote: > > On 04/18/2016 07:11 PM, Zhang Chen wrote: >> In this patch we use kernel jhash table to track >> connection, and then enqueue net packet like this: >> >> + CompareState ++ >> | | >> +---------------+ +---------------+ +---------------+ >> |conn list +--->conn +--------->conn | >> +---------------+ +---------------+ +---------------+ >> | | | | | | >> +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ >> |primary | |secondary |primary | |secondary >> |packet | |packet + |packet | |packet + >> +--------+ +--------+ +--------+ +--------+ >> | | | | >> +---v----+ +---v----+ +---v----+ +---v----+ >> |primary | |secondary |primary | |secondary >> |packet | |packet + |packet | |packet + >> +--------+ +--------+ +--------+ +--------+ >> | | | | >> +---v----+ +---v----+ +---v----+ +---v----+ >> |primary | |secondary |primary | |secondary >> |packet | |packet + |packet | |packet + >> +--------+ +--------+ +--------+ +--------+ >> >> Signed-off-by: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> >> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> >> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> >> --- >> include/qemu/jhash.h | 59 ++++++++++ >> net/colo-compare.c | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++- >> trace-events | 3 + >> 3 files changed, 360 insertions(+), 5 deletions(-) >> create mode 100644 include/qemu/jhash.h >> >> diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h >> new file mode 100644 >> index 0000000..8a8ff0f >> --- /dev/null >> +++ b/include/qemu/jhash.h >> @@ -0,0 +1,59 @@ >> +/* jhash.h: Jenkins hash support. >> + * >> + * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) >> + * >> + * http://burtleburtle.net/bob/hash/ >> + * >> + * These are the credits from Bob's sources: >> + * >> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. >> + * >> + * These are functions for producing 32-bit hashes for hash table lookup. 
>> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() >> + * are externally useful functions. Routines to test the hash are included >> + * if SELF_TEST is defined. You can use this free for any purpose.It's in >> + * the public domain. It has no warranty. >> + * >> + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) >> + * >> + * I've modified Bob's hash to be useful in the Linux kernel, and >> + * any bugs present are my fault. >> + * Jozsef >> + */ >> + >> +#ifndef QEMU_JHASH_H__ >> +#define QEMU_JHASH_H__ >> + >> +#include "qemu/bitops.h" >> + >> +/* >> + * hashtable related is copied from linux kernel jhash >> + */ >> + >> +/* __jhash_mix -- mix 3 32-bit values reversibly. */ >> +#define __jhash_mix(a, b, c) \ >> +{ \ >> + a -= c; a ^= rol32(c, 4); c += b; \ >> + b -= a; b ^= rol32(a, 6); a += c; \ >> + c -= b; c ^= rol32(b, 8); b += a; \ >> + a -= c; a ^= rol32(c, 16); c += b; \ >> + b -= a; b ^= rol32(a, 19); a += c; \ >> + c -= b; c ^= rol32(b, 4); b += a; \ >> +} >> + >> +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ >> +#define __jhash_final(a, b, c) \ >> +{ \ >> + c ^= b; c -= rol32(b, 14); \ >> + a ^= c; a -= rol32(c, 11); \ >> + b ^= a; b -= rol32(a, 25); \ >> + c ^= b; c -= rol32(b, 16); \ >> + a ^= c; a -= rol32(c, 4); \ >> + b ^= a; b -= rol32(a, 14); \ >> + c ^= b; c -= rol32(b, 24); \ >> +} >> + >> +/* An arbitrary initial parameter */ >> +#define JHASH_INITVAL 0xdeadbeef >> + >> +#endif /* QEMU_JHASH_H__ */ >> diff --git a/net/colo-compare.c b/net/colo-compare.c >> index c45b132..dc57eac 100644 >> --- a/net/colo-compare.c >> +++ b/net/colo-compare.c >> @@ -22,12 +22,16 @@ >> #include "qemu/sockets.h" >> #include "qapi-visit.h" >> #include "trace.h" >> +#include "slirp/slirp.h" >> +#include "qemu/jhash.h" >> +#include "net/eth.h" >> >> #define TYPE_COLO_COMPARE "colo-compare" >> #define COLO_COMPARE(obj) \ >> OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) >> >> #define 
COMPARE_READ_LEN_MAX NET_BUFSIZE >> +#define HASHTABLE_MAX_SIZE 16384 >> >> static QTAILQ_HEAD(, CompareState) net_compares = >> QTAILQ_HEAD_INITIALIZER(net_compares); >> @@ -39,6 +43,28 @@ typedef struct ReadState { >> uint8_t buf[COMPARE_READ_LEN_MAX]; >> } ReadState; >> >> +/* >> + + CompareState ++ >> + | | >> + +---------------+ +---------------+ +---------------+ >> + |conn list +--->conn +--------->conn | >> + +---------------+ +---------------+ +---------------+ >> + | | | | | | >> + +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ >> + |primary | |secondary |primary | |secondary >> + |packet | |packet + |packet | |packet + >> + +--------+ +--------+ +--------+ +--------+ >> + | | | | >> + +---v----+ +---v----+ +---v----+ +---v----+ >> + |primary | |secondary |primary | |secondary >> + |packet | |packet + |packet | |packet + >> + +--------+ +--------+ +--------+ +--------+ >> + | | | | >> + +---v----+ +---v----+ +---v----+ +---v----+ >> + |primary | |secondary |primary | |secondary >> + |packet | |packet + |packet | |packet + >> + +--------+ +--------+ +--------+ +--------+ >> +*/ >> typedef struct CompareState { >> Object parent; >> >> @@ -51,12 +77,265 @@ typedef struct CompareState { >> QTAILQ_ENTRY(CompareState) next; >> ReadState pri_rs; >> ReadState sec_rs; >> + >> + /* connection list: the connections belonged to this NIC could be found >> + * in this list. 
>> + * element type: Connection >> + */ >> + GQueue conn_list; >> + QemuMutex conn_list_lock; /* to protect conn_list */ >> + /* hashtable to save connection */ >> + GHashTable *connection_track_table; >> + /* to save unprocessed_connections */ >> + GQueue unprocessed_connections; >> + /* proxy current hash size */ >> + uint32_t hashtable_size; >> } CompareState; >> >> typedef struct CompareClass { >> ObjectClass parent_class; >> } CompareClass; >> >> +typedef struct Packet { >> + void *data; >> + union { >> + uint8_t *network_layer; >> + struct ip *ip; > Does this mean ipv6 is not supported? Yes, IPv6 is currently not supported. > >> + }; >> + uint8_t *transport_layer; >> + int size; >> + CompareState *s; >> +} Packet; >> + >> +typedef struct ConnectionKey { >> + /* (src, dst) must be grouped, in the same way than in IP header */ >> + struct in_addr src; >> + struct in_addr dst; >> + uint16_t src_port; >> + uint16_t dst_port; >> + uint8_t ip_proto; >> +} QEMU_PACKED ConnectionKey; >> + >> +typedef struct Connection { >> + QemuMutex list_lock; >> + /* connection primary send queue: element type: Packet */ >> + GQueue primary_list; >> + /* connection secondary send queue: element type: Packet */ >> + GQueue secondary_list; >> + /* flag to enqueue unprocessed_connections */ >> + bool processing; >> + uint8_t ip_proto; >> +} Connection; >> + >> +enum { >> + PRIMARY_IN = 0, >> + SECONDARY_IN, >> +}; >> + >> +static void packet_destroy(void *opaque, void *user_data); >> +static int compare_chr_send(CharDriverState *out, >> + const uint8_t *buf, >> + uint32_t size); >> + >> +static uint32_t connection_key_hash(const void *opaque) >> +{ >> + const ConnectionKey *key = opaque; >> + uint32_t a, b, c; >> + >> + /* Jenkins hash */ >> + a = b = c = JHASH_INITVAL + sizeof(*key); >> + a += key->src.s_addr; >> + b += key->dst.s_addr; >> + c += (key->src_port | key->dst_port << 16); >> + __jhash_mix(a, b, c); >> + >> + a += key->ip_proto; >> + __jhash_final(a, b, c); >> + >> + return c; >>
+} >> + >> +static int connection_key_equal(const void *opaque1, const void *opaque2) >> +{ >> + return memcmp(opaque1, opaque2, sizeof(ConnectionKey)) == 0; > So why not useing ConnectionKey * consider we're sure of the type? OK, will fix it in next version. > >> +} >> + >> +/* >> + * initialize connecon_key for packet >> + * Return 0 on success, if return 1 the pkt will be sent later >> + */ >> +static int connection_key_init(Packet *pkt, ConnectionKey *key) >> +{ >> + int network_length; >> + uint8_t *data = pkt->data; >> + uint16_t l3_proto; >> + uint32_t tmp_ports; >> + ssize_t l2hdr_len = eth_get_l2_hdr_length(data); >> + >> + pkt->network_layer = data + ETH_HLEN; > Can the length of data be shorter than ETH_HELN? Thanks,I will check pkt->size first. > >> + l3_proto = eth_get_l3_proto(data, l2hdr_len); >> + if (l3_proto != ETH_P_IP) { >> + return 1; >> + } >> + >> + network_length = pkt->ip->ip_hl * 4; >> + pkt->transport_layer = pkt->network_layer + network_length; > Do we need sanity check to make sure there's no evil network_length here? Yes,I will fix. > >> + if (!pkt->transport_layer) { >> + error_report("pkt->transport_layer is valid"); > invalid? And if this is caused by the bad packet it self, there's no > need for a error_report. 
OK > >> + return 1; >> + } >> + key->ip_proto = pkt->ip->ip_p; >> + key->src = pkt->ip->ip_src; >> + key->dst = pkt->ip->ip_dst; >> + >> + switch (key->ip_proto) { >> + case IPPROTO_TCP: >> + case IPPROTO_UDP: >> + case IPPROTO_DCCP: >> + case IPPROTO_ESP: >> + case IPPROTO_SCTP: >> + case IPPROTO_UDPLITE: >> + tmp_ports = *(uint32_t *)(pkt->transport_layer); >> + key->src_port = ntohs(tmp_ports & 0xffff); >> + key->dst_port = ntohs(tmp_ports >> 16); >> + break; >> + case IPPROTO_AH: >> + tmp_ports = *(uint32_t *)(pkt->transport_layer + 4); >> + key->src_port = ntohs(tmp_ports & 0xffff); >> + key->dst_port = ntohs(tmp_ports >> 16); >> + break; >> + default: >> + key->src_port = 0; >> + key->dst_port = 0; >> + break; >> + } >> + >> + return 0; >> +} >> + >> +static Connection *connection_new(ConnectionKey *key) >> +{ >> + Connection *conn = g_slice_new(Connection); >> + >> + qemu_mutex_init(&conn->list_lock); >> + conn->ip_proto = key->ip_proto; >> + conn->processing = false; >> + g_queue_init(&conn->primary_list); >> + g_queue_init(&conn->secondary_list); >> + >> + return conn; >> +} >> + >> +/* >> + * Clear hashtable, stop this hash growing really huge >> + */ >> +static void connection_hashtable_reset(CompareState *s) >> +{ >> + s->hashtable_size = 0; >> + g_hash_table_remove_all(s->connection_track_table); >> +} >> + >> +/* if not found, create a new connection and add to hash table */ >> +static Connection *connection_get(CompareState *s, ConnectionKey *key) >> +{ >> + /* FIXME: protect connection_track_table */ >> + Connection *conn = g_hash_table_lookup(s->connection_track_table, key); >> + >> + if (conn == NULL) { >> + ConnectionKey *new_key = g_memdup(key, sizeof(*key)); >> + >> + conn = connection_new(key); >> + >> + s->hashtable_size++; >> + if (s->hashtable_size > HASHTABLE_MAX_SIZE) { >> + error_report("colo proxy connection hashtable full, clear it"); >> + connection_hashtable_reset(s); >> + /* TODO:clear conn_list */ >> + } >> + >> + 
g_hash_table_insert(s->connection_track_table, new_key, conn); >> + } >> + >> + return conn; >> +} >> + >> +static void connection_destroy(void *opaque) >> +{ >> + Connection *conn = opaque; >> + >> + qemu_mutex_lock(&conn->list_lock); > Like I said in previous patch, if you do all the processing in colo > compare thread, you can avoid almost all synchronization (e.g mutex). > >> + g_queue_foreach(&conn->primary_list, packet_destroy, NULL); >> + g_queue_free(&conn->primary_list); >> + g_queue_foreach(&conn->secondary_list, packet_destroy, NULL); >> + g_queue_free(&conn->secondary_list); >> + qemu_mutex_unlock(&conn->list_lock); >> + qemu_mutex_destroy(&conn->list_lock); >> + g_slice_free(Connection, conn); >> +} >> + >> +static Packet *packet_new(CompareState *s, const void *data, >> + int size, ConnectionKey *key) >> +{ >> + Packet *pkt = g_slice_new(Packet); >> + >> + pkt->data = g_memdup(data, size); >> + pkt->size = size; >> + pkt->s = s; >> + >> + if (connection_key_init(pkt, key)) { >> + packet_destroy(pkt, NULL); >> + pkt = NULL; >> + } > Can we do connection_key_init() first, this can avoid packet_desctory() > if it fails. Do you mean we should call connection_key_init() first and then call packet_new()? 
> >> + >> + return pkt; >> +} >> + >> +/* >> + * Return 0 on success, if return -1 means the pkt >> + * is unsupported(arp and ipv6) and will be sent later >> + */ >> +static int packet_enqueue(CompareState *s, int mode) >> +{ >> + ConnectionKey key = {{ 0 } }; >> + Packet *pkt = NULL; >> + Connection *conn; >> + >> + if (mode == PRIMARY_IN) { >> + pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, &key); >> + } else { >> + pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, &key); >> + } >> + if (!pkt) { >> + return -1; >> + } >> + >> + conn = connection_get(s, &key); >> + if (!conn->processing) { >> + qemu_mutex_lock(&s->conn_list_lock); >> + g_queue_push_tail(&s->conn_list, conn); >> + qemu_mutex_unlock(&s->conn_list_lock); >> + conn->processing = true; >> + } >> + >> + qemu_mutex_lock(&conn->list_lock); >> + if (mode == PRIMARY_IN) { >> + g_queue_push_tail(&conn->primary_list, pkt); >> + } else { >> + g_queue_push_tail(&conn->secondary_list, pkt); >> + } >> + qemu_mutex_unlock(&conn->list_lock); >> + >> + return 0; >> +} >> + >> +static void packet_destroy(void *opaque, void *user_data) >> +{ >> + Packet *pkt = opaque; >> + >> + g_free(pkt->data); >> + g_slice_free(Packet, pkt); >> +} >> + >> static int compare_chr_send(CharDriverState *out, >> const uint8_t *buf, >> uint32_t size) >> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size) >> >> ret = compare_chr_fill_rstate(&s->pri_rs, buf, size); >> if (ret == 1) { >> - /* FIXME: enqueue to primary packet list */ >> - compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); >> + if (packet_enqueue(s, PRIMARY_IN)) { >> + trace_colo_compare_main("primary: unsupported packet in"); >> + compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); > Looks like if a packet was not recognized by connection_key_init(), it > will be sent directly without comparing it with the packet sent from > secondary? Is this expected? 
Yes,we will send primary's arp packet to get mac first. Thanks zhangchen > >> + } >> } else if (ret == -1) { >> qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL); >> } >> @@ -176,9 +457,11 @@ static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size) >> >> ret = compare_chr_fill_rstate(&s->sec_rs, buf, size); >> if (ret == 1) { >> - /* TODO: enqueue to secondary packet list*/ >> - /* should we send sec arp pkt? */ >> - compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); >> + if (packet_enqueue(s, SECONDARY_IN)) { >> + trace_colo_compare_main("secondary: unsupported packet in"); >> + /* should we send sec arp pkt? */ >> + compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); >> + } >> } else if (ret == -1) { >> qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL); >> } >> @@ -280,6 +563,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) >> qemu_chr_fe_claim_no_fail(s->chr_out); >> QTAILQ_INSERT_TAIL(&net_compares, s, next); >> >> + g_queue_init(&s->conn_list); >> + qemu_mutex_init(&s->conn_list_lock); >> + s->hashtable_size = 0; >> + >> + s->connection_track_table = g_hash_table_new_full(connection_key_hash, >> + connection_key_equal, >> + g_free, >> + connection_destroy); >> + >> return; >> } >> >> @@ -314,6 +606,7 @@ static void colo_compare_class_finalize(ObjectClass *oc, void *data) >> if (!QTAILQ_EMPTY(&net_compares)) { >> QTAILQ_REMOVE(&net_compares, s, next); >> } >> + qemu_mutex_destroy(&s->conn_list_lock); >> } >> >> static void colo_compare_init(Object *obj) >> diff --git a/trace-events b/trace-events >> index ca7211b..8862288 100644 >> --- a/trace-events >> +++ b/trace-events >> @@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d" >> aspeed_vic_update_irq(int flags) "Raising IRQ: %d" >> aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32 >> aspeed_vic_write(uint64_t offset, unsigned size, uint32_t 
data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32 >> + >> +# net/colo-compare.c >> +colo_compare_main(const char *chr) "chr: %s" > > > . >
On 04/28/2016 06:25 PM, Zhang Chen wrote: >>> +static Packet *packet_new(CompareState *s, const void *data, >>> + int size, ConnectionKey *key) >>> +{ >>> + Packet *pkt = g_slice_new(Packet); >>> + >>> + pkt->data = g_memdup(data, size); >>> + pkt->size = size; >>> + pkt->s = s; >>> + >>> + if (connection_key_init(pkt, key)) { >>> + packet_destroy(pkt, NULL); >>> + pkt = NULL; >>> + } >> Can we do connection_key_init() first, this can avoid packet_desctory() >> if it fails. > > Do you mean we should call connection_key_init() first > and then call packet_new()? Yes, only when connection_key_init() succeed. > > >> >>> + >>> + return pkt; >>> +} >>> + >>> +/* >>> + * Return 0 on success, if return -1 means the pkt >>> + * is unsupported(arp and ipv6) and will be sent later >>> + */ >>> +static int packet_enqueue(CompareState *s, int mode) >>> +{ >>> + ConnectionKey key = {{ 0 } }; >>> + Packet *pkt = NULL; >>> + Connection *conn; >>> + >>> + if (mode == PRIMARY_IN) { >>> + pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, >>> &key); >>> + } else { >>> + pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, >>> &key); >>> + } >>> + if (!pkt) { >>> + return -1; >>> + } >>> + >>> + conn = connection_get(s, &key); >>> + if (!conn->processing) { >>> + qemu_mutex_lock(&s->conn_list_lock); >>> + g_queue_push_tail(&s->conn_list, conn); >>> + qemu_mutex_unlock(&s->conn_list_lock); >>> + conn->processing = true; >>> + } >>> + >>> + qemu_mutex_lock(&conn->list_lock); >>> + if (mode == PRIMARY_IN) { >>> + g_queue_push_tail(&conn->primary_list, pkt); >>> + } else { >>> + g_queue_push_tail(&conn->secondary_list, pkt); >>> + } >>> + qemu_mutex_unlock(&conn->list_lock); >>> + >>> + return 0; >>> +} >>> + >>> +static void packet_destroy(void *opaque, void *user_data) >>> +{ >>> + Packet *pkt = opaque; >>> + >>> + g_free(pkt->data); >>> + g_slice_free(Packet, pkt); >>> +} >>> + >>> static int compare_chr_send(CharDriverState *out, >>> const uint8_t *buf, >>> uint32_t size) 
>>> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, >>> const uint8_t *buf, int size) >>> ret = compare_chr_fill_rstate(&s->pri_rs, buf, size); >>> if (ret == 1) { >>> - /* FIXME: enqueue to primary packet list */ >>> - compare_chr_send(s->chr_out, s->pri_rs.buf, >>> s->pri_rs.packet_len); >>> + if (packet_enqueue(s, PRIMARY_IN)) { >>> + trace_colo_compare_main("primary: unsupported packet in"); >>> + compare_chr_send(s->chr_out, s->pri_rs.buf, >>> s->pri_rs.packet_len); >> Looks like if a packet was not recognized by connection_key_init(), it >> will be sent directly without comparing it with the packet sent from >> secondary? Is this expected? > > Yes,we will send primary's arp packet to get mac first. > > Thanks > zhangchen But what if the packet was not arp?
On 04/29/2016 10:05 AM, Jason Wang wrote: > On 04/28/2016 06:25 PM, Zhang Chen wrote: >>>> +static Packet *packet_new(CompareState *s, const void *data, >>>> + int size, ConnectionKey *key) >>>> +{ >>>> + Packet *pkt = g_slice_new(Packet); >>>> + >>>> + pkt->data = g_memdup(data, size); >>>> + pkt->size = size; >>>> + pkt->s = s; >>>> + >>>> + if (connection_key_init(pkt, key)) { >>>> + packet_destroy(pkt, NULL); >>>> + pkt = NULL; >>>> + } >>> Can we do connection_key_init() first, this can avoid packet_desctory() >>> if it fails. >> Do you mean we should call connection_key_init() first >> and then call packet_new()? > Yes, only when connection_key_init() succeed. OK~ will fix in next. >>>> + >>>> + return pkt; >>>> +} >>>> + >>>> +/* >>>> + * Return 0 on success, if return -1 means the pkt >>>> + * is unsupported(arp and ipv6) and will be sent later >>>> + */ >>>> +static int packet_enqueue(CompareState *s, int mode) >>>> +{ >>>> + ConnectionKey key = {{ 0 } }; >>>> + Packet *pkt = NULL; >>>> + Connection *conn; >>>> + >>>> + if (mode == PRIMARY_IN) { >>>> + pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, >>>> &key); >>>> + } else { >>>> + pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, >>>> &key); >>>> + } >>>> + if (!pkt) { >>>> + return -1; >>>> + } >>>> + >>>> + conn = connection_get(s, &key); >>>> + if (!conn->processing) { >>>> + qemu_mutex_lock(&s->conn_list_lock); >>>> + g_queue_push_tail(&s->conn_list, conn); >>>> + qemu_mutex_unlock(&s->conn_list_lock); >>>> + conn->processing = true; >>>> + } >>>> + >>>> + qemu_mutex_lock(&conn->list_lock); >>>> + if (mode == PRIMARY_IN) { >>>> + g_queue_push_tail(&conn->primary_list, pkt); >>>> + } else { >>>> + g_queue_push_tail(&conn->secondary_list, pkt); >>>> + } >>>> + qemu_mutex_unlock(&conn->list_lock); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static void packet_destroy(void *opaque, void *user_data) >>>> +{ >>>> + Packet *pkt = opaque; >>>> + >>>> + g_free(pkt->data); >>>> + 
g_slice_free(Packet, pkt); >>>> +} >>>> + >>>> static int compare_chr_send(CharDriverState *out, >>>> const uint8_t *buf, >>>> uint32_t size) >>>> @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, >>>> const uint8_t *buf, int size) >>>> ret = compare_chr_fill_rstate(&s->pri_rs, buf, size); >>>> if (ret == 1) { >>>> - /* FIXME: enqueue to primary packet list */ >>>> - compare_chr_send(s->chr_out, s->pri_rs.buf, >>>> s->pri_rs.packet_len); >>>> + if (packet_enqueue(s, PRIMARY_IN)) { >>>> + trace_colo_compare_main("primary: unsupported packet in"); >>>> + compare_chr_send(s->chr_out, s->pri_rs.buf, >>>> s->pri_rs.packet_len); >>> Looks like if a packet was not recognized by connection_key_init(), it >>> will be sent directly without comparing it with the packet sent from >>> secondary? Is this expected? >> Yes,we will send primary's arp packet to get mac first. >> >> Thanks >> zhangchen > But what if the packet was not arp? > > > . RARP packets will be sent, IP packets will be enqueued.
diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h new file mode 100644 index 0000000..8a8ff0f --- /dev/null +++ b/include/qemu/jhash.h @@ -0,0 +1,59 @@ +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * + * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() + * are externally useful functions. Routines to test the hash are included + * if SELF_TEST is defined. You can use this free for any purpose.It's in + * the public domain. It has no warranty. + * + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) + * + * I've modified Bob's hash to be useful in the Linux kernel, and + * any bugs present are my fault. + * Jozsef + */ + +#ifndef QEMU_JHASH_H__ +#define QEMU_JHASH_H__ + +#include "qemu/bitops.h" + +/* + * hashtable related is copied from linux kernel jhash + */ + +/* __jhash_mix -- mix 3 32-bit values reversibly. 
*/ +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +/* An arbitrary initial parameter */ +#define JHASH_INITVAL 0xdeadbeef + +#endif /* QEMU_JHASH_H__ */ diff --git a/net/colo-compare.c b/net/colo-compare.c index c45b132..dc57eac 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -22,12 +22,16 @@ #include "qemu/sockets.h" #include "qapi-visit.h" #include "trace.h" +#include "slirp/slirp.h" +#include "qemu/jhash.h" +#include "net/eth.h" #define TYPE_COLO_COMPARE "colo-compare" #define COLO_COMPARE(obj) \ OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) #define COMPARE_READ_LEN_MAX NET_BUFSIZE +#define HASHTABLE_MAX_SIZE 16384 static QTAILQ_HEAD(, CompareState) net_compares = QTAILQ_HEAD_INITIALIZER(net_compares); @@ -39,6 +43,28 @@ typedef struct ReadState { uint8_t buf[COMPARE_READ_LEN_MAX]; } ReadState; +/* + + CompareState ++ + | | + +---------------+ +---------------+ +---------------+ + |conn list +--->conn +--------->conn | + +---------------+ +---------------+ +---------------+ + | | | | | | + +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ + |primary | |secondary |primary | |secondary + |packet | |packet + |packet | |packet + + +--------+ +--------+ +--------+ +--------+ + | | | | + +---v----+ +---v----+ +---v----+ +---v----+ + |primary | |secondary |primary | |secondary + |packet | |packet + |packet | |packet + + +--------+ +--------+ +--------+ +--------+ + | | | | + +---v----+ 
+---v----+ +---v----+ +---v----+ + |primary | |secondary |primary | |secondary + |packet | |packet + |packet | |packet + + +--------+ +--------+ +--------+ +--------+ +*/ typedef struct CompareState { Object parent; @@ -51,12 +77,265 @@ typedef struct CompareState { QTAILQ_ENTRY(CompareState) next; ReadState pri_rs; ReadState sec_rs; + + /* connection list: the connections belonged to this NIC could be found + * in this list. + * element type: Connection + */ + GQueue conn_list; + QemuMutex conn_list_lock; /* to protect conn_list */ + /* hashtable to save connection */ + GHashTable *connection_track_table; + /* to save unprocessed_connections */ + GQueue unprocessed_connections; + /* proxy current hash size */ + uint32_t hashtable_size; } CompareState; typedef struct CompareClass { ObjectClass parent_class; } CompareClass; +typedef struct Packet { + void *data; + union { + uint8_t *network_layer; + struct ip *ip; + }; + uint8_t *transport_layer; + int size; + CompareState *s; +} Packet; + +typedef struct ConnectionKey { + /* (src, dst) must be grouped, in the same way than in IP header */ + struct in_addr src; + struct in_addr dst; + uint16_t src_port; + uint16_t dst_port; + uint8_t ip_proto; +} QEMU_PACKED ConnectionKey; + +typedef struct Connection { + QemuMutex list_lock; + /* connection primary send queue: element type: Packet */ + GQueue primary_list; + /* connection secondary send queue: element type: Packet */ + GQueue secondary_list; + /* flag to enqueue unprocessed_connections */ + bool processing; + uint8_t ip_proto; +} Connection; + +enum { + PRIMARY_IN = 0, + SECONDARY_IN, +}; + +static void packet_destroy(void *opaque, void *user_data); +static int compare_chr_send(CharDriverState *out, + const uint8_t *buf, + uint32_t size); + +static uint32_t connection_key_hash(const void *opaque) +{ + const ConnectionKey *key = opaque; + uint32_t a, b, c; + + /* Jenkins hash */ + a = b = c = JHASH_INITVAL + sizeof(*key); + a += key->src.s_addr; + b += 
key->dst.s_addr; + c += (key->src_port | key->dst_port << 16); + __jhash_mix(a, b, c); + + a += key->ip_proto; + __jhash_final(a, b, c); + + return c; +} + +static int connection_key_equal(const void *opaque1, const void *opaque2) +{ + return memcmp(opaque1, opaque2, sizeof(ConnectionKey)) == 0; +} + +/* + * initialize connecon_key for packet + * Return 0 on success, if return 1 the pkt will be sent later + */ +static int connection_key_init(Packet *pkt, ConnectionKey *key) +{ + int network_length; + uint8_t *data = pkt->data; + uint16_t l3_proto; + uint32_t tmp_ports; + ssize_t l2hdr_len = eth_get_l2_hdr_length(data); + + pkt->network_layer = data + ETH_HLEN; + l3_proto = eth_get_l3_proto(data, l2hdr_len); + if (l3_proto != ETH_P_IP) { + return 1; + } + + network_length = pkt->ip->ip_hl * 4; + pkt->transport_layer = pkt->network_layer + network_length; + if (!pkt->transport_layer) { + error_report("pkt->transport_layer is valid"); + return 1; + } + key->ip_proto = pkt->ip->ip_p; + key->src = pkt->ip->ip_src; + key->dst = pkt->ip->ip_dst; + + switch (key->ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + tmp_ports = *(uint32_t *)(pkt->transport_layer); + key->src_port = ntohs(tmp_ports & 0xffff); + key->dst_port = ntohs(tmp_ports >> 16); + break; + case IPPROTO_AH: + tmp_ports = *(uint32_t *)(pkt->transport_layer + 4); + key->src_port = ntohs(tmp_ports & 0xffff); + key->dst_port = ntohs(tmp_ports >> 16); + break; + default: + key->src_port = 0; + key->dst_port = 0; + break; + } + + return 0; +} + +static Connection *connection_new(ConnectionKey *key) +{ + Connection *conn = g_slice_new(Connection); + + qemu_mutex_init(&conn->list_lock); + conn->ip_proto = key->ip_proto; + conn->processing = false; + g_queue_init(&conn->primary_list); + g_queue_init(&conn->secondary_list); + + return conn; +} + +/* + * Clear hashtable, stop this hash growing really huge + */ +static void 
connection_hashtable_reset(CompareState *s) +{ + s->hashtable_size = 0; + g_hash_table_remove_all(s->connection_track_table); +} + +/* if not found, create a new connection and add to hash table */ +static Connection *connection_get(CompareState *s, ConnectionKey *key) +{ + /* FIXME: protect connection_track_table */ + Connection *conn = g_hash_table_lookup(s->connection_track_table, key); + + if (conn == NULL) { + ConnectionKey *new_key = g_memdup(key, sizeof(*key)); + + conn = connection_new(key); + + s->hashtable_size++; + if (s->hashtable_size > HASHTABLE_MAX_SIZE) { + error_report("colo proxy connection hashtable full, clear it"); + connection_hashtable_reset(s); + /* TODO:clear conn_list */ + } + + g_hash_table_insert(s->connection_track_table, new_key, conn); + } + + return conn; +} + +static void connection_destroy(void *opaque) +{ + Connection *conn = opaque; + + qemu_mutex_lock(&conn->list_lock); + g_queue_foreach(&conn->primary_list, packet_destroy, NULL); + g_queue_free(&conn->primary_list); + g_queue_foreach(&conn->secondary_list, packet_destroy, NULL); + g_queue_free(&conn->secondary_list); + qemu_mutex_unlock(&conn->list_lock); + qemu_mutex_destroy(&conn->list_lock); + g_slice_free(Connection, conn); +} + +static Packet *packet_new(CompareState *s, const void *data, + int size, ConnectionKey *key) +{ + Packet *pkt = g_slice_new(Packet); + + pkt->data = g_memdup(data, size); + pkt->size = size; + pkt->s = s; + + if (connection_key_init(pkt, key)) { + packet_destroy(pkt, NULL); + pkt = NULL; + } + + return pkt; +} + +/* + * Return 0 on success, if return -1 means the pkt + * is unsupported(arp and ipv6) and will be sent later + */ +static int packet_enqueue(CompareState *s, int mode) +{ + ConnectionKey key = {{ 0 } }; + Packet *pkt = NULL; + Connection *conn; + + if (mode == PRIMARY_IN) { + pkt = packet_new(s, s->pri_rs.buf, s->pri_rs.packet_len, &key); + } else { + pkt = packet_new(s, s->sec_rs.buf, s->sec_rs.packet_len, &key); + } + if (!pkt) { + 
return -1; + } + + conn = connection_get(s, &key); + if (!conn->processing) { + qemu_mutex_lock(&s->conn_list_lock); + g_queue_push_tail(&s->conn_list, conn); + qemu_mutex_unlock(&s->conn_list_lock); + conn->processing = true; + } + + qemu_mutex_lock(&conn->list_lock); + if (mode == PRIMARY_IN) { + g_queue_push_tail(&conn->primary_list, pkt); + } else { + g_queue_push_tail(&conn->secondary_list, pkt); + } + qemu_mutex_unlock(&conn->list_lock); + + return 0; +} + +static void packet_destroy(void *opaque, void *user_data) +{ + Packet *pkt = opaque; + + g_free(pkt->data); + g_slice_free(Packet, pkt); +} + static int compare_chr_send(CharDriverState *out, const uint8_t *buf, uint32_t size) @@ -158,8 +437,10 @@ static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size) ret = compare_chr_fill_rstate(&s->pri_rs, buf, size); if (ret == 1) { - /* FIXME: enqueue to primary packet list */ - compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); + if (packet_enqueue(s, PRIMARY_IN)) { + trace_colo_compare_main("primary: unsupported packet in"); + compare_chr_send(s->chr_out, s->pri_rs.buf, s->pri_rs.packet_len); + } } else if (ret == -1) { qemu_chr_add_handlers(s->chr_pri_in, NULL, NULL, NULL, NULL); } @@ -176,9 +457,11 @@ static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size) ret = compare_chr_fill_rstate(&s->sec_rs, buf, size); if (ret == 1) { - /* TODO: enqueue to secondary packet list*/ - /* should we send sec arp pkt? */ - compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); + if (packet_enqueue(s, SECONDARY_IN)) { + trace_colo_compare_main("secondary: unsupported packet in"); + /* should we send sec arp pkt? 
*/ + compare_chr_send(s->chr_out, s->sec_rs.buf, s->sec_rs.packet_len); + } } else if (ret == -1) { qemu_chr_add_handlers(s->chr_sec_in, NULL, NULL, NULL, NULL); } @@ -280,6 +563,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) qemu_chr_fe_claim_no_fail(s->chr_out); QTAILQ_INSERT_TAIL(&net_compares, s, next); + g_queue_init(&s->conn_list); + qemu_mutex_init(&s->conn_list_lock); + s->hashtable_size = 0; + + s->connection_track_table = g_hash_table_new_full(connection_key_hash, + connection_key_equal, + g_free, + connection_destroy); + return; } @@ -314,6 +606,7 @@ static void colo_compare_class_finalize(ObjectClass *oc, void *data) if (!QTAILQ_EMPTY(&net_compares)) { QTAILQ_REMOVE(&net_compares, s, next); } + qemu_mutex_destroy(&s->conn_list_lock); } static void colo_compare_init(Object *obj) diff --git a/trace-events b/trace-events index ca7211b..8862288 100644 --- a/trace-events +++ b/trace-events @@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d" aspeed_vic_update_irq(int flags) "Raising IRQ: %d" aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32 aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32 + +# net/colo-compare.c +colo_compare_main(const char *chr) "chr: %s"