new file mode 100644
@@ -0,0 +1,61 @@
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are
+included
+ * if SELF_TEST is defined. You can use this free for any purpose.
+It's in
+ * the public domain. It has no warranty.
+ *
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are my fault.
+ * Jozsef
+ */
+
+#ifndef QEMU_JHASH_H__
+#define QEMU_JHASH_H__
+
+#include "qemu/bitops.h"
+
+/*
+ * hashtable relation copy from linux kernel jhash
+ */
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+/* An arbitrary initial parameter */
+#define JHASH_INITVAL 0xdeadbeef
+
+#endif /* QEMU_JHASH_H__ */
@@ -17,3 +17,4 @@ common-obj-y += filter.o
common-obj-y += filter-buffer.o
common-obj-y += filter-mirror.o
common-obj-y += colo-compare.o
+common-obj-y += colo-base.o
new file mode 100644
@@ -0,0 +1,182 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "net/colo-base.h"
+
+uint32_t connection_key_hash(const void *opaque)
+{
+ const ConnectionKey *key = opaque;
+ uint32_t a, b, c;
+
+ /* Jenkins hash */
+ a = b = c = JHASH_INITVAL + sizeof(*key);
+ a += key->src.s_addr;
+ b += key->dst.s_addr;
+ c += (key->src_port | key->dst_port << 16);
+ __jhash_mix(a, b, c);
+
+ a += key->ip_proto;
+ __jhash_final(a, b, c);
+
+ return c;
+}
+
+int connection_key_equal(const void *key1, const void *key2)
+{
+ return memcmp(key1, key2, sizeof(ConnectionKey)) == 0;
+}
+
+int parse_packet_early(Packet *pkt)
+{
+ int network_length;
+ uint8_t *data = pkt->data;
+ uint16_t l3_proto;
+ ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
+
+ if (pkt->size < ETH_HLEN) {
+ error_report("pkt->size < ETH_HLEN");
+ return 1;
+ }
+ pkt->network_layer = data + ETH_HLEN;
+ l3_proto = eth_get_l3_proto(data, l2hdr_len);
+ if (l3_proto != ETH_P_IP) {
+ return 1;
+ }
+
+ network_length = pkt->ip->ip_hl * 4;
+ if (pkt->size < ETH_HLEN + network_length) {
+ error_report("pkt->size < network_layer + network_length");
+ return 1;
+ }
+ pkt->transport_layer = pkt->network_layer + network_length;
+ if (!pkt->transport_layer) {
+ error_report("pkt->transport_layer is valid");
+ return 1;
+ }
+
+ return 0;
+}
+
+void fill_connection_key(Packet *pkt, ConnectionKey *key)
+{
+ uint32_t tmp_ports;
+
+ key->ip_proto = pkt->ip->ip_p;
+ key->src = pkt->ip->ip_src;
+ key->dst = pkt->ip->ip_dst;
+
+ switch (key->ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ tmp_ports = *(uint32_t *)(pkt->transport_layer);
+ key->src_port = ntohs(tmp_ports & 0xffff);
+ key->dst_port = ntohs(tmp_ports >> 16);
+ break;
+ case IPPROTO_AH:
+ tmp_ports = *(uint32_t *)(pkt->transport_layer + 4);
+ key->src_port = ntohs(tmp_ports & 0xffff);
+ key->dst_port = ntohs(tmp_ports >> 16);
+ break;
+ default:
+ key->src_port = 0;
+ key->dst_port = 0;
+ break;
+ }
+}
+
+Connection *connection_new(ConnectionKey *key)
+{
+ Connection *conn = g_slice_new(Connection);
+
+ qemu_mutex_init(&conn->list_lock);
+ conn->ip_proto = key->ip_proto;
+ conn->processing = false;
+ g_queue_init(&conn->primary_list);
+ g_queue_init(&conn->secondary_list);
+
+ return conn;
+}
+
+void connection_destroy(void *opaque)
+{
+ Connection *conn = opaque;
+
+ qemu_mutex_lock(&conn->list_lock);
+ g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+ g_queue_free(&conn->primary_list);
+ g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+ g_queue_free(&conn->secondary_list);
+ qemu_mutex_unlock(&conn->list_lock);
+ qemu_mutex_destroy(&conn->list_lock);
+ g_slice_free(Connection, conn);
+}
+
+Packet *packet_new(const void *data, int size)
+{
+ Packet *pkt = g_slice_new(Packet);
+
+ pkt->data = g_memdup(data, size);
+ pkt->size = size;
+
+ return pkt;
+}
+
+void packet_destroy(void *opaque, void *user_data)
+{
+ Packet *pkt = opaque;
+
+ g_free(pkt->data);
+ g_slice_free(Packet, pkt);
+}
+
+/*
+ * Clear hashtable, stop this hash growing really huge
+ */
+void connection_hashtable_reset(GHashTable *connection_track_table)
+{
+ g_hash_table_remove_all(connection_track_table);
+}
+
+/* if not found, create a new connection and add to hash table */
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ uint32_t *hashtable_size)
+{
+ /* FIXME: protect connection_track_table */
+ Connection *conn = g_hash_table_lookup(connection_track_table, key);
+
+ if (conn == NULL) {
+ ConnectionKey *new_key = g_memdup(key, sizeof(*key));
+
+ conn = connection_new(key);
+
+ *hashtable_size += 1;
+ if (*hashtable_size > HASHTABLE_MAX_SIZE) {
+ error_report("colo proxy connection hashtable full, clear it");
+ connection_hashtable_reset(connection_track_table);
+ *hashtable_size = 0;
+ /* TODO:clear conn_list */
+ }
+
+ g_hash_table_insert(connection_track_table, new_key, conn);
+ }
+
+ return conn;
+}
new file mode 100644
@@ -0,0 +1,89 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_BASE_H
+#define QEMU_COLO_BASE_H
+
+#include "slirp/slirp.h"
+#include "qemu/jhash.h"
+#include "qemu/rcu.h"
+
+#define HASHTABLE_MAX_SIZE 16384
+
+typedef enum colo_conn_state {
+ COLO_CONN_IDLE,
+
+ /* States on the primary: For incoming connection */
+ COLO_CONN_PRI_IN_SYN, /* Received Syn */
+ COLO_CONN_PRI_IN_PSYNACK, /* Received syn/ack from primary, but not
+ yet from secondary */
+ COLO_CONN_PRI_IN_SSYNACK, /* Received syn/ack from secondary, but
+ not yet from primary */
+ COLO_CONN_PRI_IN_SYNACK, /* Received syn/ack from both */
+ COLO_CONN_PRI_IN_ESTABLISHED, /* Got the ACK */
+
+ /* States on the secondary: For incoming connection */
+ COLO_CONN_SEC_IN_SYNACK, /* We sent a syn/ack */
+ COLO_CONN_SEC_IN_ACK, /* Saw the ack but didn't yet see our syn/ack */
+ COLO_CONN_SEC_IN_ESTABLISHED, /* Got the ACK from the outside */
+} colo_conn_state;
+
+typedef struct Packet {
+ void *data;
+ union {
+ uint8_t *network_layer;
+ struct ip *ip;
+ };
+ uint8_t *transport_layer;
+ int size;
+} Packet;
+
+typedef struct ConnectionKey {
+ /* (src, dst) must be grouped, in the same way than in IP header */
+ struct in_addr src;
+ struct in_addr dst;
+ uint16_t src_port;
+ uint16_t dst_port;
+ uint8_t ip_proto;
+} QEMU_PACKED ConnectionKey;
+
+typedef struct Connection {
+ QemuMutex list_lock;
+ /* connection primary send queue: element type: Packet */
+ GQueue primary_list;
+ /* connection secondary send queue: element type: Packet */
+ GQueue secondary_list;
+ /* flag to enqueue unprocessed_connections */
+ bool processing;
+ uint8_t ip_proto;
+ /* be used by filter-rewriter */
+ colo_conn_state state;
+ tcp_seq primary_seq;
+ tcp_seq secondary_seq;
+} Connection;
+
+uint32_t connection_key_hash(const void *opaque);
+int connection_key_equal(const void *opaque1, const void *opaque2);
+int parse_packet_early(Packet *pkt);
+void fill_connection_key(Packet *pkt, ConnectionKey *key);
+Connection *connection_new(ConnectionKey *key);
+void connection_destroy(void *opaque);
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ uint32_t *hashtable_size);
+void connection_hashtable_reset(GHashTable *connection_track_table);
+Packet *packet_new(const void *data, int size);
+void packet_destroy(void *opaque, void *user_data);
+
+#endif /* QEMU_COLO_BASE_H */
@@ -28,6 +28,7 @@
#include "qemu/sockets.h"
#include "qapi-visit.h"
#include "trace.h"
+#include "net/colo-base.h"
#define TYPE_COLO_COMPARE "colo-compare"
#define COLO_COMPARE(obj) \
@@ -38,6 +39,28 @@
static QTAILQ_HEAD(, CompareState) net_compares =
QTAILQ_HEAD_INITIALIZER(net_compares);
+/*
+ + CompareState ++
+ | |
+ +---------------+ +---------------+ +---------------+
+ |conn list +--->conn +--------->conn |
+ +---------------+ +---------------+ +---------------+
+ | | | | | |
+ +---------------+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+ | | | |
+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+ | | | |
+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+*/
typedef struct CompareState {
Object parent;
@@ -50,12 +73,78 @@ typedef struct CompareState {
QTAILQ_ENTRY(CompareState) next;
SocketReadState pri_rs;
SocketReadState sec_rs;
+
+ /* connection list: the connections belonged to this NIC could be found
+ * in this list.
+ * element type: Connection
+ */
+ GQueue conn_list;
+ QemuMutex conn_list_lock; /* to protect conn_list */
+ /* hashtable to save connection */
+ GHashTable *connection_track_table;
+ /* to save unprocessed_connections */
+ GQueue unprocessed_connections;
+ /* proxy current hash size */
+ uint32_t hashtable_size;
} CompareState;
typedef struct CompareClass {
ObjectClass parent_class;
} CompareClass;
+enum {
+ PRIMARY_IN = 0,
+ SECONDARY_IN,
+};
+
+static int compare_chr_send(CharDriverState *out,
+ const uint8_t *buf,
+ uint32_t size);
+
+/*
+ * Return 0 on success, if return -1 means the pkt
+ * is unsupported(arp and ipv6) and will be sent later
+ */
+static int packet_enqueue(CompareState *s, int mode)
+{
+ ConnectionKey key = {{ 0 } };
+ Packet *pkt = NULL;
+ Connection *conn;
+
+ if (mode == PRIMARY_IN) {
+ pkt = packet_new(s->pri_rs.buf, s->pri_rs.packet_len);
+ } else {
+ pkt = packet_new(s->sec_rs.buf, s->sec_rs.packet_len);
+ }
+
+ if (parse_packet_early(pkt)) {
+ packet_destroy(pkt, NULL);
+ pkt = NULL;
+ return -1;
+ }
+ fill_connection_key(pkt, &key);
+
+ conn = connection_get(s->connection_track_table,
+ &key,
+ &s->hashtable_size);
+ if (!conn->processing) {
+ qemu_mutex_lock(&s->conn_list_lock);
+ g_queue_push_tail(&s->conn_list, conn);
+ qemu_mutex_unlock(&s->conn_list_lock);
+ conn->processing = true;
+ }
+
+ qemu_mutex_lock(&conn->list_lock);
+ if (mode == PRIMARY_IN) {
+ g_queue_push_tail(&conn->primary_list, pkt);
+ } else {
+ g_queue_push_tail(&conn->secondary_list, pkt);
+ }
+ qemu_mutex_unlock(&conn->list_lock);
+
+ return 0;
+}
+
static int compare_chr_send(CharDriverState *out,
const uint8_t *buf,
uint32_t size)
@@ -167,12 +256,21 @@ static void compare_set_outdev(Object *obj, const char *value, Error **errp)
static void compare_pri_rs_finalize(SocketReadState *pri_rs)
{
- /* if packet_enqueue pri pkt failed we will send unsupported packet */
+ CompareState *s = container_of(pri_rs, CompareState, pri_rs);
+
+ if (packet_enqueue(s, PRIMARY_IN)) {
+ trace_colo_compare_main("primary: unsupported packet in");
+ compare_chr_send(s->chr_out, pri_rs->buf, pri_rs->packet_len);
+ }
}
static void compare_sec_rs_finalize(SocketReadState *sec_rs)
{
- /* if packet_enqueue sec pkt failed we will notify trace */
+ CompareState *s = container_of(sec_rs, CompareState, sec_rs);
+
+ if (packet_enqueue(s, SECONDARY_IN)) {
+ trace_colo_compare_main("secondary: unsupported packet in");
+ }
}
/*
@@ -229,6 +327,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize);
net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize);
+ g_queue_init(&s->conn_list);
+ qemu_mutex_init(&s->conn_list_lock);
+ s->hashtable_size = 0;
+
+ s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+ connection_key_equal,
+ g_free,
+ connection_destroy);
+
return;
}
@@ -271,6 +378,8 @@ static void colo_compare_finalize(Object *obj)
if (!QTAILQ_EMPTY(&net_compares)) {
QTAILQ_REMOVE(&net_compares, s, next);
}
+ qemu_mutex_destroy(&s->conn_list_lock);
+ g_queue_free(&s->conn_list);
g_free(s->pri_indev);
g_free(s->sec_indev);
@@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d"
aspeed_vic_update_irq(int flags) "Raising IRQ: %d"
aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32
aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32
+
+# net/colo-compare.c
+colo_compare_main(const char *chr) ": %s"