=============
$ fio fio-mixed.job
[...]
fio-sync: (groupid=3, jobs=1): err= 0: pid=563
read : io=102400KB, bw=31239KB/s, iops=429 , runt= 3278msec
clat (usec): min=4 , max=31668 , avg=1663.94, stdev=1810.37
lat (usec): min=281 , max=658022 , avg=162456.88, stdev=242272.58
bw (KB/s) : min=21536, max=29322, per=20.77%, avg=25956.50, stdev=2755.33
write: io=34688KB, bw=13235KB/s, iops=183 , runt= 2621msec
clat (usec): min=77 , max=1619 , avg=431.80, stdev=343.81
lat (usec): min=78 , max=1620 , avg=433.11, stdev=343.83
bw (KB/s) : min=11168, max=14447, per=26.34%, avg=13246.00, stdev=1293.73
[...]
Run status group 0 (all jobs):
READ: io=409600KB, aggrb=74594KB/s, minb=19096KB/s, maxb=20917KB/s,
mint=5013msec, maxt=5491msec
WRITE: io=139552KB, aggrb=42624KB/s, minb=10263KB/s, maxb=14463KB/s,
mint=2414msec, maxt=3274msec
Run status group 1 (all jobs):
READ: io=409600KB, aggrb=146442KB/s, minb=37489KB/s, maxb=48680KB/s,
mint=2154msec, maxt=2797msec
WRITE: io=133296KB, aggrb=59613KB/s, minb=15328KB/s, maxb=16774KB/s,
mint=2041msec, maxt=2236msec
Run status group 2 (all jobs):
READ: io=409600KB, aggrb=7178KB/s, minb=1837KB/s, maxb=1840KB/s,
mint=56961msec, maxt=57057msec
WRITE: io=139136KB, aggrb=2446KB/s, minb=600KB/s, maxb=661KB/s,
mint=56238msec, maxt=56880msec
Run status group 3 (all jobs):
READ: io=409600KB, aggrb=124954KB/s, minb=31988KB/s, maxb=37704KB/s,
mint=2781msec, maxt=3278msec
WRITE: io=137936KB, aggrb=50286KB/s, minb=12447KB/s, maxb=13885KB/s,
mint=2621msec, maxt=2743msec
Disk stats (read/write):
vda: ios=118760/16273, merge=0/107354, ticks=159260/658290, in_queue=817410,
util=94.37%
[...]
With Cache
==========
$ fio fio-mixed.job
[...]
fio-sync: (groupid=3, jobs=1): err= 0: pid=573
read : io=102400KB, bw=76704KB/s, iops=1018 , runt= 1335msec
clat (usec): min=0 , max=66131 , avg=591.46, stdev=2582.59
lat (usec): min=66 , max=123396 , avg=25609.49, stdev=38397.57
bw (KB/s) : min=58208, max=59584, per=34.55%, avg=58896.00, stdev=972.98
write: io=35600KB, bw=29373KB/s, iops=381 , runt= 1212msec
clat (usec): min=11 , max=2661 , avg=169.58, stdev=191.60
lat (usec): min=11 , max=2663 , avg=170.91, stdev=191.78
bw (KB/s) : min=27104, max=37664, per=29.98%, avg=32384.00, stdev=7467.05
[...]
Run status group 0 (all jobs):
READ: io=409600KB, aggrb=140562KB/s, minb=35984KB/s, maxb=44450KB/s,
mint=2359msec, maxt=2914msec
WRITE: io=138816KB, aggrb=124722KB/s, minb=31502KB/s, maxb=40180KB/s,
mint=893msec, maxt=1113msec
Run status group 1 (all jobs):
READ: io=409600KB, aggrb=231151KB/s, minb=59174KB/s, maxb=96111KB/s,
mint=1091msec, maxt=1772msec
WRITE: io=141936KB, aggrb=137268KB/s, minb=32340KB/s, maxb=44496KB/s,
mint=813msec, maxt=1034msec
Run status group 2 (all jobs):
READ: io=409600KB, aggrb=9211KB/s, minb=2358KB/s, maxb=2363KB/s,
mint=44367msec, maxt=44468msec
WRITE: io=129808KB, aggrb=2931KB/s, minb=707KB/s, maxb=797KB/s,
mint=43331msec, maxt=44285msec
Run status group 3 (all jobs):
READ: io=409600KB, aggrb=170453KB/s, minb=43636KB/s, maxb=78545KB/s,
mint=1335msec, maxt=2403msec
WRITE: io=138256KB, aggrb=108012KB/s, minb=27648KB/s, maxb=37931KB/s,
mint=879msec, maxt=1280msec
Disk stats (read/write):
vda: ios=120698/16690, merge=0/114742, ticks=113170/304480, in_queue=417560,
util=93.26%
[...]
Summary
=======
Read bandwidth increased by a factor of 1.2 to 1.8
Write bandwidth increased by a factor of 1.1 to 2.9
Read latency decreased by a small margin (roughly a 0.2x reduction)
Write latency decreased by roughly a 0.4x reduction
Signed-off-by: Prasad Joshi <prasadjoshi124@gmail.com>
---
tools/kvm/disk/qcow.c | 244 ++++++++++++++++++++++++++++++++++++++----
tools/kvm/include/kvm/qcow.h | 16 +++
2 files changed, 237 insertions(+), 23 deletions(-)
@@ -16,6 +16,149 @@
#include <linux/kernel.h>
#include <linux/types.h>
+static int insert(struct rb_root *root, struct qcow_l2_cache *new)
+{
+ struct rb_node **link = &(root->rb_node), *parent = NULL;
+ u64 offset = new->offset;
+
+ /* search the tree */
+ while (*link) {
+ struct qcow_l2_cache *t;
+
+ t = rb_entry(*link, struct qcow_l2_cache, node);
+ if (!t)
+ goto error;
+
+ parent = *link;
+
+ if (t->offset > offset)
+ link = &(*link)->rb_left;
+ else if (t->offset < offset)
+ link = &(*link)->rb_right;
+ else
+ goto out;
+ }
+
+ /* add new node */
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, root);
+out:
+ return 0;
+error:
+ return -1;
+}
+
+static struct qcow_l2_cache *search(struct rb_root *root, u64 offset)
+{
+ struct rb_node *link = root->rb_node;
+
+ while (link) {
+ struct qcow_l2_cache *t;
+
+ t = rb_entry(link, struct qcow_l2_cache, node);
+ if (!t)
+ goto out;
+
+ if (t->offset > offset)
+ link = link->rb_left;
+ else if (t->offset < offset)
+ link = link->rb_right;
+ else
+ return t;
+ }
+out:
+ return NULL;
+}
+
+static void free_cache(struct qcow *q)
+{
+ struct list_head *pos, *n;
+ struct qcow_l2_cache *t;
+ struct rb_root *r = &q->root;
+
+ list_for_each_safe(pos, n, &q->lru_list) {
+ /* Remove cache table from the list and RB tree */
+ list_del(pos);
+ t = list_entry(pos, struct qcow_l2_cache, list);
+ rb_erase(&t->node, r);
+
+ /* Free the cached node */
+ free(t);
+ }
+}
+
+static int cache_table(struct qcow *q, struct qcow_l2_cache *c)
+{
+ struct rb_root *r = &q->root;
+ struct qcow_l2_cache *lru;
+
+ if (q->nr_cached == MAX_CACHE_NODES) {
+ /*
+ * The node at the head of the list is least recently used
+ * node. Remove it from the list and replaced with a new node.
+ */
+ lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list);
+
+ /* Remove the node from the cache */
+ rb_erase(&lru->node, r);
+ list_del_init(&lru->list);
+ q->nr_cached--;
+
+ /* Free the LRUed node */
+ free(lru);
+ }
+
+ /* Add new node in RB Tree: Helps in searching faster */
+ if (insert(r, c) < 0)
+ goto error;
+
+ /* Add in LRU replacement list */
+ list_add_tail(&c->list, &q->lru_list);
+ q->nr_cached++;
+
+ return 0;
+error:
+ return -1;
+}
+
+static int search_table(struct qcow *q, u64 **table, u64 offset)
+{
+ struct qcow_l2_cache *c;
+
+ *table = NULL;
+
+ c = search(&q->root, offset);
+ if (!c)
+ return -1;
+
+ /* Update the LRU state, by moving the searched node to list tail */
+ list_move_tail(&c->list, &q->lru_list);
+
+ *table = c->table;
+ return 0;
+}
+
+/* Allocates a new node for caching L2 table */
+static struct qcow_l2_cache *new_cache_table(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l2_cache *c;
+ u64 l2t_sz;
+ u64 size;
+
+ l2t_sz = 1 << header->l2_bits;
+ size = sizeof(*c) + l2t_sz * sizeof(u64);
+ c = calloc(1, size);
+ if (!c)
+ goto out;
+
+ c->offset = offset;
+ RB_CLEAR_NODE(&c->node);
+ INIT_LIST_HEAD(&c->list);
+out:
+ return c;
+}
+
static inline u64 get_l1_index(struct qcow *q, u64 offset)
{
struct qcow_header *header = q->header;
@@ -37,6 +180,47 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
return offset & ((1 << header->cluster_bits)-1);
}
+static int qcow_read_l2_table(struct qcow *q, u64 **table, u64 offset)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l2_cache *c;
+ u64 size;
+ u64 i;
+ u64 *t;
+
+ c = NULL;
+ *table = NULL;
+ size = 1 << header->l2_bits;
+
+ /* search an entry for offset in cache */
+ if (search_table(q, table, offset) >= 0)
+ return 0;
+
+ /* allocate new node for caching l2 table */
+ c = new_cache_table(q, offset);
+ if (!c)
+ goto error;
+ t = c->table;
+
+ /* table not cached: read from the disk */
+ if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0)
+ goto error;
+
+ /* cache the table */
+ if (cache_table(q, c) < 0)
+ goto error;
+
+ /* change cached table to CPU's byte-order */
+ for (i = 0; i < size; i++)
+ be64_to_cpus(&t[i]);
+
+ *table = t;
+ return 0;
+error:
+ free(c);
+ return -1;
+}
+
static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
{
struct qcow_header *header = q->header;
@@ -51,7 +235,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
u64 l1_idx;
u64 l2_idx;
- l2_table = NULL;
cluster_size = 1 << header->cluster_bits;
@@ -72,18 +255,16 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
goto zero_cluster;
l2_table_size = 1 << header->l2_bits;
- l2_table = calloc(l2_table_size, sizeof(u64));
- if (!l2_table)
- goto out_error;
- if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0)
+ /* read and cache level 2 table */
+ if (qcow_read_l2_table(q, &l2_table, l2_table_offset) < 0)
goto out_error;
l2_idx = get_l2_index(q, offset);
if (l2_idx >= l2_table_size)
goto out_error;
- clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask;
+ clust_start = l2_table[l2_idx] & ~header->oflag_mask;
if (!clust_start)
goto zero_cluster;
@@ -91,7 +272,6 @@ static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_
goto out_error;
out:
- free(l2_table);
return length;
zero_cluster:
@@ -189,6 +369,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
{
struct qcow_header *header = q->header;
struct qcow_table *table = &q->table;
+ struct qcow_l2_cache *c;
bool update_meta;
u64 clust_start;
u64 clust_off;
@@ -202,6 +383,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
u64 len;
u64 t;
+ c = NULL;
l2t_sz = 1 << header->l2_bits;
clust_sz = 1 << header->cluster_bits;
@@ -221,24 +403,26 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
if (len > src_len)
len = src_len;
- l2t = calloc(l2t_sz, sizeof(u64));
- if (!l2t)
- goto error;
-
l2t_off = table->l1_table[l1t_idx] & ~header->oflag_mask;
if (l2t_off) {
- if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0)
- goto free_l2;
+ /* read and cache l2 table */
+ if (qcow_read_l2_table(q, &l2t, l2t_off) < 0)
+ goto error;
} else {
+ c = new_cache_table(q, l2t_off);
+ if (!c)
+ goto error;
+ l2t = c->table;
+
/* Capture the state of the consistent QCOW image */
f_sz = file_size(q->fd);
if (!f_sz)
- goto free_l2;
+ goto free_cache;
/* Write the l2 table of 0's at the end of the file */
l2t_off = qcow_write_l2_table(q, l2t);
if (!l2t_off)
- goto free_l2;
+ goto free_cache;
/* Metadata update: update on disk level 1 table */
t = cpu_to_be64(l2t_off);
@@ -246,9 +430,16 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
if (qcow_pwrite_sync(q->fd, &t, sizeof(t), header->l1_table_offset + l1t_idx * sizeof(u64)) < 0) {
/* restore file to consistent state */
if (ftruncate(q->fd, f_sz) < 0)
- goto free_l2;
+ goto free_cache;
- goto free_l2;
+ goto free_cache;
+ }
+
+ if (cache_table(q, c) < 0) {
+ if (ftruncate(q->fd, f_sz) < 0)
+ goto free_cache;
+
+ goto free_cache;
}
/* Update the in-core entry */
@@ -258,17 +449,15 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
/* Capture the state of the consistent QCOW image */
f_sz = file_size(q->fd);
if (!f_sz)
- goto free_l2;
+ goto error;
- clust_start = be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask;
+ clust_start = l2t[l2t_idx] & ~header->oflag_mask;
if (!clust_start) {
clust_start = ALIGN(f_sz, clust_sz);
update_meta = true;
} else
update_meta = false;
- free(l2t);
-
/* Write actual data */
if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
goto error;
@@ -282,11 +471,15 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
goto error;
}
+
+ /* Update the cached level2 entry */
+ l2t[l2t_idx] = clust_start;
}
return len;
-free_l2:
- free(l2t);
+
+free_cache:
+ free(c);
error:
return -1;
}
@@ -336,6 +529,7 @@ static int qcow_disk_close(struct disk_image *disk)
q = disk->priv;
+ free_cache(q);
free(q->table.l1_table);
free(q->header);
free(q);
@@ -428,6 +622,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
goto error;
q->fd = fd;
+ q->root = RB_ROOT;
+ INIT_LIST_HEAD(&q->lru_list);
h = q->header = qcow2_read_header(fd);
if (!h)
@@ -525,6 +721,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
goto error;
q->fd = fd;
+ q->root = RB_ROOT;
+ INIT_LIST_HEAD(&q->lru_list);
h = q->header = qcow1_read_header(fd);
if (!h)
@@ -3,6 +3,8 @@
#include <linux/types.h>
#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
@@ -17,6 +19,15 @@
#define QCOW2_OFLAG_COMPRESSED (1LL << 62)
#define QCOW2_OFLAG_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
+#define MAX_CACHE_NODES 32
+
+struct qcow_l2_cache {
+ u64 offset;
+ struct rb_node node;
+ struct list_head list;
+ u64 table[];
+};
+
struct qcow_table {
u32 table_size;
u64 *l1_table;
@@ -26,6 +37,11 @@ struct qcow {
void *header;
struct qcow_table table;
int fd;
+
+ /* Level2 caching data structures */
+ struct rb_root root;
+ struct list_head lru_list;
+ int nr_cached;
};
struct qcow_header {
QCOW uses two tables: a level 1 (L1) table and a level 2 (L2) table. The L1 table points to the offset of an L2 table. When a QCOW image is probed, the L1 table is cached in memory to avoid reading it from disk on every access. This caching improves performance. A similar performance improvement can be observed when L2 tables are cached. It is impossible to cache all of the L2 tables because of memory constraints. The patch adds L2 table caching capability for up to 32 L2 tables; it uses a combination of an RB tree and a list to manage the cached L2 tables. The linked list implementation helps in building a simple LRU structure, and the RB tree helps to search the cached tables efficiently. To calculate the performance numbers, the VM was started with the following command line arguments $ ./kvm run -d /dev/shm/kvm_ubuntu_server.qcow2 --params "root=/dev/vda1" \ > -m 2048 Without Cache