
[RFC,v2,09/22] block/pcache: separation AIOCB on requests

Message ID 20160829171021.4902-10-pbutsykin@virtuozzo.com (mailing list archive)
State New, archived

Commit Message

Pavel Butsykin Aug. 29, 2016, 5:10 p.m. UTC
In the case where the cache partially covers a request, part of the
request is filled from the cache and the other part is read from disk.
Also add reference counting for nodes as a way to support multithreading.

There is still no full synchronization in multithreaded mode.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
---
 block/pcache.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 155 insertions(+), 14 deletions(-)

Comments

Kevin Wolf Sept. 2, 2016, 9:10 a.m. UTC | #1
Am 29.08.2016 um 19:10 hat Pavel Butsykin geschrieben:
> In the case where the cache partially covers a request, part of the
> request is filled from the cache and the other part is read from disk.
> Also add reference counting for nodes as a way to support multithreading.
> 
> There is still no full synchronization in multithreaded mode.
> 
> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> ---
>  block/pcache.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 155 insertions(+), 14 deletions(-)
> 
> diff --git a/block/pcache.c b/block/pcache.c
> index 28bd056..6114289 100644
> --- a/block/pcache.c
> +++ b/block/pcache.c
> @@ -58,7 +58,10 @@ typedef struct BlockNode {
>  typedef struct PCNode {
>      BlockNode cm;
>  
> +    uint32_t                 status;

I guess this is NODE_*_STATUS. Make it a named enum then instead of
uint32_t so that it's obvious what this field means.
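
For example (just a sketch, any naming works):

    typedef enum NodeStatus {
        NODE_SUCCESS_STATUS = 0,
        NODE_WAIT_STATUS    = 1,
        NODE_REMOVE_STATUS  = 2,
        NODE_DELETED_STATUS = 3, /* only for debugging */
    } NodeStatus;

and then the field declaration becomes 'NodeStatus status;'.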

> +    uint32_t                 ref;
>      uint8_t                  *data;
> +    CoMutex                  lock;
>  } PCNode;
>  
>  typedef struct ReqStor {
> @@ -95,9 +98,23 @@ typedef struct PrefCacheAIOCB {
>      uint64_t sector_num;
>      uint32_t nb_sectors;
>      int      aio_type;
> +    struct {
> +        QTAILQ_HEAD(req_head, PrefCachePartReq) list;
> +        CoMutex lock;
> +    } requests;
>      int      ret;
>  } PrefCacheAIOCB;
>  
> +typedef struct PrefCachePartReq {
> +    uint64_t sector_num;
> +    uint32_t nb_sectors;

Should be byte-based, like everything.
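
I.e. something like:

    uint64_t offset;    /* in bytes */
    uint64_t bytes;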

> +    QEMUIOVector qiov;
> +    PCNode *node;
> +    PrefCacheAIOCB *acb;
> +    QTAILQ_ENTRY(PrefCachePartReq) entry;
> +} PrefCachePartReq;
> +
>  static const AIOCBInfo pcache_aiocb_info = {
>      .aiocb_size = sizeof(PrefCacheAIOCB),
>  };
> @@ -126,8 +143,39 @@ static QemuOptsList runtime_opts = {
>  #define MB_BITS 20
>  #define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
>  
> +enum {
> +    NODE_SUCCESS_STATUS = 0,
> +    NODE_WAIT_STATUS    = 1,
> +    NODE_REMOVE_STATUS  = 2,
> +    NODE_GHOST_STATUS   = 3 /* only for debugging */

NODE_DELETED_STATUS?

> +};
> +
>  #define PCNODE(_n) ((PCNode *)(_n))
>  
> +static inline void pcache_node_unref(PCNode *node)
> +{
> +    assert(node->status == NODE_SUCCESS_STATUS ||
> +           node->status == NODE_REMOVE_STATUS);
> +
> +    if (atomic_fetch_dec(&node->ref) == 0) {

Atomics imply concurrency, which we don't have.
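
If there is no concurrency, the plain equivalent of atomic_fetch_dec()
would do:

    if (node->ref-- == 0) {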

> +        assert(node->status == NODE_REMOVE_STATUS);
> +
> +        node->status = NODE_GHOST_STATUS;
> +        g_free(node->data);
> +        g_slice_free1(sizeof(*node), node);

When you switch to plain g_malloc(), this needs to be updated.
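
(i.e. a plain g_free(node) instead of g_slice_free1().)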

> +    }
> +}
> +
> +static inline PCNode *pcache_node_ref(PCNode *node)
> +{
> +    assert(node->status == NODE_SUCCESS_STATUS ||
> +           node->status == NODE_WAIT_STATUS);
> +    assert(atomic_read(&node->ref) == 0);/* XXX: only for sequential requests */
> +    atomic_inc(&node->ref);

Do you expect concurrent accesses or not? Because if you don't, there is
no need for atomics, but if you do, this is buggy because each of the
lines is atomic for itself, but the assertion isn't atomic with the
refcount increment.

A ref() function that can take only a single reference feels odd anyway
and this restriction seems to be lifted later. Why have it here?
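
Without the atomics and without the single-reference restriction this
could simply be:

    static inline PCNode *pcache_node_ref(PCNode *node)
    {
        assert(node->status == NODE_SUCCESS_STATUS ||
               node->status == NODE_WAIT_STATUS);
        node->ref++;
        return node;
    }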

> +
> +    return node;
> +}

Kevin
Pavel Butsykin Sept. 8, 2016, 3:47 p.m. UTC | #2
On 02.09.2016 12:10, Kevin Wolf wrote:
> Am 29.08.2016 um 19:10 hat Pavel Butsykin geschrieben:
>> In the case where the cache partially covers a request, part of the
>> request is filled from the cache and the other part is read from disk.
>> Also add reference counting for nodes as a way to support multithreading.
>>
>> There is still no full synchronization in multithreaded mode.
>>
>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>> ---
>>   block/pcache.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 155 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/pcache.c b/block/pcache.c
>> index 28bd056..6114289 100644
>> --- a/block/pcache.c
>> +++ b/block/pcache.c
>> @@ -58,7 +58,10 @@ typedef struct BlockNode {
>>   typedef struct PCNode {
>>       BlockNode cm;
>>
>> +    uint32_t                 status;
>
> I guess this is NODE_*_STATUS. Make it a named enum then instead of
> uint32_t so that it's obvious what this field means.

OK

>> +    uint32_t                 ref;
>>       uint8_t                  *data;
>> +    CoMutex                  lock;
>>   } PCNode;
>>
>>   typedef struct ReqStor {
>> @@ -95,9 +98,23 @@ typedef struct PrefCacheAIOCB {
>>       uint64_t sector_num;
>>       uint32_t nb_sectors;
>>       int      aio_type;
>> +    struct {
>> +        QTAILQ_HEAD(req_head, PrefCachePartReq) list;
>> +        CoMutex lock;
>> +    } requests;
>>       int      ret;
>>   } PrefCacheAIOCB;
>>
>> +typedef struct PrefCachePartReq {
>> +    uint64_t sector_num;
>> +    uint32_t nb_sectors;
>
> Should be byte-based, like everything.
>
>> +    QEMUIOVector qiov;
>> +    PCNode *node;
>> +    PrefCacheAIOCB *acb;
>> +    QTAILQ_ENTRY(PrefCachePartReq) entry;
>> +} PrefCachePartReq;
>> +
>>   static const AIOCBInfo pcache_aiocb_info = {
>>       .aiocb_size = sizeof(PrefCacheAIOCB),
>>   };
>> @@ -126,8 +143,39 @@ static QemuOptsList runtime_opts = {
>>   #define MB_BITS 20
>>   #define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
>>
>> +enum {
>> +    NODE_SUCCESS_STATUS = 0,
>> +    NODE_WAIT_STATUS    = 1,
>> +    NODE_REMOVE_STATUS  = 2,
>> +    NODE_GHOST_STATUS   = 3 /* only for debugging */
>
> NODE_DELETED_STATUS?

Yes :)

>> +};
>> +
>>   #define PCNODE(_n) ((PCNode *)(_n))
>>
>> +static inline void pcache_node_unref(PCNode *node)
>> +{
>> +    assert(node->status == NODE_SUCCESS_STATUS ||
>> +           node->status == NODE_REMOVE_STATUS);
>> +
>> +    if (atomic_fetch_dec(&node->ref) == 0) {
>
> Atomics imply concurrency, which we don't have.
>
>> +        assert(node->status == NODE_REMOVE_STATUS);
>> +
>> +        node->status = NODE_GHOST_STATUS;
>> +        g_free(node->data);
>> +        g_slice_free1(sizeof(*node), node);
>
> When you switch to plain g_malloc(), this needs to be updated.
>
>> +    }
>> +}
>> +
>> +static inline PCNode *pcache_node_ref(PCNode *node)
>> +{
>> +    assert(node->status == NODE_SUCCESS_STATUS ||
>> +           node->status == NODE_WAIT_STATUS);
>> +    assert(atomic_read(&node->ref) == 0);/* XXX: only for sequential requests */
>> +    atomic_inc(&node->ref);
>
> Do you expect concurrent accesses or not? Because if you don't, there is
> no need for atomics, but if you do, this is buggy because each of the
> lines is atomic for itself, but the assertion isn't atomic with the
> refcount increment.

Well, we've already figured out the question of concurrent accesses.

> A ref() function that can take only a single reference feels odd anyway
> and this restriction seems to be lifted later. Why have it here?

No, this is a temporary assert(). In fact, it is not necessary, but the
assert helps to verify correct behaviour of the current patch, because
reading of nodes and rescheduling of requests are not yet implemented.

>> +
>> +    return node;
>> +}
>
> Kevin
>

Patch

diff --git a/block/pcache.c b/block/pcache.c
index 28bd056..6114289 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -58,7 +58,10 @@  typedef struct BlockNode {
 typedef struct PCNode {
     BlockNode cm;
 
+    uint32_t                 status;
+    uint32_t                 ref;
     uint8_t                  *data;
+    CoMutex                  lock;
 } PCNode;
 
 typedef struct ReqStor {
@@ -95,9 +98,23 @@  typedef struct PrefCacheAIOCB {
     uint64_t sector_num;
     uint32_t nb_sectors;
     int      aio_type;
+    struct {
+        QTAILQ_HEAD(req_head, PrefCachePartReq) list;
+        CoMutex lock;
+    } requests;
     int      ret;
 } PrefCacheAIOCB;
 
+typedef struct PrefCachePartReq {
+    uint64_t sector_num;
+    uint32_t nb_sectors;
+
+    QEMUIOVector qiov;
+    PCNode *node;
+    PrefCacheAIOCB *acb;
+    QTAILQ_ENTRY(PrefCachePartReq) entry;
+} PrefCachePartReq;
+
 static const AIOCBInfo pcache_aiocb_info = {
     .aiocb_size = sizeof(PrefCacheAIOCB),
 };
@@ -126,8 +143,39 @@  static QemuOptsList runtime_opts = {
 #define MB_BITS 20
 #define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
 
+enum {
+    NODE_SUCCESS_STATUS = 0,
+    NODE_WAIT_STATUS    = 1,
+    NODE_REMOVE_STATUS  = 2,
+    NODE_GHOST_STATUS   = 3 /* only for debugging */
+};
+
 #define PCNODE(_n) ((PCNode *)(_n))
 
+static inline void pcache_node_unref(PCNode *node)
+{
+    assert(node->status == NODE_SUCCESS_STATUS ||
+           node->status == NODE_REMOVE_STATUS);
+
+    if (atomic_fetch_dec(&node->ref) == 0) {
+        assert(node->status == NODE_REMOVE_STATUS);
+
+        node->status = NODE_GHOST_STATUS;
+        g_free(node->data);
+        g_slice_free1(sizeof(*node), node);
+    }
+}
+
+static inline PCNode *pcache_node_ref(PCNode *node)
+{
+    assert(node->status == NODE_SUCCESS_STATUS ||
+           node->status == NODE_WAIT_STATUS);
+    assert(atomic_read(&node->ref) == 0);/* XXX: only for sequential requests */
+    atomic_inc(&node->ref);
+
+    return node;
+}
+
 static int pcache_key_cmp(const RbNodeKey *key1, const RbNodeKey *key2)
 {
     assert(key1 != NULL);
@@ -184,13 +232,7 @@  static void *node_insert(struct RbRoot *root, BlockNode *node)
 
 static inline PCNode *pcache_node_insert(struct RbRoot *root, PCNode *node)
 {
-    return node_insert(root, &node->cm);
-}
-
-static inline void pcache_node_free(PCNode *node)
-{
-    g_free(node->data);
-    g_slice_free1(sizeof(*node), node);
+    return pcache_node_ref(node_insert(root, &node->cm));
 }
 
 static inline void *pcache_node_alloc(RbNodeKey* key)
@@ -199,6 +241,9 @@  static inline void *pcache_node_alloc(RbNodeKey* key)
 
     node->cm.sector_num = key->num;
     node->cm.nb_sectors = key->size;
+    node->ref = 0;
+    node->status = NODE_WAIT_STATUS;
+    qemu_co_mutex_init(&node->lock);
     node->data = g_malloc(node->cm.nb_sectors << BDRV_SECTOR_BITS);
 
     return node;
@@ -206,6 +251,12 @@  static inline void *pcache_node_alloc(RbNodeKey* key)
 
 static void pcache_node_drop(BDRVPCacheState *s, PCNode *node)
 {
+    uint32_t prev_status = atomic_xchg(&node->status, NODE_REMOVE_STATUS);
+    if (prev_status == NODE_REMOVE_STATUS) {
+        return;
+    }
+    assert(prev_status != NODE_GHOST_STATUS);
+
     atomic_sub(&s->pcache.curr_size, node->cm.nb_sectors);
 
     qemu_co_mutex_lock(&s->pcache.lru.lock);
@@ -216,7 +267,7 @@  static void pcache_node_drop(BDRVPCacheState *s, PCNode *node)
     rb_erase(&node->cm.rb_node, &s->pcache.tree.root);
     qemu_co_mutex_unlock(&s->pcache.tree.lock);
 
-    pcache_node_free(node);
+    pcache_node_unref(node);
 }
 
 static void pcache_try_shrink(BDRVPCacheState *s)
@@ -234,6 +285,30 @@  static void pcache_try_shrink(BDRVPCacheState *s)
     }
 }
 
+static PrefCachePartReq *pcache_req_get(PrefCacheAIOCB *acb, PCNode *node)
+{
+    PrefCachePartReq *req = g_slice_alloc(sizeof(*req));
+
+    req->nb_sectors = node->cm.nb_sectors;
+    req->sector_num = node->cm.sector_num;
+    req->node = node;
+    req->acb = acb;
+
+    assert(acb->sector_num <= node->cm.sector_num + node->cm.nb_sectors);
+
+    qemu_iovec_init(&req->qiov, 1);
+    qemu_iovec_add(&req->qiov, node->data,
+                   node->cm.nb_sectors << BDRV_SECTOR_BITS);
+    return req;
+}
+
+static inline void push_node_request(PrefCacheAIOCB *acb, PCNode *node)
+{
+    PrefCachePartReq *req = pcache_req_get(acb, node);
+
+    QTAILQ_INSERT_HEAD(&acb->requests.list, req, entry);
+}
+
 static inline void pcache_lru_node_up(BDRVPCacheState *s, PCNode *node)
 {
     qemu_co_mutex_lock(&s->pcache.lru.lock);
@@ -253,16 +328,17 @@  static bool pcache_node_find_and_create(PrefCacheAIOCB *acb, RbNodeKey *key,
     found = pcache_node_insert(&s->pcache.tree.root, new_node);
     qemu_co_mutex_unlock(&s->pcache.tree.lock);
     if (found != new_node) {
-        pcache_node_free(new_node);
-        pcache_lru_node_up(s, found);
+        g_free(new_node->data);
+        g_slice_free1(sizeof(*new_node), new_node);
+        if (found->status == NODE_SUCCESS_STATUS) {
+            pcache_lru_node_up(s, found);
+        }
         *out_node = found;
         return false;
     }
     atomic_add(&s->pcache.curr_size, new_node->cm.nb_sectors);
 
-    qemu_co_mutex_lock(&s->pcache.lru.lock);
-    QTAILQ_INSERT_HEAD(&s->pcache.lru.list, &new_node->cm, entry);
-    qemu_co_mutex_unlock(&s->pcache.lru.lock);
+    push_node_request(acb, new_node);
 
     pcache_try_shrink(s);
 
@@ -291,6 +367,7 @@  static void pcache_pickup_parts_of_cache(PrefCacheAIOCB *acb, PCNode *node,
             up_size = lc_key.size;
 
             if (!pcache_node_find_and_create(acb, &lc_key, &new_node)) {
+                pcache_node_unref(node);
                 node = new_node;
                 continue;
             }
@@ -300,6 +377,8 @@  static void pcache_pickup_parts_of_cache(PrefCacheAIOCB *acb, PCNode *node,
         /* XXX: node read */
         up_size = MIN(node->cm.sector_num + node->cm.nb_sectors - num, size);
 
+        pcache_node_unref(node);
+
         size -= up_size;
         num += up_size;
         if (size != 0) {
@@ -336,6 +415,8 @@  static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
         node->cm.sector_num + node->cm.nb_sectors >= acb->sector_num +
                                                      acb->nb_sectors)
     {
+        /* XXX: node read */
+        pcache_node_unref(node);
         return PREFETCH_FULL_UP;
     }
     pcache_pickup_parts_of_cache(acb, node, key.num, key.size);
@@ -343,10 +424,56 @@  static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
     return PREFETCH_PART_UP;
 }
 
+static void pcache_node_submit(PrefCachePartReq *req)
+{
+    PCNode *node = req->node;
+    BDRVPCacheState *s = req->acb->s;
+
+    assert(node != NULL);
+    assert(atomic_read(&node->ref) != 0);
+    assert(node->data != NULL);
+
+    qemu_co_mutex_lock(&node->lock);
+    if (node->status == NODE_WAIT_STATUS) {
+        qemu_co_mutex_lock(&s->pcache.lru.lock);
+        QTAILQ_INSERT_HEAD(&s->pcache.lru.list, &node->cm, entry);
+        qemu_co_mutex_unlock(&s->pcache.lru.lock);
+
+        node->status = NODE_SUCCESS_STATUS;
+    }
+    qemu_co_mutex_unlock(&node->lock);
+}
+
+static void pcache_merge_requests(PrefCacheAIOCB *acb)
+{
+    PrefCachePartReq *req, *next;
+
+    qemu_co_mutex_lock(&acb->requests.lock);
+    QTAILQ_FOREACH_SAFE(req, &acb->requests.list, entry, next) {
+        QTAILQ_REMOVE(&acb->requests.list, req, entry);
+
+        assert(req != NULL);
+        assert(req->node->status == NODE_WAIT_STATUS);
+
+        pcache_node_submit(req);
+
+        /* XXX: pcache read */
+
+        pcache_node_unref(req->node);
+
+        g_slice_free1(sizeof(*req), req);
+    }
+    qemu_co_mutex_unlock(&acb->requests.lock);
+}
+
 static void pcache_aio_cb(void *opaque, int ret)
 {
     PrefCacheAIOCB *acb = opaque;
 
+    if (acb->aio_type & QEMU_AIO_READ) {
+        pcache_merge_requests(acb);
+    }
+
     acb->common.cb(acb->common.opaque, ret);
 
     qemu_aio_unref(acb);
@@ -366,6 +493,9 @@  static PrefCacheAIOCB *pcache_aio_get(BlockDriverState *bs, int64_t sector_num,
     acb->aio_type = type;
     acb->ret = 0;
 
+    QTAILQ_INIT(&acb->requests.list);
+    qemu_co_mutex_init(&acb->requests.lock);
+
     return acb;
 }
 
@@ -445,6 +575,17 @@  fail:
     return ret;
 }
 
+static void pcache_node_check_and_free(BDRVPCacheState *s, PCNode *node)
+{
+    assert(node->status == NODE_SUCCESS_STATUS);
+    assert(node->ref == 0);
+
+    node->status = NODE_REMOVE_STATUS;
+    rb_erase(&node->cm.rb_node, &s->pcache.tree.root);
+    g_free(node->data);
+    g_slice_free1(sizeof(*node), node);
+}
+
 static void pcache_close(BlockDriverState *bs)
 {
     uint32_t cnt = 0;
@@ -452,7 +593,7 @@  static void pcache_close(BlockDriverState *bs)
     BlockNode *node, *next;
     QTAILQ_FOREACH_SAFE(node, &s->pcache.lru.list, entry, next) {
         QTAILQ_REMOVE(&s->pcache.lru.list, node, entry);
-        pcache_node_free(PCNODE(node));
+        pcache_node_check_and_free(s, PCNODE(node));
         cnt++;
     }
     DPRINTF("used %d nodes\n", cnt);