diff mbox series

[v2,09/19] block: introduce FleecingState class

Message ID 20210827181808.311670-10-vsementsov@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series Make image fleecing more usable | expand

Commit Message

Vladimir Sementsov-Ogievskiy Aug. 27, 2021, 6:17 p.m. UTC
FleecingState represents state shared between copy-before-write filter
and upcoming fleecing block driver.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/fleecing.h  | 135 ++++++++++++++++++++++++++++++++++
 block/fleecing.c  | 182 ++++++++++++++++++++++++++++++++++++++++++++++
 MAINTAINERS       |   2 +
 block/meson.build |   1 +
 4 files changed, 320 insertions(+)
 create mode 100644 block/fleecing.h
 create mode 100644 block/fleecing.c
diff mbox series

Patch

diff --git a/block/fleecing.h b/block/fleecing.h
new file mode 100644
index 0000000000..fb7b2f86c4
--- /dev/null
+++ b/block/fleecing.h
@@ -0,0 +1,135 @@ 
+/*
+ * FleecingState
+ *
+ * The common state of image fleecing, shared between copy-before-write filter
+ * and fleecing block driver.
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Fleecing scheme looks as follows:
+ *
+ * [guest blk]                   [nbd export]
+ *    |                              |
+ *    |root                          |
+ *    v                              v
+ * [copy-before-write]--target-->[fleecing drv]
+ *    |                          /   |
+ *    |file                     /    |file
+ *    v                        /     v
+ * [active disk]<--source-----/  [temp disk]
+ *
+ * Note that "active disk" is also called just "source" and "temp disk" is also
+ * called "target".
+ *
+ * What happens here:
+ *
+ * copy-before-write filter performs copy-before-write operations: on guest
+ * write we should copy old data to target child before rewriting. Note that we
+ * write this data through fleecing driver: it saves a possibility to implement
+ * a kind of cache in fleecing driver in future.
+ *
+ * Fleecing user is nbd export: it can read from fleecing node, which guarantees
+ * a snapshot-view for fleecing user. Fleecing user may also do discard
+ * operations.
+ *
+ * FleecingState is responsible for most of the fleecing logic:
+ *
+ * 1. Fleecing read. Handle reads of fleecing user: we should decide where from
+ * to read, from source node or from copy-before-write target node. In former
+ * case we need to synchronize with guest writes. See fleecing_read_lock() and
+ * fleecing_read_unlock() functionality.
+ *
+ * 2. Guest write synchronization (part of [1] actually). See
+ * fleecing_mark_done_and_wait_readers()
+ *
+ * 3. Fleecing discard. Used by fleecing user when corresponding area is already
+ * copied. Fleecing user may discard the area which is not needed anymore, that
+ * should result in:
+ *   - discarding data to free disk space
+ *   - clear bits in copy-bitmap of block-copy, to avoid extra copy-before-write
+ *     operations
+ *   - clear bits in access-bitmap of FleecingState, to avoid further wrong
+ *     access
+ *
+ * Still, FleecingState doesn't own any block children, so all real io
+ * operations (reads, writes and discards) are done by copy-before-write filter
+ * and fleecing block driver.
+ */
+
+#ifndef FLEECING_H
+#define FLEECING_H
+
+#include "block/block_int.h"
+#include "block/block-copy.h"
+#include "block/reqlist.h"
+
+typedef struct FleecingState FleecingState;
+
+/*
+ * Create FleecingState.
+ *
+ * @bcs: link to block-copy owned by copy-before-write filter.
+ *
+ * @fleecing_node: should be fleecing block driver node. Used to create some
+ * bitmaps in it.
+ */
+FleecingState *fleecing_new(BlockCopyState *bcs,
+                            BlockDriverState *fleecing_node,
+                            Error **errp);
+
+/* Free the state. Doesn't free block-copy state (@bcs) */
+void fleecing_free(FleecingState *s);
+
+/*
+ * Convenient function for thous who want to do fleecing read.
+ *
+ * If requested region starts in "done" area, i.e. data is already copied to
+ * copy-before-write target node, req is set to NULL, pnum is set to available
+ * bytes to read from target. User is free to read @pnum bytes from target.
+ * Still, user is responsible for concurrent discards on target.
+ *
+ * If requests region starts in "not done" area, i.e. we have to read from
+ * source node directly, than @pnum bytes of source node are frozen and
+ * guaranteed not be rewritten until user calls cbw_snapshot_read_unlock().
+ *
+ * Returns 0 on success and -EACCES when try to read non-dirty area of
+ * access_bitmap.
+ */
+int fleecing_read_lock(FleecingState *f, int64_t offset,
+                       int64_t bytes, const BlockReq **req, int64_t *pnum);
+/* Called as closing pair for fleecing_read_lock() */
+void fleecing_read_unlock(FleecingState *f, const BlockReq *req);
+
+/*
+ * Called when fleecing user doesn't need the region anymore (for example the
+ * region is successfully read and backed up somewhere).
+ * This prevents extra copy-before-write operations in this area in future.
+ * Next fleecing read from this area will fail with -EACCES.
+ */
+void fleecing_discard(FleecingState *f, int64_t offset, int64_t bytes);
+
+/*
+ * Called by copy-before-write filter after successful copy-before-write
+ * operation to synchronize with parallel fleecing reads.
+ */
+void fleecing_mark_done_and_wait_readers(FleecingState *f, int64_t offset,
+                                         int64_t bytes);
+
+#endif /* FLEECING_H */
diff --git a/block/fleecing.c b/block/fleecing.c
new file mode 100644
index 0000000000..f75d11b892
--- /dev/null
+++ b/block/fleecing.c
@@ -0,0 +1,182 @@ 
+/*
+ * FleecingState
+ *
+ * The common state of image fleecing, shared between copy-before-write filter
+ * and fleecing block driver.
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "block/block_int.h"
+#include "block/coroutines.h"
+#include "block/qdict.h"
+#include "block/block-copy.h"
+#include "block/reqlist.h"
+
+#include "block/fleecing.h"
+
+/*
+ * @bcs: link to block-copy state owned by copy-before-write filter which
+ * performs copy-before-write operations in context of fleecing scheme.
+ * FleecingState doesn't own the block-copy state and don't free it on cleanup.
+ *
+ * @lock: protects access to @access_bitmap, @done_bitmap and @frozen_read_reqs
+ *
+ * @access_bitmap: represents areas allowed for reading by fleecing user.
+ * Reading from non-dirty areas leads to -EACCES. Discard operation among other
+ * things clears corresponding bits in this bitmaps.
+ *
+ * @done_bitmap: represents areas that was successfully copied by
+ * copy-before-write operations. So, for dirty areas fleecing user should read
+ * from target node and for clear areas - from source node.
+ *
+ * @frozen_read_reqs: current read requests for fleecing user in source node.
+ * corresponding areas must not be rewritten by guest.
+ */
+typedef struct FleecingState {
+    BlockCopyState *bcs;
+
+    CoMutex lock;
+
+    BdrvDirtyBitmap *access_bitmap;
+    BdrvDirtyBitmap *done_bitmap;
+
+    BlockReqList frozen_read_reqs;
+} FleecingState;
+
+FleecingState *fleecing_new(BlockCopyState *bcs,
+                            BlockDriverState *fleecing_node,
+                            Error **errp)
+{
+    BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(bcs),
+                    *done_bitmap, *access_bitmap;
+    int64_t cluster_size = block_copy_cluster_size(bcs);
+    FleecingState *s;
+
+    /* done_bitmap starts empty */
+    done_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL,
+                                           errp);
+    if (!done_bitmap) {
+        return NULL;
+    }
+    bdrv_disable_dirty_bitmap(done_bitmap);
+
+    /* access_bitmap starts equal to bcs_bitmap */
+    access_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL,
+                                             errp);
+    if (!access_bitmap) {
+        return NULL;
+    }
+    bdrv_disable_dirty_bitmap(access_bitmap);
+    if (!bdrv_dirty_bitmap_merge_internal(access_bitmap, bcs_bitmap,
+                                          NULL, true))
+    {
+        return NULL;
+    }
+
+    s = g_new(FleecingState, 1);
+    *s = (FleecingState) {
+        .bcs = bcs,
+        .done_bitmap = done_bitmap,
+        .access_bitmap = access_bitmap,
+    };
+    qemu_co_mutex_init(&s->lock);
+    QLIST_INIT(&s->frozen_read_reqs);
+
+    return s;
+}
+
+void fleecing_free(FleecingState *s)
+{
+    if (!s) {
+        return;
+    }
+
+    bdrv_release_dirty_bitmap(s->access_bitmap);
+    bdrv_release_dirty_bitmap(s->done_bitmap);
+    g_free(s);
+}
+
+static BlockReq *add_read_req(FleecingState *s, uint64_t offset, uint64_t bytes)
+{
+    BlockReq *req = g_new(BlockReq, 1);
+
+    reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes);
+
+    return req;
+}
+
+static void drop_read_req(BlockReq *req)
+{
+    reqlist_remove_req(req);
+    g_free(req);
+}
+
+int fleecing_read_lock(FleecingState *s, int64_t offset,
+                       int64_t bytes, const BlockReq **req,
+                       int64_t *pnum)
+{
+    bool done;
+
+    QEMU_LOCK_GUARD(&s->lock);
+
+    if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
+        return -EACCES;
+    }
+
+    bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, &done, pnum);
+    if (!done) {
+        *req = add_read_req(s, offset, *pnum);
+    }
+
+    return 0;
+}
+
+void fleecing_read_unlock(FleecingState *s, const BlockReq *req)
+{
+    QEMU_LOCK_GUARD(&s->lock);
+
+    drop_read_req((BlockReq *)req);
+}
+
+void fleecing_discard(FleecingState *s, int64_t offset, int64_t bytes)
+{
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
+    }
+
+    block_copy_reset(s->bcs, offset, bytes);
+}
+
+void fleecing_mark_done_and_wait_readers(FleecingState *s, int64_t offset,
+                                         int64_t bytes)
+{
+    assert(QEMU_IS_ALIGNED(offset, block_copy_cluster_size(s->bcs)));
+    assert(QEMU_IS_ALIGNED(bytes, block_copy_cluster_size(s->bcs)));
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        bdrv_set_dirty_bitmap(s->done_bitmap, offset, bytes);
+        reqlist_wait_all(&s->frozen_read_reqs, offset, bytes, &s->lock);
+    }
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index 2d58170224..20f3c7eb68 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2341,6 +2341,8 @@  F: block/reqlist.c
 F: include/block/reqlist.h
 F: block/copy-before-write.h
 F: block/copy-before-write.c
+F: block/fleecing.h
+F: block/fleecing.c
 F: include/block/aio_task.h
 F: block/aio_task.c
 F: util/qemu-co-shared-resource.c
diff --git a/block/meson.build b/block/meson.build
index c630e9374a..caa52f49c2 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -18,6 +18,7 @@  block_ss.add(files(
   'crypto.c',
   'dirty-bitmap.c',
   'filter-compress.c',
+  'fleecing.c',
   'io.c',
   'mirror.c',
   'nbd.c',