new file mode 100644
@@ -0,0 +1,135 @@
+/*
+ * FleecingState
+ *
+ * The common state of image fleecing, shared between copy-before-write filter
+ * and fleecing block driver.
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Fleecing scheme looks as follows:
+ *
+ *    [guest blk]                   [nbd export]
+ *       |                              |
+ *       |root                          |
+ *       v                              v
+ *    [copy-before-write]--target-->[fleecing drv]
+ *       |                          /   |
+ *       |file                     /    |file
+ *       v                        /     v
+ *    [active disk]<--source-----/   [temp disk]
+ *
+ * Note that "active disk" is also called just "source" and "temp disk" is also
+ * called "target".
+ *
+ * What happens here:
+ *
+ * The copy-before-write filter performs copy-before-write operations: on a
+ * guest write we must copy the old data to the target child before it is
+ * overwritten. Note that this data is written through the fleecing driver,
+ * which leaves open the possibility of implementing a kind of cache in the
+ * fleecing driver in the future.
+ *
+ * Fleecing user is nbd export: it can read from fleecing node, which guarantees
+ * a snapshot-view for fleecing user. Fleecing user may also do discard
+ * operations.
+ *
+ * FleecingState is responsible for most of the fleecing logic:
+ *
+ * 1. Fleecing read. Handle reads of the fleecing user: we should decide where
+ * to read from, the source node or the copy-before-write target node. In the
+ * former case we need to synchronize with guest writes. See
+ * fleecing_read_lock() and fleecing_read_unlock() functionality.
+ *
+ * 2. Guest write synchronization (part of [1] actually). See
+ * fleecing_mark_done_and_wait_readers()
+ *
+ * 3. Fleecing discard. Used by fleecing user when corresponding area is already
+ * copied. Fleecing user may discard the area which is not needed anymore, that
+ * should result in:
+ * - discarding data to free disk space
+ * - clear bits in copy-bitmap of block-copy, to avoid extra copy-before-write
+ * operations
+ * - clear bits in access-bitmap of FleecingState, to avoid further wrong
+ * access
+ *
+ * Still, FleecingState doesn't own any block children, so all real io
+ * operations (reads, writes and discards) are done by copy-before-write filter
+ * and fleecing block driver.
+ */
+
+#ifndef FLEECING_H
+#define FLEECING_H
+
+#include "block/block_int.h"
+#include "block/block-copy.h"
+#include "block/reqlist.h"
+
+/* Opaque to users of this header; the structure is defined in fleecing.c. */
+typedef struct FleecingState FleecingState;
+
+/*
+ * Create FleecingState.
+ *
+ * @bcs: link to block-copy owned by copy-before-write filter. Its cluster size
+ * is used as granularity of the bitmaps created here, and its dirty bitmap is
+ * the initial content of the access bitmap.
+ *
+ * @fleecing_node: should be fleecing block driver node. Used to create some
+ * bitmaps in it.
+ *
+ * @errp: passed to bitmap creation, which reports its failures through it.
+ *
+ * Returns the new state, or NULL on failure.
+ */
+FleecingState *fleecing_new(BlockCopyState *bcs,
+ BlockDriverState *fleecing_node,
+ Error **errp);
+
+/*
+ * Free the state and release the bitmaps it holds. Doesn't free block-copy
+ * state (@bcs). Passing NULL is allowed and does nothing.
+ */
+void fleecing_free(FleecingState *s);
+
+/*
+ * Convenience function for those who want to do a fleecing read.
+ *
+ * If the requested region starts in a "done" area, i.e. data is already copied
+ * to the copy-before-write target node, @req is set to NULL and @pnum is set
+ * to the number of bytes available to read from target. User is free to read
+ * @pnum bytes from target. Still, user is responsible for concurrent discards
+ * on target.
+ * NOTE(review): confirm that the implementation really stores NULL into @req
+ * on this path; until then callers should initialize *@req to NULL themselves.
+ *
+ * If the requested region starts in a "not done" area, i.e. we have to read
+ * from the source node directly, then @pnum bytes of the source node are
+ * frozen and guaranteed not to be rewritten until user calls
+ * fleecing_read_unlock().
+ *
+ * Returns 0 on success and -EACCES when trying to read a region that is not
+ * entirely dirty in access_bitmap.
+ */
+int fleecing_read_lock(FleecingState *f, int64_t offset,
+ int64_t bytes, const BlockReq **req, int64_t *pnum);
+/*
+ * Called as closing pair for fleecing_read_lock(): removes @req from the list
+ * of frozen read requests and frees it.
+ */
+void fleecing_read_unlock(FleecingState *f, const BlockReq *req);
+
+/*
+ * Called when fleecing user doesn't need the region anymore (for example the
+ * region is successfully read and backed up somewhere).
+ * Clears the region in the access bitmap and resets it in the block-copy
+ * state.
+ * This prevents extra copy-before-write operations in this area in future.
+ * Next fleecing read from this area will fail with -EACCES.
+ */
+void fleecing_discard(FleecingState *f, int64_t offset, int64_t bytes);
+
+/*
+ * Called by copy-before-write filter after successful copy-before-write
+ * operation to synchronize with parallel fleecing reads.
+ *
+ * @offset and @bytes must be aligned to the block-copy cluster size (this is
+ * asserted by the implementation).
+ */
+void fleecing_mark_done_and_wait_readers(FleecingState *f, int64_t offset,
+ int64_t bytes);
+
+#endif /* FLEECING_H */
new file mode 100644
@@ -0,0 +1,182 @@
+/*
+ * FleecingState
+ *
+ * The common state of image fleecing, shared between copy-before-write filter
+ * and fleecing block driver.
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "block/block_int.h"
+#include "block/coroutines.h"
+#include "block/qdict.h"
+#include "block/block-copy.h"
+#include "block/reqlist.h"
+
+#include "block/fleecing.h"
+
+/*
+ * State shared between the copy-before-write filter and the fleecing block
+ * driver.
+ *
+ * @bcs: link to block-copy state owned by copy-before-write filter which
+ * performs copy-before-write operations in context of fleecing scheme.
+ * FleecingState doesn't own the block-copy state and doesn't free it on
+ * cleanup.
+ *
+ * @lock: protects access to @access_bitmap, @done_bitmap and @frozen_read_reqs
+ *
+ * @access_bitmap: represents areas allowed for reading by fleecing user.
+ * Reading from non-dirty areas leads to -EACCES. Discard operation, among
+ * other things, clears the corresponding bits in this bitmap.
+ *
+ * @done_bitmap: represents areas that were successfully copied by
+ * copy-before-write operations. So, for dirty areas fleecing user should read
+ * from target node and for clear areas - from source node.
+ *
+ * @frozen_read_reqs: current read requests of fleecing user in source node.
+ * Corresponding areas must not be rewritten by guest.
+ */
+typedef struct FleecingState {
+ BlockCopyState *bcs;
+
+ CoMutex lock;
+
+ BdrvDirtyBitmap *access_bitmap;
+ BdrvDirtyBitmap *done_bitmap;
+
+ BlockReqList frozen_read_reqs;
+} FleecingState;
+
+/*
+ * Create a FleecingState bound to the block-copy state @bcs.
+ *
+ * Two disabled dirty bitmaps are created in @fleecing_node, both using the
+ * block-copy cluster size as granularity:
+ *  - done_bitmap, which starts empty;
+ *  - access_bitmap, which starts equal to the block-copy dirty bitmap.
+ *
+ * Returns the new state, or NULL with @errp set on failure. On failure every
+ * bitmap created so far is released again, so nothing is left behind in
+ * @fleecing_node.
+ */
+FleecingState *fleecing_new(BlockCopyState *bcs,
+                            BlockDriverState *fleecing_node,
+                            Error **errp)
+{
+    BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(bcs);
+    BdrvDirtyBitmap *done_bitmap = NULL;
+    BdrvDirtyBitmap *access_bitmap = NULL;
+    int64_t cluster_size = block_copy_cluster_size(bcs);
+    FleecingState *s;
+
+    /* done_bitmap starts empty */
+    done_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL,
+                                           errp);
+    if (!done_bitmap) {
+        return NULL;
+    }
+    bdrv_disable_dirty_bitmap(done_bitmap);
+
+    /* access_bitmap starts equal to bcs_bitmap */
+    access_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL,
+                                             errp);
+    if (!access_bitmap) {
+        goto fail;
+    }
+    bdrv_disable_dirty_bitmap(access_bitmap);
+    if (!bdrv_dirty_bitmap_merge_internal(access_bitmap, bcs_bitmap,
+                                          NULL, true))
+    {
+        /* The merge helper takes no errp, so report the failure here. */
+        error_setg(errp, "Failed to initialize fleecing access bitmap "
+                   "from block-copy bitmap");
+        goto fail;
+    }
+
+    s = g_new(FleecingState, 1);
+    *s = (FleecingState) {
+        .bcs = bcs,
+        .done_bitmap = done_bitmap,
+        .access_bitmap = access_bitmap,
+    };
+    qemu_co_mutex_init(&s->lock);
+    QLIST_INIT(&s->frozen_read_reqs);
+
+    return s;
+
+fail:
+    /* Undo partial initialization: don't leak bitmaps in @fleecing_node. */
+    if (access_bitmap) {
+        bdrv_release_dirty_bitmap(access_bitmap);
+    }
+    bdrv_release_dirty_bitmap(done_bitmap);
+    return NULL;
+}
+
+/*
+ * Destroy a FleecingState: release both bitmaps it holds and free the
+ * structure itself. The block-copy state (s->bcs) is not touched.
+ * A NULL @s is accepted and ignored.
+ */
+void fleecing_free(FleecingState *s)
+{
+    if (s) {
+        bdrv_release_dirty_bitmap(s->access_bitmap);
+        bdrv_release_dirty_bitmap(s->done_bitmap);
+        g_free(s);
+    }
+}
+
+/*
+ * Allocate a read request for the region [@offset, @offset + @bytes) and
+ * initialize it against the frozen_read_reqs list via reqlist_init_req().
+ * Called from fleecing_read_lock() with s->lock held.
+ * The returned request is released with drop_read_req().
+ */
+static BlockReq *add_read_req(FleecingState *s, uint64_t offset, uint64_t bytes)
+{
+    BlockReq *frozen = g_new(BlockReq, 1);
+
+    reqlist_init_req(&s->frozen_read_reqs, frozen, offset, bytes);
+    return frozen;
+}
+
+/*
+ * Counterpart of add_read_req(): take @req out of its request list and free
+ * it. @req must not be used by the caller afterwards.
+ */
+static void drop_read_req(BlockReq *req)
+{
+ /* Unlink first: the request must not be freed while still on the list. */
+ reqlist_remove_req(req);
+ g_free(req);
+}
+
+/*
+ * Prepare a fleecing read of the region [@offset, @offset + @bytes).
+ *
+ * The whole region must be dirty in access_bitmap, otherwise -EACCES is
+ * returned. On success @pnum is set to the length of the leading part of the
+ * region that has a uniform state in done_bitmap, and:
+ *  - if that part is "done" (already copied to target), *@req is NULL and the
+ *    caller reads from the target node;
+ *  - otherwise a frozen read request covering @pnum bytes is registered and
+ *    returned in *@req; the caller reads from the source node and must call
+ *    fleecing_read_unlock() afterwards.
+ *
+ * *@req is always written, so it is NULL on every path that does not
+ * register a request (including the -EACCES path).
+ */
+int fleecing_read_lock(FleecingState *s, int64_t offset,
+                       int64_t bytes, const BlockReq **req,
+                       int64_t *pnum)
+{
+    bool done;
+
+    /*
+     * Never leave the caller with an indeterminate pointer: only the
+     * "not done" path below produces a real request.
+     */
+    *req = NULL;
+
+    QEMU_LOCK_GUARD(&s->lock);
+
+    if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
+        return -EACCES;
+    }
+
+    bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, &done, pnum);
+    if (!done) {
+        /* Freeze only the part that is reported as "not done". */
+        *req = add_read_req(s, offset, *pnum);
+    }
+
+    return 0;
+}
+
+/*
+ * Closing pair for fleecing_read_lock(): drop the frozen read request @req.
+ *
+ * fleecing_read_lock() is documented to yield a NULL request when the data is
+ * read from the target node; in that case there is nothing to release, so a
+ * NULL @req is accepted and ignored.
+ */
+void fleecing_read_unlock(FleecingState *s, const BlockReq *req)
+{
+    if (!req) {
+        return;
+    }
+
+    QEMU_LOCK_GUARD(&s->lock);
+
+    /* The request was allocated by add_read_req(); the const is API-only. */
+    drop_read_req((BlockReq *)req);
+}
+
+/*
+ * Forget the region [@offset, @offset + @bytes): clear it in access_bitmap
+ * (under s->lock) and then reset it in the block-copy state.
+ */
+void fleecing_discard(FleecingState *s, int64_t offset, int64_t bytes)
+{
+    BlockCopyState *block_copy = s->bcs;
+
+    /* Only the bitmap update needs the lock; it is dropped before the reset. */
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
+    }
+
+    block_copy_reset(block_copy, offset, bytes);
+}
+
+/*
+ * Mark the region [@offset, @offset + @bytes) as dirty in done_bitmap and
+ * then call reqlist_wait_all() on the frozen read requests for that region.
+ * Both steps happen under s->lock, which is also handed to reqlist_wait_all().
+ *
+ * @offset and @bytes must be aligned to the block-copy cluster size.
+ */
+void fleecing_mark_done_and_wait_readers(FleecingState *s, int64_t offset,
+                                         int64_t bytes)
+{
+    const int64_t cluster_size = block_copy_cluster_size(s->bcs);
+
+    assert(QEMU_IS_ALIGNED(offset, cluster_size));
+    assert(QEMU_IS_ALIGNED(bytes, cluster_size));
+
+    /* Held until return, i.e. across both statements below. */
+    QEMU_LOCK_GUARD(&s->lock);
+
+    bdrv_set_dirty_bitmap(s->done_bitmap, offset, bytes);
+    reqlist_wait_all(&s->frozen_read_reqs, offset, bytes, &s->lock);
+}
@@ -2341,6 +2341,8 @@ F: block/reqlist.c
F: include/block/reqlist.h
F: block/copy-before-write.h
F: block/copy-before-write.c
+F: block/fleecing.h
+F: block/fleecing.c
F: include/block/aio_task.h
F: block/aio_task.c
F: util/qemu-co-shared-resource.c
@@ -18,6 +18,7 @@ block_ss.add(files(
'crypto.c',
'dirty-bitmap.c',
'filter-compress.c',
+ 'fleecing.c',
'io.c',
'mirror.c',
'nbd.c',
FleecingState represents state shared between copy-before-write filter and upcoming fleecing block driver. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> --- block/fleecing.h | 135 ++++++++++++++++++++++++++++++++++ block/fleecing.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++ MAINTAINERS | 2 + block/meson.build | 1 + 4 files changed, 320 insertions(+) create mode 100644 block/fleecing.h create mode 100644 block/fleecing.c