diff mbox

[RFC] backup: export interfaces for extra serialization

Message ID 573C3490.5040003@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Changlong Xie May 18, 2016, 9:23 a.m. UTC
On 05/06/2016 11:46 PM, Stefan Hajnoczi wrote:
> Did you run stress tests where the primary is writing to the disk while
> the secondary reads from the same sectors?
>
> I thought about this some more and I'm wondering about the following
> scenario:
>
> NBD writes to secondary_disk and the guest reads from the disk at the
> same time.  There is a coroutine mutex in qcow2.c that protects both
> read and write requests, but only until they perform the data I/O.  It
> may be possible that the read request from the Secondary VM could be
> started before the NBD write but the preadv() syscall isn't entered
> because of CPU scheduling decisions.  In the meantime the
> secondary_disk->hidden_disk backup operation takes place.  With some
> unlucky timing it may be possible for the Secondary VM to read the new
> contents from secondary_disk instead of the old contents that were
> backed up into hidden_disk.
>
> Extra serialization would be needed.
> block/backup.c:wait_for_overlapping_requests() and
> block/io.c:mark_request_serialising() are good starting points for
> solving this.

I'm worried that this patch may introduce an unexpected deadlock, so I
would like to ask for comments (RFC) here.

 From 753d9a151351fb14ea774e36a2899f229a7e26ac Mon Sep 17 00:00:00 2001
From: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
Date: Wed, 18 May 2016 16:19:51 +0800
Subject: [PATCH] [RFC] backup: export interfaces for extra serialization

Normal backup(sync='none') workflow:
step 1. NBD performs an I/O write from client to server
    qcow2_co_writev
     bdrv_co_writev
      ...
        bdrv_aligned_pwritev
         notifier_with_return_list_notify -> backup_do_cow
          bdrv_driver_pwritev // write new contents

step 2. drive-backup sync=none
    backup_do_cow
    {
     wait_for_overlapping_requests
     cow_request_begin
     for(; start < end; start++) {
             bdrv_co_readv_no_serialising // read old contents from Secondary disk
             bdrv_co_writev // write old contents to hidden-disk
     }
     cow_request_end
    }

step 3. Then return to "step 1" to write the new contents to the Secondary disk.

For replication, we must make sure that we only read the old contents from
the Secondary disk in order to keep the contents consistent.

1) Replication workflow of Secondary
                                                          virtio-blk
                                                               ^
------->  1 NBD                                               |
    ||     server                                       3 replication
    ||        ^                                                ^
    ||        |           backing                 backing      |
    ||  Secondary disk 6<-------- hidden-disk 5 <-------- active-disk 4
    ||        |                         ^
    ||        '-------------------------'
    ||           drive-backup sync=none 2

Hence, we need these interfaces to implement coarse-grained serialization
between the COW of the Secondary disk and the read operation of replication.

Example code showing how to use them:

#include "block/block_backup.h"

static coroutine_fn int xxx_co_readv()
{
         CowRequest req;
         BlockJob *job = secondary_disk->bs->job;

         if (job) {
               backup_wait_for_overlapping_requests(job, start, end);
               backup_cow_request_begin(&req, job, start, end);
               ret = bdrv_co_readv();
               backup_cow_request_end(&req);
               goto out;
         }
         ret = bdrv_co_readv();
out:
         return ret;
}

Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
---
  block/backup.c               | 42 +++++++++++++++++++++++++++++++++++-------
  include/block/block_backup.h | 13 +++++++++++++
  2 files changed, 48 insertions(+), 7 deletions(-)
  create mode 100644 include/block/block_backup.h

+void backup_cow_request_end(CowRequest *req);
diff mbox

Patch

diff --git a/block/backup.c b/block/backup.c
index d5ffc32..424d29d 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -17,6 +17,7 @@ 
  #include "block/block.h"
  #include "block/block_int.h"
  #include "block/blockjob.h"
+#include "block/block_backup.h"
  #include "qapi/error.h"
  #include "qapi/qmp/qerror.h"
  #include "qemu/ratelimit.h"
@@ -27,13 +28,6 @@ 
  #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
  #define SLICE_TIME 100000000ULL /* ns */

-typedef struct CowRequest {
-    int64_t start;
-    int64_t end;
-    QLIST_ENTRY(CowRequest) list;
-    CoQueue wait_queue; /* coroutines blocked on this request */
-} CowRequest;
-
  typedef struct BackupBlockJob {
      BlockJob common;
      BlockDriverState *target;
@@ -276,6 +270,40 @@  void backup_do_checkpoint(BlockJob *job, Error **errp)
      bitmap_zero(backup_job->done_bitmap, len);
  }

+void backup_wait_for_overlapping_requests(BlockJob *job, int64_t sector_num,
+                                          int nb_sectors)
+{
+    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+    int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
+    int64_t start, end;
+
+    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+
+    start = sector_num / sectors_per_cluster;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
+    wait_for_overlapping_requests(backup_job, start, end);
+}
+
+void backup_cow_request_begin(CowRequest *req, BlockJob *job,
+                              int64_t sector_num,
+                              int nb_sectors)
+{
+    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+    int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
+    int64_t start, end;
+
+    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+
+    start = sector_num / sectors_per_cluster;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
+    cow_request_begin(req, backup_job, start, end);
+}
+
+void backup_cow_request_end(CowRequest *req)
+{
+    cow_request_end(req);
+}
+
  static const BlockJobDriver backup_job_driver = {
      .instance_size  = sizeof(BackupBlockJob),
      .job_type       = BLOCK_JOB_TYPE_BACKUP,
diff --git a/include/block/block_backup.h b/include/block/block_backup.h
new file mode 100644
index 0000000..80f5c5c
--- /dev/null
+++ b/include/block/block_backup.h
@@ -0,0 +1,13 @@ 
+typedef struct CowRequest {
+    int64_t start;
+    int64_t end;
+    QLIST_ENTRY(CowRequest) list;
+    CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+void backup_wait_for_overlapping_requests(BlockJob *job, int64_t sector_num,
+                                          int nb_sectors);
+void backup_cow_request_begin(CowRequest *req, BlockJob *job,
+                              int64_t sector_num,
+                              int nb_sectors);