Message ID | 596a6f07850002a09461f317afa75f3e0c9bb784.1570280098.git.lukasstraub2@web.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | colo: Add support for continuous replication | expand |
On Sat, 5 Oct 2019 15:05:23 +0200 Lukas Straub <lukasstraub2@web.de> wrote: > After failover the Secondary side of replication shouldn't change state, because > it now functions as our primary disk. > > In replication_start, replication_do_checkpoint, replication_stop, ignore > the request if current state is BLOCK_REPLICATION_DONE (successful failover) or > BLOCK_REPLICATION_FAILOVER (failover in progress, i.e. currently merging active > and hidden images into the base image). > > Signed-off-by: Lukas Straub <lukasstraub2@web.de> > Reviewed-by: Zhang Chen <chen.zhang@intel.com> > --- > block/replication.c | 38 +++++++++++++++++++++++++++++++++++--- > 1 file changed, 35 insertions(+), 3 deletions(-) > > diff --git a/block/replication.c b/block/replication.c > index 3d4dedddfc..97cc65c0cf 100644 > --- a/block/replication.c > +++ b/block/replication.c > @@ -454,6 +454,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, > aio_context_acquire(aio_context); > s = bs->opaque; > > + if (s->stage == BLOCK_REPLICATION_DONE || > + s->stage == BLOCK_REPLICATION_FAILOVER) { > + /* > + * This case happens when a secondary is promoted to primary. > + * Ignore the request because the secondary side of replication > + * doesn't have to do anything anymore. 
> + */ > + aio_context_release(aio_context); > + return; > + } > + > if (s->stage != BLOCK_REPLICATION_NONE) { > error_setg(errp, "Block replication is running or done"); > aio_context_release(aio_context); > @@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, > "Block device is in use by internal backup job"); > > top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); > - if (!top_bs || !bdrv_is_root_node(top_bs) || > - !check_top_bs(top_bs, bs)) { > + if (!top_bs || !check_top_bs(top_bs, bs)) { > error_setg(errp, "No top_bs or it is invalid"); > reopen_backing_file(bs, false, NULL); > aio_context_release(aio_context); > @@ -577,6 +587,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp) > aio_context_acquire(aio_context); > s = bs->opaque; > > + if (s->stage == BLOCK_REPLICATION_DONE || > + s->stage == BLOCK_REPLICATION_FAILOVER) { > + /* > + * This case happens when a secondary was promoted to primary. > + * Ignore the request because the secondary side of replication > + * doesn't have to do anything anymore. > + */ > + aio_context_release(aio_context); > + return; > + } > + > if (s->mode == REPLICATION_MODE_SECONDARY) { > secondary_do_checkpoint(s, errp); > } > @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) > aio_context_acquire(aio_context); > s = bs->opaque; > > - if (s->stage != BLOCK_REPLICATION_RUNNING) { > + if (s->stage == BLOCK_REPLICATION_NONE) { > error_setg(errp, "Block replication is not running"); > aio_context_release(aio_context); > return; > @@ -635,6 +656,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) > aio_context_acquire(aio_context); > s = bs->opaque; > > + if (s->stage == BLOCK_REPLICATION_DONE || > + s->stage == BLOCK_REPLICATION_FAILOVER) { > + /* > + * This case happens when a secondary was promoted to primary. 
> + * Ignore the request because the secondary side of replication > + * doesn't have to do anything anymore. > + */ > + aio_context_release(aio_context); > + return; > + } > + > if (s->stage != BLOCK_REPLICATION_RUNNING) { > error_setg(errp, "Block replication is not running"); > aio_context_release(aio_context); Hello Everyone, Could the block people have a look at this patch? Regards, Lukas Straub
> -----Original Message----- > From: Lukas Straub <lukasstraub2@web.de> > Sent: Saturday, October 19, 2019 2:46 AM > To: qemu-devel <qemu-devel@nongnu.org> > Cc: Zhang, Chen <chen.zhang@intel.com>; Jason Wang > <jasowang@redhat.com>; Wen Congyang <wencongyang2@huawei.com>; > Xie Changlong <xiechanglong.d@gmail.com>; Kevin Wolf > <kwolf@redhat.com>; Max Reitz <mreitz@redhat.com>; qemu-block > <qemu-block@nongnu.org> > Subject: Re: [PATCH v6 1/4] block/replication.c: Ignore requests after failover > > On Sat, 5 Oct 2019 15:05:23 +0200 > Lukas Straub <lukasstraub2@web.de> wrote: > > > After failover the Secondary side of replication shouldn't change > > state, because it now functions as our primary disk. > > > > In replication_start, replication_do_checkpoint, replication_stop, > > ignore the request if current state is BLOCK_REPLICATION_DONE > > (sucessful failover) or BLOCK_REPLICATION_FAILOVER (failover in > > progres i.e. currently merging active and hidden images into the base > image). > > > > Signed-off-by: Lukas Straub <lukasstraub2@web.de> > > Reviewed-by: Zhang Chen <chen.zhang@intel.com> > > --- > > block/replication.c | 38 +++++++++++++++++++++++++++++++++++--- > > 1 file changed, 35 insertions(+), 3 deletions(-) > > > > diff --git a/block/replication.c b/block/replication.c index > > 3d4dedddfc..97cc65c0cf 100644 > > --- a/block/replication.c > > +++ b/block/replication.c > > @@ -454,6 +454,17 @@ static void replication_start(ReplicationState *rs, > ReplicationMode mode, > > aio_context_acquire(aio_context); > > s = bs->opaque; > > > > + if (s->stage == BLOCK_REPLICATION_DONE || > > + s->stage == BLOCK_REPLICATION_FAILOVER) { > > + /* > > + * This case happens when a secondary is promoted to primary. > > + * Ignore the request because the secondary side of replication > > + * doesn't have to do anything anymore. 
> > + */ > > + aio_context_release(aio_context); > > + return; > > + } > > + > > if (s->stage != BLOCK_REPLICATION_NONE) { > > error_setg(errp, "Block replication is running or done"); > > aio_context_release(aio_context); @@ -529,8 +540,7 @@ static > > void replication_start(ReplicationState *rs, ReplicationMode mode, > > "Block device is in use by internal backup job"); > > > > top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); > > - if (!top_bs || !bdrv_is_root_node(top_bs) || > > - !check_top_bs(top_bs, bs)) { > > + if (!top_bs || !check_top_bs(top_bs, bs)) { > > error_setg(errp, "No top_bs or it is invalid"); > > reopen_backing_file(bs, false, NULL); > > aio_context_release(aio_context); @@ -577,6 +587,17 @@ > > static void replication_do_checkpoint(ReplicationState *rs, Error **errp) > > aio_context_acquire(aio_context); > > s = bs->opaque; > > > > + if (s->stage == BLOCK_REPLICATION_DONE || > > + s->stage == BLOCK_REPLICATION_FAILOVER) { > > + /* > > + * This case happens when a secondary was promoted to primary. > > + * Ignore the request because the secondary side of replication > > + * doesn't have to do anything anymore. 
> > + */ > > + aio_context_release(aio_context); > > + return; > > + } > > + > > if (s->mode == REPLICATION_MODE_SECONDARY) { > > secondary_do_checkpoint(s, errp); > > } > > @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState > *rs, Error **errp) > > aio_context_acquire(aio_context); > > s = bs->opaque; > > > > - if (s->stage != BLOCK_REPLICATION_RUNNING) { > > + if (s->stage == BLOCK_REPLICATION_NONE) { > > error_setg(errp, "Block replication is not running"); > > aio_context_release(aio_context); > > return; > > @@ -635,6 +656,17 @@ static void replication_stop(ReplicationState *rs, > bool failover, Error **errp) > > aio_context_acquire(aio_context); > > s = bs->opaque; > > > > + if (s->stage == BLOCK_REPLICATION_DONE || > > + s->stage == BLOCK_REPLICATION_FAILOVER) { > > + /* > > + * This case happens when a secondary was promoted to primary. > > + * Ignore the request because the secondary side of replication > > + * doesn't have to do anything anymore. > > + */ > > + aio_context_release(aio_context); > > + return; > > + } > > + > > if (s->stage != BLOCK_REPLICATION_RUNNING) { > > error_setg(errp, "Block replication is not running"); > > aio_context_release(aio_context); > > Hello Everyone, > Could the block people have a look at this patch? Add Dave, do you have time to review this series? Thanks Zhang Chen > > Regards, > Lukas Straub
On 05.10.19 15:05, Lukas Straub wrote: > After failover the Secondary side of replication shouldn't change state, because > it now functions as our primary disk. > > In replication_start, replication_do_checkpoint, replication_stop, ignore > the request if current state is BLOCK_REPLICATION_DONE (sucessful failover) or > BLOCK_REPLICATION_FAILOVER (failover in progres i.e. currently merging active > and hidden images into the base image). > > Signed-off-by: Lukas Straub <lukasstraub2@web.de> > Reviewed-by: Zhang Chen <chen.zhang@intel.com> > --- > block/replication.c | 38 +++++++++++++++++++++++++++++++++++--- > 1 file changed, 35 insertions(+), 3 deletions(-) Disclaimer: I don’t know anything about the replication block driver. > diff --git a/block/replication.c b/block/replication.c > index 3d4dedddfc..97cc65c0cf 100644 > --- a/block/replication.c > +++ b/block/replication.c [...] > @@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, > "Block device is in use by internal backup job"); > > top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); > - if (!top_bs || !bdrv_is_root_node(top_bs) || > - !check_top_bs(top_bs, bs)) { > + if (!top_bs || !check_top_bs(top_bs, bs)) { It appears to me that top_bs is only used to install op blockers. It seems reasonable to require a root node to be able to do so (because op blockers are really only checked on a root node). (And the commit message doesn’t tell why we’d want to drop the is_root_node check here.) Now OTOH I don’t know whether the replication driver needs an op blocker at all or whether the permission system suffices... I suppose the rest of this patch is not really about the block layer, so I can’t really comment on it. (It looks OK to me from a generic and naïve standpoint, though.) > error_setg(errp, "No top_bs or it is invalid"); > reopen_backing_file(bs, false, NULL); > aio_context_release(aio_context); [...] 
> @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) > aio_context_acquire(aio_context); > s = bs->opaque; > > - if (s->stage != BLOCK_REPLICATION_RUNNING) { > + if (s->stage == BLOCK_REPLICATION_NONE) { Just one question out of curiosity, though: Is this a bug fix? Max > error_setg(errp, "Block replication is not running"); > aio_context_release(aio_context); > return;
On Wed, 23 Oct 2019 14:49:29 +0200 Max Reitz <mreitz@redhat.com> wrote: > On 05.10.19 15:05, Lukas Straub wrote: > > After failover the Secondary side of replication shouldn't change state, because > > it now functions as our primary disk. > > > > In replication_start, replication_do_checkpoint, replication_stop, ignore > > the request if current state is BLOCK_REPLICATION_DONE (sucessful failover) or > > BLOCK_REPLICATION_FAILOVER (failover in progres i.e. currently merging active > > and hidden images into the base image). > > > > Signed-off-by: Lukas Straub <lukasstraub2@web.de> > > Reviewed-by: Zhang Chen <chen.zhang@intel.com> > > --- > > block/replication.c | 38 +++++++++++++++++++++++++++++++++++--- > > 1 file changed, 35 insertions(+), 3 deletions(-) > > Disclaimer: I don’t know anything about the replication block driver. > > > diff --git a/block/replication.c b/block/replication.c > > index 3d4dedddfc..97cc65c0cf 100644 > > --- a/block/replication.c > > +++ b/block/replication.c > > [...] > > > @@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, > > "Block device is in use by internal backup job"); > > > > top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); > > - if (!top_bs || !bdrv_is_root_node(top_bs) || > > - !check_top_bs(top_bs, bs)) { > > + if (!top_bs || !check_top_bs(top_bs, bs)) { > > It appears to me that top_bs is only used to install op blockers. It > seems reasonable to require a root node to be able to do so (because op > blockers are really only checked on a root node). > (And the commit message doesn’t tell why we’d want to drop the > is_root_node check here.) > > Now OTOH I don’t know whether the replication driver needs an op blocker > at all or whether the permission system suffices... Hi, Now that I look at it again, it actually works without this change, by passing a correct top-id= parameter to the driver (I somehow overlooked that parameter). So I will revert this change in the next version. 
> > I suppose the rest of this patch is not really about the block layer, so > I can’t really comment on it. (It looks OK to me from a generic and > naïve standpoint, though.) > > > error_setg(errp, "No top_bs or it is invalid"); > > reopen_backing_file(bs, false, NULL); > > aio_context_release(aio_context); > > [...] > > > @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) > > aio_context_acquire(aio_context); > > s = bs->opaque; > > > > - if (s->stage != BLOCK_REPLICATION_RUNNING) { > > + if (s->stage == BLOCK_REPLICATION_NONE) { > > Just one question out of curiosity, though: Is this a bug fix? No, it only applies to continuous replication, because COLO will check all replication nodes for errors before checkpointing. So a secondary continuing replication would error out here, because it is either in state BLOCK_REPLICATION_DONE or BLOCK_REPLICATION_FAILOVER. > Max > > > error_setg(errp, "Block replication is not running"); > > aio_context_release(aio_context); > > return; >
diff --git a/block/replication.c b/block/replication.c index 3d4dedddfc..97cc65c0cf 100644 --- a/block/replication.c +++ b/block/replication.c @@ -454,6 +454,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary is promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is running or done"); aio_context_release(aio_context); @@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, "Block device is in use by internal backup job"); top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); - if (!top_bs || !bdrv_is_root_node(top_bs) || - !check_top_bs(top_bs, bs)) { + if (!top_bs || !check_top_bs(top_bs, bs)) { error_setg(errp, "No top_bs or it is invalid"); reopen_backing_file(bs, false, NULL); aio_context_release(aio_context); @@ -577,6 +587,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. 
+ */ + aio_context_release(aio_context); + return; + } + if (s->mode == REPLICATION_MODE_SECONDARY) { secondary_do_checkpoint(s, errp); } @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; - if (s->stage != BLOCK_REPLICATION_RUNNING) { + if (s->stage == BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context); return; @@ -635,6 +656,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_RUNNING) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context);