Message ID | 1456109555-28299-19-git-send-email-wency@cn.fujitsu.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Feb 22, 2016 at 10:52:22AM +0800, Wen Congyang wrote: [...] > - * With Remus, we buffer the records sent by the primary at checkpoint, > + * With Remus/COLO, we buffer the records sent by the primary at checkpoint, > * in case the primary will fail, we can recover from the last > * checkpoint state. > * This should be enough for most of the cases because primary only send > diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c > index aef9bca..2ae8154 100644 > --- a/tools/libxc/xc_sr_restore.c > +++ b/tools/libxc/xc_sr_restore.c > @@ -460,6 +460,49 @@ static int handle_checkpoint(struct xc_sr_context *ctx) > else > ctx->restore.buffer_all_records = true; > > + if ( ctx->restore.checkpointed == MIG_STREAM_COLO ) > + { > +#define HANDLE_CALLBACK_RETURN_VALUE(ret) \ > + do { \ > + if ( ret == 1 ) \ > + rc = 0; /* Success */ \ > + else \ > + { \ > + if ( ret == 2 ) \ > + rc = BROKEN_CHANNEL; \ > + else \ > + rc = -1; /* Some unspecified error */ \ > + goto err; \ > + } \ > + } while (0) > + > + /* COLO */ > + > + /* We need to resume guest */ > + rc = ctx->restore.ops.stream_complete(ctx); > + if ( rc ) > + goto err; > + > + /* TODO: call restore_results */ > + > + /* Resume secondary vm */ > + ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data); > + HANDLE_CALLBACK_RETURN_VALUE(ret); > + > + /* Wait for a new checkpoint */ > + ret = ctx->restore.callbacks->wait_checkpoint( > + ctx->restore.callbacks->data); > + HANDLE_CALLBACK_RETURN_VALUE(ret); > + > + /* suspend secondary vm */ > + ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data); > + HANDLE_CALLBACK_RETURN_VALUE(ret); > + > +#undef HANDLE_CALLBACK_RETURN_VALUE > + > + /* TODO: send dirty pfn list to primary */ You replace the TODOs with actual code in the next two patches. You can rearrange them a bit so that you don't need to add TODOs at all. Wei.
On 02/25/2016 11:57 PM, Wei Liu wrote: > On Mon, Feb 22, 2016 at 10:52:22AM +0800, Wen Congyang wrote: > [...] >> - * With Remus, we buffer the records sent by the primary at checkpoint, >> + * With Remus/COLO, we buffer the records sent by the primary at checkpoint, >> * in case the primary will fail, we can recover from the last >> * checkpoint state. >> * This should be enough for most of the cases because primary only send >> diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c >> index aef9bca..2ae8154 100644 >> --- a/tools/libxc/xc_sr_restore.c >> +++ b/tools/libxc/xc_sr_restore.c >> @@ -460,6 +460,49 @@ static int handle_checkpoint(struct xc_sr_context *ctx) >> else >> ctx->restore.buffer_all_records = true; >> >> + if ( ctx->restore.checkpointed == MIG_STREAM_COLO ) >> + { >> +#define HANDLE_CALLBACK_RETURN_VALUE(ret) \ >> + do { \ >> + if ( ret == 1 ) \ >> + rc = 0; /* Success */ \ >> + else \ >> + { \ >> + if ( ret == 2 ) \ >> + rc = BROKEN_CHANNEL; \ >> + else \ >> + rc = -1; /* Some unspecified error */ \ >> + goto err; \ >> + } \ >> + } while (0) >> + >> + /* COLO */ >> + >> + /* We need to resume guest */ >> + rc = ctx->restore.ops.stream_complete(ctx); >> + if ( rc ) >> + goto err; >> + >> + /* TODO: call restore_results */ >> + >> + /* Resume secondary vm */ >> + ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data); >> + HANDLE_CALLBACK_RETURN_VALUE(ret); >> + >> + /* Wait for a new checkpoint */ >> + ret = ctx->restore.callbacks->wait_checkpoint( >> + ctx->restore.callbacks->data); >> + HANDLE_CALLBACK_RETURN_VALUE(ret); >> + >> + /* suspend secondary vm */ >> + ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data); >> + HANDLE_CALLBACK_RETURN_VALUE(ret); >> + >> +#undef HANDLE_CALLBACK_RETURN_VALUE >> + >> + /* TODO: send dirty pfn list to primary */ > > You replace the TODOs with actual code in the next two patches. > > You can rearrange them a bit so that you don't need to add TODOs at all. 
Yes, I will fix it in the next version. Thanks, Wen Congyang > Wei.
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h index 2bfed64..a24a9ad 100644 --- a/tools/libxc/xc_sr_common.h +++ b/tools/libxc/xc_sr_common.h @@ -234,13 +234,13 @@ struct xc_sr_context uint32_t guest_page_size; /* Plain VM, or checkpoints over time. */ - bool checkpointed; + int checkpointed; /* Currently buffering records between a checkpoint */ bool buffer_all_records; /* - * With Remus, we buffer the records sent by the primary at checkpoint, + * With Remus/COLO, we buffer the records sent by the primary at checkpoint, * in case the primary will fail, we can recover from the last * checkpoint state. * This should be enough for most of the cases because primary only send diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c index aef9bca..2ae8154 100644 --- a/tools/libxc/xc_sr_restore.c +++ b/tools/libxc/xc_sr_restore.c @@ -460,6 +460,49 @@ static int handle_checkpoint(struct xc_sr_context *ctx) else ctx->restore.buffer_all_records = true; + if ( ctx->restore.checkpointed == MIG_STREAM_COLO ) + { +#define HANDLE_CALLBACK_RETURN_VALUE(ret) \ + do { \ + if ( ret == 1 ) \ + rc = 0; /* Success */ \ + else \ + { \ + if ( ret == 2 ) \ + rc = BROKEN_CHANNEL; \ + else \ + rc = -1; /* Some unspecified error */ \ + goto err; \ + } \ + } while (0) + + /* COLO */ + + /* We need to resume guest */ + rc = ctx->restore.ops.stream_complete(ctx); + if ( rc ) + goto err; + + /* TODO: call restore_results */ + + /* Resume secondary vm */ + ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + + /* Wait for a new checkpoint */ + ret = ctx->restore.callbacks->wait_checkpoint( + ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + + /* suspend secondary vm */ + ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + +#undef HANDLE_CALLBACK_RETURN_VALUE + + /* TODO: send dirty pfn list to primary */ + } + err: return rc; } 
@@ -631,6 +674,15 @@ static int restore(struct xc_sr_context *ctx) } while ( rec.type != REC_TYPE_END ); remus_failover: + + if ( ctx->restore.checkpointed == MIG_STREAM_COLO ) + { + /* With COLO, we have already called stream_complete */ + rc = 0; + IPRINTF("COLO Failover"); + goto done; + } + /* * With Remus, if we reach here, there must be some error on primary, * failover from the last checkpoint state. @@ -685,6 +737,14 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, if ( checkpointed_stream ) assert(callbacks->checkpoint); + if ( ctx.restore.checkpointed == MIG_STREAM_COLO ) + { + /* this is COLO restore */ + assert(callbacks->suspend && + callbacks->postcopy && + callbacks->wait_checkpoint); + } + DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d" ", checkpointed_stream %d", io_fd, dom, hvm, pae, superpages, checkpointed_stream);