diff mbox

[1/3] COLO: fix setting checkpoint-delay not working properly

Message ID 1484657864-21708-2-git-send-email-zhang.zhanghailiang@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhanghailiang Jan. 17, 2017, 12:57 p.m. UTC
If we set checkpoint-delay through command 'migrate-set-parameters',
It will not take effect until we finish last sleep chekpoint-delay,
That's will be offensive espeically when we want to change its value
from an extreme big one to a proper value.

Fix it by using timer to realize checkpoint-delay.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
---
 include/migration/colo.h      |  2 ++
 include/migration/migration.h |  5 +++++
 migration/colo.c              | 33 +++++++++++++++++++++++----------
 migration/migration.c         |  3 +++
 4 files changed, 33 insertions(+), 10 deletions(-)

Comments

Dr. David Alan Gilbert Feb. 8, 2017, 10:38 a.m. UTC | #1
* zhanghailiang (zhang.zhanghailiang@huawei.com) wrote:
> If we set checkpoint-delay through command 'migrate-set-parameters',
> It will not take effect until we finish last sleep chekpoint-delay,
> That's will be offensive espeically when we want to change its value
> from an extreme big one to a proper value.
> 
> Fix it by using timer to realize checkpoint-delay.

Yes, I think this works; you only kick the timer in COLO state,
and you create the timer before going into COLO state and clean it up
after we leave, so it feels safe.

Are you also going to kick this semaphore when doing a failover to
cause this thread to exit quickly?


> Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Dave
> ---
>  include/migration/colo.h      |  2 ++
>  include/migration/migration.h |  5 +++++
>  migration/colo.c              | 33 +++++++++++++++++++++++----------
>  migration/migration.c         |  3 +++
>  4 files changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/include/migration/colo.h b/include/migration/colo.h
> index e32eef4..2bbff9e 100644
> --- a/include/migration/colo.h
> +++ b/include/migration/colo.h
> @@ -35,4 +35,6 @@ COLOMode get_colo_mode(void);
>  
>  /* failover */
>  void colo_do_failover(MigrationState *s);
> +
> +void colo_checkpoint_notify(void *opaque);
>  #endif
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index c309d23..487ac1e 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -183,6 +183,11 @@ struct MigrationState
>      /* The RAMBlock used in the last src_page_request */
>      RAMBlock *last_req_rb;
>  
> +    /* The semaphore is used to notify COLO thread to do checkpoint */
> +    QemuSemaphore colo_checkpoint_sem;
> +    int64_t colo_checkpoint_time;
> +    QEMUTimer *colo_delay_timer;
> +
>      /* The last error that occurred */
>      Error *error;
>  };
> diff --git a/migration/colo.c b/migration/colo.c
> index 93c85c5..08b2e46 100644
> --- a/migration/colo.c
> +++ b/migration/colo.c
> @@ -302,7 +302,7 @@ static void colo_process_checkpoint(MigrationState *s)
>  {
>      QIOChannelBuffer *bioc;
>      QEMUFile *fb = NULL;
> -    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> +    int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      Error *local_err = NULL;
>      int ret;
>  
> @@ -332,26 +332,21 @@ static void colo_process_checkpoint(MigrationState *s)
>      qemu_mutex_unlock_iothread();
>      trace_colo_vm_state_change("stop", "run");
>  
> +    timer_mod(s->colo_delay_timer,
> +            current_time + s->parameters.x_checkpoint_delay);
> +
>      while (s->state == MIGRATION_STATUS_COLO) {
>          if (failover_get_state() != FAILOVER_STATUS_NONE) {
>              error_report("failover request");
>              goto out;
>          }
>  
> -        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> -        if (current_time - checkpoint_time <
> -            s->parameters.x_checkpoint_delay) {
> -            int64_t delay_ms;
> +        qemu_sem_wait(&s->colo_checkpoint_sem);
>  
> -            delay_ms = s->parameters.x_checkpoint_delay -
> -                       (current_time - checkpoint_time);
> -            g_usleep(delay_ms * 1000);
> -        }
>          ret = colo_do_checkpoint_transaction(s, bioc, fb);
>          if (ret < 0) {
>              goto out;
>          }
> -        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      }
>  
>  out:
> @@ -364,14 +359,32 @@ out:
>          qemu_fclose(fb);
>      }
>  
> +    timer_del(s->colo_delay_timer);
> +
>      if (s->rp_state.from_dst_file) {
>          qemu_fclose(s->rp_state.from_dst_file);
>      }
>  }
>  
> +void colo_checkpoint_notify(void *opaque)
> +{
> +    MigrationState *s = opaque;
> +    int64_t next_notify_time;
> +
> +    qemu_sem_post(&s->colo_checkpoint_sem);
> +    s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> +    next_notify_time = s->colo_checkpoint_time +
> +                    s->parameters.x_checkpoint_delay;
> +    timer_mod(s->colo_delay_timer, next_notify_time);
> +}
> +
>  void migrate_start_colo_process(MigrationState *s)
>  {
>      qemu_mutex_unlock_iothread();
> +    qemu_sem_init(&s->colo_checkpoint_sem, 0);
> +    s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
> +                                colo_checkpoint_notify, s);
> +
>      migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
>                        MIGRATION_STATUS_COLO);
>      colo_process_checkpoint(s);
> diff --git a/migration/migration.c b/migration/migration.c
> index f498ab8..a4861da 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -895,6 +895,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
>  
>      if (params->has_x_checkpoint_delay) {
>          s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
> +        if (migration_in_colo_state()) {
> +            colo_checkpoint_notify(s);
> +        }
>      }
>  }
>  
> -- 
> 1.8.3.1
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Zhanghailiang Feb. 8, 2017, 11:18 a.m. UTC | #2
On 2017/2/8 18:38, Dr. David Alan Gilbert wrote:
> * zhanghailiang (zhang.zhanghailiang@huawei.com) wrote:
>> If we set checkpoint-delay through command 'migrate-set-parameters',
>> It will not take effect until we finish last sleep chekpoint-delay,
>> That's will be offensive espeically when we want to change its value
>> from an extreme big one to a proper value.
>>
>> Fix it by using timer to realize checkpoint-delay.
>
> Yes, I think this works; you only kick the timer in COLO state,
> and you create the timer before going into COLO state and clean it up
> after we leave, so it feels safe.
>
> Are you also going to kick this semaphore when doing a failover to
> cause this thread to exit quickly?
>

Ha, good suggestion! I think it will work.
Accepted, thanks very much.

>
>> Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>
> Dave
>> ---
>>   include/migration/colo.h      |  2 ++
>>   include/migration/migration.h |  5 +++++
>>   migration/colo.c              | 33 +++++++++++++++++++++++----------
>>   migration/migration.c         |  3 +++
>>   4 files changed, 33 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/migration/colo.h b/include/migration/colo.h
>> index e32eef4..2bbff9e 100644
>> --- a/include/migration/colo.h
>> +++ b/include/migration/colo.h
>> @@ -35,4 +35,6 @@ COLOMode get_colo_mode(void);
>>
>>   /* failover */
>>   void colo_do_failover(MigrationState *s);
>> +
>> +void colo_checkpoint_notify(void *opaque);
>>   #endif
>> diff --git a/include/migration/migration.h b/include/migration/migration.h
>> index c309d23..487ac1e 100644
>> --- a/include/migration/migration.h
>> +++ b/include/migration/migration.h
>> @@ -183,6 +183,11 @@ struct MigrationState
>>       /* The RAMBlock used in the last src_page_request */
>>       RAMBlock *last_req_rb;
>>
>> +    /* The semaphore is used to notify COLO thread to do checkpoint */
>> +    QemuSemaphore colo_checkpoint_sem;
>> +    int64_t colo_checkpoint_time;
>> +    QEMUTimer *colo_delay_timer;
>> +
>>       /* The last error that occurred */
>>       Error *error;
>>   };
>> diff --git a/migration/colo.c b/migration/colo.c
>> index 93c85c5..08b2e46 100644
>> --- a/migration/colo.c
>> +++ b/migration/colo.c
>> @@ -302,7 +302,7 @@ static void colo_process_checkpoint(MigrationState *s)
>>   {
>>       QIOChannelBuffer *bioc;
>>       QEMUFile *fb = NULL;
>> -    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>> +    int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>>       Error *local_err = NULL;
>>       int ret;
>>
>> @@ -332,26 +332,21 @@ static void colo_process_checkpoint(MigrationState *s)
>>       qemu_mutex_unlock_iothread();
>>       trace_colo_vm_state_change("stop", "run");
>>
>> +    timer_mod(s->colo_delay_timer,
>> +            current_time + s->parameters.x_checkpoint_delay);
>> +
>>       while (s->state == MIGRATION_STATUS_COLO) {
>>           if (failover_get_state() != FAILOVER_STATUS_NONE) {
>>               error_report("failover request");
>>               goto out;
>>           }
>>
>> -        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>> -        if (current_time - checkpoint_time <
>> -            s->parameters.x_checkpoint_delay) {
>> -            int64_t delay_ms;
>> +        qemu_sem_wait(&s->colo_checkpoint_sem);
>>
>> -            delay_ms = s->parameters.x_checkpoint_delay -
>> -                       (current_time - checkpoint_time);
>> -            g_usleep(delay_ms * 1000);
>> -        }
>>           ret = colo_do_checkpoint_transaction(s, bioc, fb);
>>           if (ret < 0) {
>>               goto out;
>>           }
>> -        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>>       }
>>
>>   out:
>> @@ -364,14 +359,32 @@ out:
>>           qemu_fclose(fb);
>>       }
>>
>> +    timer_del(s->colo_delay_timer);
>> +
>>       if (s->rp_state.from_dst_file) {
>>           qemu_fclose(s->rp_state.from_dst_file);
>>       }
>>   }
>>
>> +void colo_checkpoint_notify(void *opaque)
>> +{
>> +    MigrationState *s = opaque;
>> +    int64_t next_notify_time;
>> +
>> +    qemu_sem_post(&s->colo_checkpoint_sem);
>> +    s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>> +    next_notify_time = s->colo_checkpoint_time +
>> +                    s->parameters.x_checkpoint_delay;
>> +    timer_mod(s->colo_delay_timer, next_notify_time);
>> +}
>> +
>>   void migrate_start_colo_process(MigrationState *s)
>>   {
>>       qemu_mutex_unlock_iothread();
>> +    qemu_sem_init(&s->colo_checkpoint_sem, 0);
>> +    s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
>> +                                colo_checkpoint_notify, s);
>> +
>>       migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
>>                         MIGRATION_STATUS_COLO);
>>       colo_process_checkpoint(s);
>> diff --git a/migration/migration.c b/migration/migration.c
>> index f498ab8..a4861da 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -895,6 +895,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
>>
>>       if (params->has_x_checkpoint_delay) {
>>           s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
>> +        if (migration_in_colo_state()) {
>> +            colo_checkpoint_notify(s);
>> +        }
>>       }
>>   }
>>
>> --
>> 1.8.3.1
>>
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
> .
>
diff mbox

Patch

diff --git a/include/migration/colo.h b/include/migration/colo.h
index e32eef4..2bbff9e 100644
--- a/include/migration/colo.h
+++ b/include/migration/colo.h
@@ -35,4 +35,6 @@  COLOMode get_colo_mode(void);
 
 /* failover */
 void colo_do_failover(MigrationState *s);
+
+void colo_checkpoint_notify(void *opaque);
 #endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index c309d23..487ac1e 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -183,6 +183,11 @@  struct MigrationState
     /* The RAMBlock used in the last src_page_request */
     RAMBlock *last_req_rb;
 
+    /* The semaphore is used to notify COLO thread to do checkpoint */
+    QemuSemaphore colo_checkpoint_sem;
+    int64_t colo_checkpoint_time;
+    QEMUTimer *colo_delay_timer;
+
     /* The last error that occurred */
     Error *error;
 };
diff --git a/migration/colo.c b/migration/colo.c
index 93c85c5..08b2e46 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -302,7 +302,7 @@  static void colo_process_checkpoint(MigrationState *s)
 {
     QIOChannelBuffer *bioc;
     QEMUFile *fb = NULL;
-    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     Error *local_err = NULL;
     int ret;
 
@@ -332,26 +332,21 @@  static void colo_process_checkpoint(MigrationState *s)
     qemu_mutex_unlock_iothread();
     trace_colo_vm_state_change("stop", "run");
 
+    timer_mod(s->colo_delay_timer,
+            current_time + s->parameters.x_checkpoint_delay);
+
     while (s->state == MIGRATION_STATUS_COLO) {
         if (failover_get_state() != FAILOVER_STATUS_NONE) {
             error_report("failover request");
             goto out;
         }
 
-        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
-        if (current_time - checkpoint_time <
-            s->parameters.x_checkpoint_delay) {
-            int64_t delay_ms;
+        qemu_sem_wait(&s->colo_checkpoint_sem);
 
-            delay_ms = s->parameters.x_checkpoint_delay -
-                       (current_time - checkpoint_time);
-            g_usleep(delay_ms * 1000);
-        }
         ret = colo_do_checkpoint_transaction(s, bioc, fb);
         if (ret < 0) {
             goto out;
         }
-        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     }
 
 out:
@@ -364,14 +359,32 @@  out:
         qemu_fclose(fb);
     }
 
+    timer_del(s->colo_delay_timer);
+
     if (s->rp_state.from_dst_file) {
         qemu_fclose(s->rp_state.from_dst_file);
     }
 }
 
+void colo_checkpoint_notify(void *opaque)
+{
+    MigrationState *s = opaque;
+    int64_t next_notify_time;
+
+    qemu_sem_post(&s->colo_checkpoint_sem);
+    s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    next_notify_time = s->colo_checkpoint_time +
+                    s->parameters.x_checkpoint_delay;
+    timer_mod(s->colo_delay_timer, next_notify_time);
+}
+
 void migrate_start_colo_process(MigrationState *s)
 {
     qemu_mutex_unlock_iothread();
+    qemu_sem_init(&s->colo_checkpoint_sem, 0);
+    s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
+                                colo_checkpoint_notify, s);
+
     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
     colo_process_checkpoint(s);
diff --git a/migration/migration.c b/migration/migration.c
index f498ab8..a4861da 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -895,6 +895,9 @@  void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
 
     if (params->has_x_checkpoint_delay) {
         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
+        if (migration_in_colo_state()) {
+            colo_checkpoint_notify(s);
+        }
     }
 }