diff mbox series

[v2,1/2] arm/hvf: Optimize and simplify WFI handling

Message ID 4a2d15d01087207e2fba1f55ad312727dbfc782e.1606856104.git.pcc@google.com (mailing list archive)
State New, archived
Headers show
Series [v2,1/2] arm/hvf: Optimize and simplify WFI handling | expand

Commit Message

Peter Collingbourne Dec. 1, 2020, 9 p.m. UTC
Sleep on WFx until the VTIMER is due but allow ourselves to be woken
up on IPI.

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v2:
- simplify locking further
- wait indefinitely on disabled or masked timers

 accel/hvf/hvf-cpus.c     |   5 +-
 include/sysemu/hvf_int.h |   3 +-
 target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
 3 files changed, 43 insertions(+), 81 deletions(-)

Comments

Alexander Graf Dec. 1, 2020, 11:24 p.m. UTC | #1
On 01.12.20 22:00, Peter Collingbourne wrote:
> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> up on IPI.
>
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> v2:
> - simplify locking further
> - wait indefinitely on disabled or masked timers
>
>   accel/hvf/hvf-cpus.c     |   5 +-
>   include/sysemu/hvf_int.h |   3 +-
>   target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
>   3 files changed, 43 insertions(+), 81 deletions(-)
>
> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> index 4360f64671..b2c8fb57f6 100644
> --- a/accel/hvf/hvf-cpus.c
> +++ b/accel/hvf/hvf-cpus.c
> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>       sigact.sa_handler = dummy_signal;
>       sigaction(SIG_IPI, &sigact, NULL);
>   
> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> -    sigdelset(&set, SIG_IPI);
> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>   
>   #ifdef __aarch64__
>       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> index c56baa3ae8..13adf6ea77 100644
> --- a/include/sysemu/hvf_int.h
> +++ b/include/sysemu/hvf_int.h
> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>   struct hvf_vcpu_state {
>       uint64_t fd;
>       void *exit;
> -    struct timespec ts;
> -    bool sleeping;
> +    sigset_t unblock_ipi_mask;
>   };
>   
>   void assert_hvf_ok(hv_return_t ret);
> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> index 8fe10966d2..3321d48aa2 100644
> --- a/target/arm/hvf/hvf.c
> +++ b/target/arm/hvf/hvf.c
> @@ -2,6 +2,7 @@
>    * QEMU Hypervisor.framework support for Apple Silicon
>   
>    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> + * Copyright 2020 Google LLC
>    *
>    * This work is licensed under the terms of the GNU GPL, version 2 or later.
>    * See the COPYING file in the top-level directory.
> @@ -18,6 +19,7 @@
>   #include "sysemu/hw_accel.h"
>   
>   #include <Hypervisor/Hypervisor.h>
> +#include <mach/mach_time.h>
>   
>   #include "exec/address-spaces.h"
>   #include "hw/irq.h"
> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>   
>   void hvf_kick_vcpu_thread(CPUState *cpu)
>   {
> -    if (cpu->hvf->sleeping) {
> -        /*
> -         * When sleeping, make sure we always send signals. Also, clear the
> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> -         * and the nanosleep syscall still aborts the sleep.
> -         */
> -        cpu->thread_kicked = false;
> -        cpu->hvf->ts = (struct timespec){ };
> -        cpus_kick_thread(cpu);
> -    } else {
> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> -    }
> +    cpus_kick_thread(cpu);
> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>   }
>   
>   static int hvf_inject_interrupts(CPUState *cpu)
> @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
>       return 0;
>   }
>   
> +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
> +{
> +    /*
> +     * Use pselect to sleep so that other threads can IPI us while we're
> +     * sleeping.
> +     */
> +    qatomic_mb_set(&cpu->thread_kicked, false);
> +    qemu_mutex_unlock_iothread();
> +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
> +    qemu_mutex_lock_iothread();
> +}
> +
>   int hvf_vcpu_exec(CPUState *cpu)
>   {
>       ARMCPU *arm_cpu = ARM_CPU(cpu);
> @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
>       hv_return_t r;
>       int ret = 0;
>   
> -    qemu_mutex_unlock_iothread();
> -
>       do {
>           bool advance_pc = false;
>   
> -        qemu_mutex_lock_iothread();
>           current_cpu = cpu;
>           qemu_wait_io_event_common(cpu);
> -        qemu_mutex_unlock_iothread();
>   
>           flush_cpu_state(cpu);
>   
> @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
>           }
>   
>           if (cpu->halted) {
> -            qemu_mutex_lock_iothread();
>               return EXCP_HLT;
>           }
>   
> +        qemu_mutex_unlock_iothread();
>           assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
>   
>           /* handle VMEXIT */
> @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
>           uint64_t syndrome = hvf_exit->exception.syndrome;
>           uint32_t ec = syn_get_ec(syndrome);
>   
> +        qemu_mutex_lock_iothread();
>           switch (exit_reason) {
>           case HV_EXIT_REASON_EXCEPTION:
>               /* This is the main one, handle below. */
>               break;
>           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> -            qemu_mutex_unlock_iothread();
>               continue;
>           case HV_EXIT_REASON_CANCELED:
>               /* we got kicked, no exit to process */
> @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>               uint32_t srt = (syndrome >> 16) & 0x1f;
>               uint64_t val = 0;
>   
> -            qemu_mutex_lock_iothread();
>               current_cpu = cpu;
>   
>               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>                   hvf_set_reg(cpu, srt, val);
>               }
>   
> -            qemu_mutex_unlock_iothread();
> -
>               advance_pc = true;
>               break;
>           }
> @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
>           case EC_WFX_TRAP:
>               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> -                uint64_t cval, ctl, val, diff, now;
> +                advance_pc = true;
>   
> -                /* Set up a local timer for vtimer if necessary ... */
> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> -                assert_hvf_ok(r);
> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> +                uint64_t ctl;
> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
> +                                        &ctl);
>                   assert_hvf_ok(r);
>   
> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> -                diff = cval - val;
> -
> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> -                      gt_cntfrq_period_ns(arm_cpu);
> -
> -                /* Timer disabled or masked, just wait for long */
>                   if (!(ctl & 1) || (ctl & 2)) {
> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> -                           gt_cntfrq_period_ns(arm_cpu);
> +                    /* Timer disabled or masked, just wait for an IPI. */
> +                    hvf_wait_for_ipi(cpu, NULL);
> +                    break;
>                   }
>   
> -                if (diff < INT64_MAX) {
> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> -                    struct timespec *ts = &cpu->hvf->ts;
> -
> -                    *ts = (struct timespec){
> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> -                    };
> -
> -                    /*
> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> -                     * time periods than 2ms.
> -                     */
> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> -                        advance_pc = true;
> -                        break;
> -                    }
> -
> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> -                    cpu->hvf->sleeping = true;
> -                    smp_mb();
> -
> -                    /* Bail out if we received an IRQ meanwhile */
> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> -                        cpu->hvf->sleeping = false;
> -                        break;
> -                    }
> -
> -                    /* nanosleep returns on signal, so we wake up on kick. */
> -                    nanosleep(ts, NULL);
> -
> -                    /* Out of sleep - either naturally or because of a kick */
> -                    cpu->hvf->sleeping = false;
> +                uint64_t cval;
> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
> +                                        &cval);
> +                assert_hvf_ok(r);
> +
> +                int64_t ticks_to_sleep = cval - mach_absolute_time();


I think you touched based on it in a previous thread, but would you mind 
to explain again why mach_absolute_time() is the right thing to check 
cval against? If I read the headers correctly, the cnvt_off register 
should be 0, so cntvct should be the reference time, no?

Also, can you please split the patch into one that I can squash into my 
existing one (remove WFI handling altogether), an individual one to 
revive the global io lock (happy to squash too unless you think it's 
useful to keep separate) and then this one?


Alex
Peter Collingbourne Dec. 2, 2020, 1:32 a.m. UTC | #2
On Tue, Dec 1, 2020 at 3:24 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 01.12.20 22:00, Peter Collingbourne wrote:
> > Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> > up on IPI.
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v2:
> > - simplify locking further
> > - wait indefinitely on disabled or masked timers
> >
> >   accel/hvf/hvf-cpus.c     |   5 +-
> >   include/sysemu/hvf_int.h |   3 +-
> >   target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
> >   3 files changed, 43 insertions(+), 81 deletions(-)
> >
> > diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> > index 4360f64671..b2c8fb57f6 100644
> > --- a/accel/hvf/hvf-cpus.c
> > +++ b/accel/hvf/hvf-cpus.c
> > @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >       sigact.sa_handler = dummy_signal;
> >       sigaction(SIG_IPI, &sigact, NULL);
> >
> > -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> > -    sigdelset(&set, SIG_IPI);
> > -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> > +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> > +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >
> >   #ifdef __aarch64__
> >       r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> > diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> > index c56baa3ae8..13adf6ea77 100644
> > --- a/include/sysemu/hvf_int.h
> > +++ b/include/sysemu/hvf_int.h
> > @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >   struct hvf_vcpu_state {
> >       uint64_t fd;
> >       void *exit;
> > -    struct timespec ts;
> > -    bool sleeping;
> > +    sigset_t unblock_ipi_mask;
> >   };
> >
> >   void assert_hvf_ok(hv_return_t ret);
> > diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> > index 8fe10966d2..3321d48aa2 100644
> > --- a/target/arm/hvf/hvf.c
> > +++ b/target/arm/hvf/hvf.c
> > @@ -2,6 +2,7 @@
> >    * QEMU Hypervisor.framework support for Apple Silicon
> >
> >    * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> > + * Copyright 2020 Google LLC
> >    *
> >    * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >    * See the COPYING file in the top-level directory.
> > @@ -18,6 +19,7 @@
> >   #include "sysemu/hw_accel.h"
> >
> >   #include <Hypervisor/Hypervisor.h>
> > +#include <mach/mach_time.h>
> >
> >   #include "exec/address-spaces.h"
> >   #include "hw/irq.h"
> > @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >
> >   void hvf_kick_vcpu_thread(CPUState *cpu)
> >   {
> > -    if (cpu->hvf->sleeping) {
> > -        /*
> > -         * When sleeping, make sure we always send signals. Also, clear the
> > -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> > -         * and the nanosleep syscall still aborts the sleep.
> > -         */
> > -        cpu->thread_kicked = false;
> > -        cpu->hvf->ts = (struct timespec){ };
> > -        cpus_kick_thread(cpu);
> > -    } else {
> > -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> > -    }
> > +    cpus_kick_thread(cpu);
> > +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >   }
> >
> >   static int hvf_inject_interrupts(CPUState *cpu)
> > @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
> >       return 0;
> >   }
> >
> > +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
> > +{
> > +    /*
> > +     * Use pselect to sleep so that other threads can IPI us while we're
> > +     * sleeping.
> > +     */
> > +    qatomic_mb_set(&cpu->thread_kicked, false);
> > +    qemu_mutex_unlock_iothread();
> > +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
> > +    qemu_mutex_lock_iothread();
> > +}
> > +
> >   int hvf_vcpu_exec(CPUState *cpu)
> >   {
> >       ARMCPU *arm_cpu = ARM_CPU(cpu);
> > @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
> >       hv_return_t r;
> >       int ret = 0;
> >
> > -    qemu_mutex_unlock_iothread();
> > -
> >       do {
> >           bool advance_pc = false;
> >
> > -        qemu_mutex_lock_iothread();
> >           current_cpu = cpu;
> >           qemu_wait_io_event_common(cpu);
> > -        qemu_mutex_unlock_iothread();
> >
> >           flush_cpu_state(cpu);
> >
> > @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
> >           }
> >
> >           if (cpu->halted) {
> > -            qemu_mutex_lock_iothread();
> >               return EXCP_HLT;
> >           }
> >
> > +        qemu_mutex_unlock_iothread();
> >           assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
> >
> >           /* handle VMEXIT */
> > @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
> >           uint64_t syndrome = hvf_exit->exception.syndrome;
> >           uint32_t ec = syn_get_ec(syndrome);
> >
> > +        qemu_mutex_lock_iothread();
> >           switch (exit_reason) {
> >           case HV_EXIT_REASON_EXCEPTION:
> >               /* This is the main one, handle below. */
> >               break;
> >           case HV_EXIT_REASON_VTIMER_ACTIVATED:
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >               qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> > -            qemu_mutex_unlock_iothread();
> >               continue;
> >           case HV_EXIT_REASON_CANCELED:
> >               /* we got kicked, no exit to process */
> > @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >               uint32_t srt = (syndrome >> 16) & 0x1f;
> >               uint64_t val = 0;
> >
> > -            qemu_mutex_lock_iothread();
> >               current_cpu = cpu;
> >
> >               DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> > @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >                   hvf_set_reg(cpu, srt, val);
> >               }
> >
> > -            qemu_mutex_unlock_iothread();
> > -
> >               advance_pc = true;
> >               break;
> >           }
> > @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
> >           case EC_WFX_TRAP:
> >               if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >                   (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > -                uint64_t cval, ctl, val, diff, now;
> > +                advance_pc = true;
> >
> > -                /* Set up a local timer for vtimer if necessary ... */
> > -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> > -                assert_hvf_ok(r);
> > -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> > +                uint64_t ctl;
> > +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
> > +                                        &ctl);
> >                   assert_hvf_ok(r);
> >
> > -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> > -                diff = cval - val;
> > -
> > -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> > -                      gt_cntfrq_period_ns(arm_cpu);
> > -
> > -                /* Timer disabled or masked, just wait for long */
> >                   if (!(ctl & 1) || (ctl & 2)) {
> > -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> > -                           gt_cntfrq_period_ns(arm_cpu);
> > +                    /* Timer disabled or masked, just wait for an IPI. */
> > +                    hvf_wait_for_ipi(cpu, NULL);
> > +                    break;
> >                   }
> >
> > -                if (diff < INT64_MAX) {
> > -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> > -                    struct timespec *ts = &cpu->hvf->ts;
> > -
> > -                    *ts = (struct timespec){
> > -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> > -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> > -                    };
> > -
> > -                    /*
> > -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> > -                     * time periods than 2ms.
> > -                     */
> > -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> > -                        advance_pc = true;
> > -                        break;
> > -                    }
> > -
> > -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> > -                    cpu->hvf->sleeping = true;
> > -                    smp_mb();
> > -
> > -                    /* Bail out if we received an IRQ meanwhile */
> > -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> > -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> > -                        cpu->hvf->sleeping = false;
> > -                        break;
> > -                    }
> > -
> > -                    /* nanosleep returns on signal, so we wake up on kick. */
> > -                    nanosleep(ts, NULL);
> > -
> > -                    /* Out of sleep - either naturally or because of a kick */
> > -                    cpu->hvf->sleeping = false;
> > +                uint64_t cval;
> > +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
> > +                                        &cval);
> > +                assert_hvf_ok(r);
> > +
> > +                int64_t ticks_to_sleep = cval - mach_absolute_time();
>
>
> I think you touched based on it in a previous thread, but would you mind
> to explain again why mach_absolute_time() is the right thing to check
> cval against? If I read the headers correctly, the cnvt_off register
> should be 0, so cntvct should be the reference time, no?

In my experiments I've found that CNTPCT_EL0 and CNTVCT_EL0 are the
same when read on the host (i.e. host CNTVOFF_EL2 = 0). When we look
at the guest we see that CNTPCT_EL0 corresponds to
mach_absolute_time() on the host and not host CNTPCT_EL0 (if you look
at XNU kernel sources you will see that mach_absolute_time() reads
CNTPCT_EL0 and adds a constant corresponding to the amount of time
that the machine spends asleep) so I think that what's going on at the
hypervisor level is that guest CNTPOFF_EL2 is being set to the same
constant to make it correspond to mach_absolute_time().

> Also, can you please split the patch into one that I can squash into my
> existing one (remove WFI handling altogether), an individual one to
> revive the global io lock (happy to squash too unless you think it's
> useful to keep separate) and then this one?

Sure, I'll do that.

Peter
Alexander Graf Dec. 2, 2020, 1:39 a.m. UTC | #3
On 02.12.20 02:32, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 3:24 PM Alexander Graf <agraf@csgraf.de> wrote:
>>
>> On 01.12.20 22:00, Peter Collingbourne wrote:
>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>> up on IPI.
>>>
>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>> ---
>>> v2:
>>> - simplify locking further
>>> - wait indefinitely on disabled or masked timers
>>>
>>>    accel/hvf/hvf-cpus.c     |   5 +-
>>>    include/sysemu/hvf_int.h |   3 +-
>>>    target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
>>>    3 files changed, 43 insertions(+), 81 deletions(-)
>>>
>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>>> index 4360f64671..b2c8fb57f6 100644
>>> --- a/accel/hvf/hvf-cpus.c
>>> +++ b/accel/hvf/hvf-cpus.c
>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>>>        sigact.sa_handler = dummy_signal;
>>>        sigaction(SIG_IPI, &sigact, NULL);
>>>
>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>>> -    sigdelset(&set, SIG_IPI);
>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>>>
>>>    #ifdef __aarch64__
>>>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
>>> index c56baa3ae8..13adf6ea77 100644
>>> --- a/include/sysemu/hvf_int.h
>>> +++ b/include/sysemu/hvf_int.h
>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>>>    struct hvf_vcpu_state {
>>>        uint64_t fd;
>>>        void *exit;
>>> -    struct timespec ts;
>>> -    bool sleeping;
>>> +    sigset_t unblock_ipi_mask;
>>>    };
>>>
>>>    void assert_hvf_ok(hv_return_t ret);
>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
>>> index 8fe10966d2..3321d48aa2 100644
>>> --- a/target/arm/hvf/hvf.c
>>> +++ b/target/arm/hvf/hvf.c
>>> @@ -2,6 +2,7 @@
>>>     * QEMU Hypervisor.framework support for Apple Silicon
>>>
>>>     * Copyright 2020 Alexander Graf <agraf@csgraf.de>
>>> + * Copyright 2020 Google LLC
>>>     *
>>>     * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>>     * See the COPYING file in the top-level directory.
>>> @@ -18,6 +19,7 @@
>>>    #include "sysemu/hw_accel.h"
>>>
>>>    #include <Hypervisor/Hypervisor.h>
>>> +#include <mach/mach_time.h>
>>>
>>>    #include "exec/address-spaces.h"
>>>    #include "hw/irq.h"
>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>>>
>>>    void hvf_kick_vcpu_thread(CPUState *cpu)
>>>    {
>>> -    if (cpu->hvf->sleeping) {
>>> -        /*
>>> -         * When sleeping, make sure we always send signals. Also, clear the
>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
>>> -         * and the nanosleep syscall still aborts the sleep.
>>> -         */
>>> -        cpu->thread_kicked = false;
>>> -        cpu->hvf->ts = (struct timespec){ };
>>> -        cpus_kick_thread(cpu);
>>> -    } else {
>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
>>> -    }
>>> +    cpus_kick_thread(cpu);
>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>>>    }
>>>
>>>    static int hvf_inject_interrupts(CPUState *cpu)
>>> @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
>>>        return 0;
>>>    }
>>>
>>> +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
>>> +{
>>> +    /*
>>> +     * Use pselect to sleep so that other threads can IPI us while we're
>>> +     * sleeping.
>>> +     */
>>> +    qatomic_mb_set(&cpu->thread_kicked, false);
>>> +    qemu_mutex_unlock_iothread();
>>> +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
>>> +    qemu_mutex_lock_iothread();
>>> +}
>>> +
>>>    int hvf_vcpu_exec(CPUState *cpu)
>>>    {
>>>        ARMCPU *arm_cpu = ARM_CPU(cpu);
>>> @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>        hv_return_t r;
>>>        int ret = 0;
>>>
>>> -    qemu_mutex_unlock_iothread();
>>> -
>>>        do {
>>>            bool advance_pc = false;
>>>
>>> -        qemu_mutex_lock_iothread();
>>>            current_cpu = cpu;
>>>            qemu_wait_io_event_common(cpu);
>>> -        qemu_mutex_unlock_iothread();
>>>
>>>            flush_cpu_state(cpu);
>>>
>>> @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>            }
>>>
>>>            if (cpu->halted) {
>>> -            qemu_mutex_lock_iothread();
>>>                return EXCP_HLT;
>>>            }
>>>
>>> +        qemu_mutex_unlock_iothread();
>>>            assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
>>>
>>>            /* handle VMEXIT */
>>> @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>            uint64_t syndrome = hvf_exit->exception.syndrome;
>>>            uint32_t ec = syn_get_ec(syndrome);
>>>
>>> +        qemu_mutex_lock_iothread();
>>>            switch (exit_reason) {
>>>            case HV_EXIT_REASON_EXCEPTION:
>>>                /* This is the main one, handle below. */
>>>                break;
>>>            case HV_EXIT_REASON_VTIMER_ACTIVATED:
>>> -            qemu_mutex_lock_iothread();
>>>                current_cpu = cpu;
>>>                qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
>>> -            qemu_mutex_unlock_iothread();
>>>                continue;
>>>            case HV_EXIT_REASON_CANCELED:
>>>                /* we got kicked, no exit to process */
>>> @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>                uint32_t srt = (syndrome >> 16) & 0x1f;
>>>                uint64_t val = 0;
>>>
>>> -            qemu_mutex_lock_iothread();
>>>                current_cpu = cpu;
>>>
>>>                DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
>>> @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>                    hvf_set_reg(cpu, srt, val);
>>>                }
>>>
>>> -            qemu_mutex_unlock_iothread();
>>> -
>>>                advance_pc = true;
>>>                break;
>>>            }
>>> @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>            case EC_WFX_TRAP:
>>>                if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>>>                    (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>> -                uint64_t cval, ctl, val, diff, now;
>>> +                advance_pc = true;
>>>
>>> -                /* Set up a local timer for vtimer if necessary ... */
>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
>>> -                assert_hvf_ok(r);
>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
>>> +                uint64_t ctl;
>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
>>> +                                        &ctl);
>>>                    assert_hvf_ok(r);
>>>
>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
>>> -                diff = cval - val;
>>> -
>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
>>> -                      gt_cntfrq_period_ns(arm_cpu);
>>> -
>>> -                /* Timer disabled or masked, just wait for long */
>>>                    if (!(ctl & 1) || (ctl & 2)) {
>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
>>> -                           gt_cntfrq_period_ns(arm_cpu);
>>> +                    /* Timer disabled or masked, just wait for an IPI. */
>>> +                    hvf_wait_for_ipi(cpu, NULL);
>>> +                    break;
>>>                    }
>>>
>>> -                if (diff < INT64_MAX) {
>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
>>> -                    struct timespec *ts = &cpu->hvf->ts;
>>> -
>>> -                    *ts = (struct timespec){
>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>>> -                    };
>>> -
>>> -                    /*
>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
>>> -                     * time periods than 2ms.
>>> -                     */
>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>>> -                        advance_pc = true;
>>> -                        break;
>>> -                    }
>>> -
>>> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
>>> -                    cpu->hvf->sleeping = true;
>>> -                    smp_mb();
>>> -
>>> -                    /* Bail out if we received an IRQ meanwhile */
>>> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
>>> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>> -                        cpu->hvf->sleeping = false;
>>> -                        break;
>>> -                    }
>>> -
>>> -                    /* nanosleep returns on signal, so we wake up on kick. */
>>> -                    nanosleep(ts, NULL);
>>> -
>>> -                    /* Out of sleep - either naturally or because of a kick */
>>> -                    cpu->hvf->sleeping = false;
>>> +                uint64_t cval;
>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
>>> +                                        &cval);
>>> +                assert_hvf_ok(r);
>>> +
>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
>>
>> I think you touched based on it in a previous thread, but would you mind
>> to explain again why mach_absolute_time() is the right thing to check
>> cval against? If I read the headers correctly, the cnvt_off register
>> should be 0, so cntvct should be the reference time, no?
> In my experiments I've found that CNTPCT_EL0 and CNTVCT_EL0 are the
> same when read on the host (i.e. host CNTVOFF_EL2 = 0). When we look
> at the guest we see that CNTPCT_EL0 corresponds to
> mach_absolute_time() on the host and not host CNTPCT_EL0 (if you look
> at XNU kernel sources you will see that mach_absolute_time() reads
> CNTPCT_EL0 and adds a constant corresponding to the amount of time
> that the machine spends asleep) so I think that what's going on at the
> hypervisor level is that guest CNTPOFF_EL2 is being set to the same
> constant to make it correspond to mach_absolute_time().


Yes, I can absolutely see how it's different from CNTPCT, but it should 
be identical to CNTVCT_EL0 inside QEMU, no?


Alex
Peter Collingbourne Dec. 2, 2020, 1:52 a.m. UTC | #4
On Tue, Dec 1, 2020 at 5:39 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 02.12.20 02:32, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 3:24 PM Alexander Graf <agraf@csgraf.de> wrote:
> >>
> >> On 01.12.20 22:00, Peter Collingbourne wrote:
> >>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> >>> up on IPI.
> >>>
> >>> Signed-off-by: Peter Collingbourne <pcc@google.com>
> >>> ---
> >>> v2:
> >>> - simplify locking further
> >>> - wait indefinitely on disabled or masked timers
> >>>
> >>>    accel/hvf/hvf-cpus.c     |   5 +-
> >>>    include/sysemu/hvf_int.h |   3 +-
> >>>    target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
> >>>    3 files changed, 43 insertions(+), 81 deletions(-)
> >>>
> >>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> >>> index 4360f64671..b2c8fb57f6 100644
> >>> --- a/accel/hvf/hvf-cpus.c
> >>> +++ b/accel/hvf/hvf-cpus.c
> >>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >>>        sigact.sa_handler = dummy_signal;
> >>>        sigaction(SIG_IPI, &sigact, NULL);
> >>>
> >>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> >>> -    sigdelset(&set, SIG_IPI);
> >>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> >>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> >>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >>>
> >>>    #ifdef __aarch64__
> >>>        r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> >>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> >>> index c56baa3ae8..13adf6ea77 100644
> >>> --- a/include/sysemu/hvf_int.h
> >>> +++ b/include/sysemu/hvf_int.h
> >>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >>>    struct hvf_vcpu_state {
> >>>        uint64_t fd;
> >>>        void *exit;
> >>> -    struct timespec ts;
> >>> -    bool sleeping;
> >>> +    sigset_t unblock_ipi_mask;
> >>>    };
> >>>
> >>>    void assert_hvf_ok(hv_return_t ret);
> >>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> >>> index 8fe10966d2..3321d48aa2 100644
> >>> --- a/target/arm/hvf/hvf.c
> >>> +++ b/target/arm/hvf/hvf.c
> >>> @@ -2,6 +2,7 @@
> >>>     * QEMU Hypervisor.framework support for Apple Silicon
> >>>
> >>>     * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> >>> + * Copyright 2020 Google LLC
> >>>     *
> >>>     * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>>     * See the COPYING file in the top-level directory.
> >>> @@ -18,6 +19,7 @@
> >>>    #include "sysemu/hw_accel.h"
> >>>
> >>>    #include <Hypervisor/Hypervisor.h>
> >>> +#include <mach/mach_time.h>
> >>>
> >>>    #include "exec/address-spaces.h"
> >>>    #include "hw/irq.h"
> >>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >>>
> >>>    void hvf_kick_vcpu_thread(CPUState *cpu)
> >>>    {
> >>> -    if (cpu->hvf->sleeping) {
> >>> -        /*
> >>> -         * When sleeping, make sure we always send signals. Also, clear the
> >>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> >>> -         * and the nanosleep syscall still aborts the sleep.
> >>> -         */
> >>> -        cpu->thread_kicked = false;
> >>> -        cpu->hvf->ts = (struct timespec){ };
> >>> -        cpus_kick_thread(cpu);
> >>> -    } else {
> >>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>> -    }
> >>> +    cpus_kick_thread(cpu);
> >>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>>    }
> >>>
> >>>    static int hvf_inject_interrupts(CPUState *cpu)
> >>> @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
> >>>        return 0;
> >>>    }
> >>>
> >>> +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
> >>> +{
> >>> +    /*
> >>> +     * Use pselect to sleep so that other threads can IPI us while we're
> >>> +     * sleeping.
> >>> +     */
> >>> +    qatomic_mb_set(&cpu->thread_kicked, false);
> >>> +    qemu_mutex_unlock_iothread();
> >>> +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
> >>> +    qemu_mutex_lock_iothread();
> >>> +}
> >>> +
> >>>    int hvf_vcpu_exec(CPUState *cpu)
> >>>    {
> >>>        ARMCPU *arm_cpu = ARM_CPU(cpu);
> >>> @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>        hv_return_t r;
> >>>        int ret = 0;
> >>>
> >>> -    qemu_mutex_unlock_iothread();
> >>> -
> >>>        do {
> >>>            bool advance_pc = false;
> >>>
> >>> -        qemu_mutex_lock_iothread();
> >>>            current_cpu = cpu;
> >>>            qemu_wait_io_event_common(cpu);
> >>> -        qemu_mutex_unlock_iothread();
> >>>
> >>>            flush_cpu_state(cpu);
> >>>
> >>> @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>            }
> >>>
> >>>            if (cpu->halted) {
> >>> -            qemu_mutex_lock_iothread();
> >>>                return EXCP_HLT;
> >>>            }
> >>>
> >>> +        qemu_mutex_unlock_iothread();
> >>>            assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
> >>>
> >>>            /* handle VMEXIT */
> >>> @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>            uint64_t syndrome = hvf_exit->exception.syndrome;
> >>>            uint32_t ec = syn_get_ec(syndrome);
> >>>
> >>> +        qemu_mutex_lock_iothread();
> >>>            switch (exit_reason) {
> >>>            case HV_EXIT_REASON_EXCEPTION:
> >>>                /* This is the main one, handle below. */
> >>>                break;
> >>>            case HV_EXIT_REASON_VTIMER_ACTIVATED:
> >>> -            qemu_mutex_lock_iothread();
> >>>                current_cpu = cpu;
> >>>                qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> >>> -            qemu_mutex_unlock_iothread();
> >>>                continue;
> >>>            case HV_EXIT_REASON_CANCELED:
> >>>                /* we got kicked, no exit to process */
> >>> @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>                uint32_t srt = (syndrome >> 16) & 0x1f;
> >>>                uint64_t val = 0;
> >>>
> >>> -            qemu_mutex_lock_iothread();
> >>>                current_cpu = cpu;
> >>>
> >>>                DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> >>> @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>                    hvf_set_reg(cpu, srt, val);
> >>>                }
> >>>
> >>> -            qemu_mutex_unlock_iothread();
> >>> -
> >>>                advance_pc = true;
> >>>                break;
> >>>            }
> >>> @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>            case EC_WFX_TRAP:
> >>>                if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >>>                    (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>> -                uint64_t cval, ctl, val, diff, now;
> >>> +                advance_pc = true;
> >>>
> >>> -                /* Set up a local timer for vtimer if necessary ... */
> >>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> >>> -                assert_hvf_ok(r);
> >>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> >>> +                uint64_t ctl;
> >>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
> >>> +                                        &ctl);
> >>>                    assert_hvf_ok(r);
> >>>
> >>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> >>> -                diff = cval - val;
> >>> -
> >>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> >>> -                      gt_cntfrq_period_ns(arm_cpu);
> >>> -
> >>> -                /* Timer disabled or masked, just wait for long */
> >>>                    if (!(ctl & 1) || (ctl & 2)) {
> >>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> >>> -                           gt_cntfrq_period_ns(arm_cpu);
> >>> +                    /* Timer disabled or masked, just wait for an IPI. */
> >>> +                    hvf_wait_for_ipi(cpu, NULL);
> >>> +                    break;
> >>>                    }
> >>>
> >>> -                if (diff < INT64_MAX) {
> >>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> >>> -                    struct timespec *ts = &cpu->hvf->ts;
> >>> -
> >>> -                    *ts = (struct timespec){
> >>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> >>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> >>> -                    };
> >>> -
> >>> -                    /*
> >>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> >>> -                     * time periods than 2ms.
> >>> -                     */
> >>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >>> -                        advance_pc = true;
> >>> -                        break;
> >>> -                    }
> >>> -
> >>> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> >>> -                    cpu->hvf->sleeping = true;
> >>> -                    smp_mb();
> >>> -
> >>> -                    /* Bail out if we received an IRQ meanwhile */
> >>> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> >>> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>> -                        cpu->hvf->sleeping = false;
> >>> -                        break;
> >>> -                    }
> >>> -
> >>> -                    /* nanosleep returns on signal, so we wake up on kick. */
> >>> -                    nanosleep(ts, NULL);
> >>> -
> >>> -                    /* Out of sleep - either naturally or because of a kick */
> >>> -                    cpu->hvf->sleeping = false;
> >>> +                uint64_t cval;
> >>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
> >>> +                                        &cval);
> >>> +                assert_hvf_ok(r);
> >>> +
> >>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> >>
> >> I think you touched based on it in a previous thread, but would you mind
> >> to explain again why mach_absolute_time() is the right thing to check
> >> cval against? If I read the headers correctly, the cnvt_off register
> >> should be 0, so cntvct should be the reference time, no?
> > In my experiments I've found that CNTPCT_EL0 and CNTVCT_EL0 are the
> > same when read on the host (i.e. host CNTVOFF_EL2 = 0). When we look
> > at the guest we see that CNTPCT_EL0 corresponds to
> > mach_absolute_time() on the host and not host CNTPCT_EL0 (if you look
> > at XNU kernel sources you will see that mach_absolute_time() reads
> > CNTPCT_EL0 and adds a constant corresponding to the amount of time
> > that the machine spends asleep) so I think that what's going on at the
> > hypervisor level is that guest CNTPOFF_EL2 is being set to the same
> > constant to make it correspond to mach_absolute_time().
>
>
> Yes, I can absolutely see how it's different from CNTPCT, but it should
> be identical to CNTVCT_EL0 inside QEMU, no?

Not necessarily. It's possible for the guest to observe a different
CNTVCT_EL0 either because CNTPOFF_EL2 is set to a non-zero value or
because the hypervisor is secretly adding the "time spent asleep"
constant to CNTVOFF_EL2 on top of the value that we set it to via the
API.

Peter
Alexander Graf Dec. 2, 2020, 1:54 a.m. UTC | #5
On 02.12.20 02:52, Peter Collingbourne wrote:
> On Tue, Dec 1, 2020 at 5:39 PM Alexander Graf <agraf@csgraf.de> wrote:
>>
>> On 02.12.20 02:32, Peter Collingbourne wrote:
>>> On Tue, Dec 1, 2020 at 3:24 PM Alexander Graf <agraf@csgraf.de> wrote:
>>>> On 01.12.20 22:00, Peter Collingbourne wrote:
>>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
>>>>> up on IPI.
>>>>>
>>>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
>>>>> ---
>>>>> v2:
>>>>> - simplify locking further
>>>>> - wait indefinitely on disabled or masked timers
>>>>>
>>>>>     accel/hvf/hvf-cpus.c     |   5 +-
>>>>>     include/sysemu/hvf_int.h |   3 +-
>>>>>     target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
>>>>>     3 files changed, 43 insertions(+), 81 deletions(-)
>>>>>
>>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
>>>>> index 4360f64671..b2c8fb57f6 100644
>>>>> --- a/accel/hvf/hvf-cpus.c
>>>>> +++ b/accel/hvf/hvf-cpus.c
>>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
>>>>>         sigact.sa_handler = dummy_signal;
>>>>>         sigaction(SIG_IPI, &sigact, NULL);
>>>>>
>>>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
>>>>> -    sigdelset(&set, SIG_IPI);
>>>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
>>>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
>>>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
>>>>>
>>>>>     #ifdef __aarch64__
>>>>>         r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
>>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
>>>>> index c56baa3ae8..13adf6ea77 100644
>>>>> --- a/include/sysemu/hvf_int.h
>>>>> +++ b/include/sysemu/hvf_int.h
>>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
>>>>>     struct hvf_vcpu_state {
>>>>>         uint64_t fd;
>>>>>         void *exit;
>>>>> -    struct timespec ts;
>>>>> -    bool sleeping;
>>>>> +    sigset_t unblock_ipi_mask;
>>>>>     };
>>>>>
>>>>>     void assert_hvf_ok(hv_return_t ret);
>>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
>>>>> index 8fe10966d2..3321d48aa2 100644
>>>>> --- a/target/arm/hvf/hvf.c
>>>>> +++ b/target/arm/hvf/hvf.c
>>>>> @@ -2,6 +2,7 @@
>>>>>      * QEMU Hypervisor.framework support for Apple Silicon
>>>>>
>>>>>      * Copyright 2020 Alexander Graf <agraf@csgraf.de>
>>>>> + * Copyright 2020 Google LLC
>>>>>      *
>>>>>      * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>>>>      * See the COPYING file in the top-level directory.
>>>>> @@ -18,6 +19,7 @@
>>>>>     #include "sysemu/hw_accel.h"
>>>>>
>>>>>     #include <Hypervisor/Hypervisor.h>
>>>>> +#include <mach/mach_time.h>
>>>>>
>>>>>     #include "exec/address-spaces.h"
>>>>>     #include "hw/irq.h"
>>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
>>>>>
>>>>>     void hvf_kick_vcpu_thread(CPUState *cpu)
>>>>>     {
>>>>> -    if (cpu->hvf->sleeping) {
>>>>> -        /*
>>>>> -         * When sleeping, make sure we always send signals. Also, clear the
>>>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
>>>>> -         * and the nanosleep syscall still aborts the sleep.
>>>>> -         */
>>>>> -        cpu->thread_kicked = false;
>>>>> -        cpu->hvf->ts = (struct timespec){ };
>>>>> -        cpus_kick_thread(cpu);
>>>>> -    } else {
>>>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
>>>>> -    }
>>>>> +    cpus_kick_thread(cpu);
>>>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
>>>>>     }
>>>>>
>>>>>     static int hvf_inject_interrupts(CPUState *cpu)
>>>>> @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
>>>>>         return 0;
>>>>>     }
>>>>>
>>>>> +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
>>>>> +{
>>>>> +    /*
>>>>> +     * Use pselect to sleep so that other threads can IPI us while we're
>>>>> +     * sleeping.
>>>>> +     */
>>>>> +    qatomic_mb_set(&cpu->thread_kicked, false);
>>>>> +    qemu_mutex_unlock_iothread();
>>>>> +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
>>>>> +    qemu_mutex_lock_iothread();
>>>>> +}
>>>>> +
>>>>>     int hvf_vcpu_exec(CPUState *cpu)
>>>>>     {
>>>>>         ARMCPU *arm_cpu = ARM_CPU(cpu);
>>>>> @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>         hv_return_t r;
>>>>>         int ret = 0;
>>>>>
>>>>> -    qemu_mutex_unlock_iothread();
>>>>> -
>>>>>         do {
>>>>>             bool advance_pc = false;
>>>>>
>>>>> -        qemu_mutex_lock_iothread();
>>>>>             current_cpu = cpu;
>>>>>             qemu_wait_io_event_common(cpu);
>>>>> -        qemu_mutex_unlock_iothread();
>>>>>
>>>>>             flush_cpu_state(cpu);
>>>>>
>>>>> @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>             }
>>>>>
>>>>>             if (cpu->halted) {
>>>>> -            qemu_mutex_lock_iothread();
>>>>>                 return EXCP_HLT;
>>>>>             }
>>>>>
>>>>> +        qemu_mutex_unlock_iothread();
>>>>>             assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
>>>>>
>>>>>             /* handle VMEXIT */
>>>>> @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>             uint64_t syndrome = hvf_exit->exception.syndrome;
>>>>>             uint32_t ec = syn_get_ec(syndrome);
>>>>>
>>>>> +        qemu_mutex_lock_iothread();
>>>>>             switch (exit_reason) {
>>>>>             case HV_EXIT_REASON_EXCEPTION:
>>>>>                 /* This is the main one, handle below. */
>>>>>                 break;
>>>>>             case HV_EXIT_REASON_VTIMER_ACTIVATED:
>>>>> -            qemu_mutex_lock_iothread();
>>>>>                 current_cpu = cpu;
>>>>>                 qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
>>>>> -            qemu_mutex_unlock_iothread();
>>>>>                 continue;
>>>>>             case HV_EXIT_REASON_CANCELED:
>>>>>                 /* we got kicked, no exit to process */
>>>>> @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>                 uint32_t srt = (syndrome >> 16) & 0x1f;
>>>>>                 uint64_t val = 0;
>>>>>
>>>>> -            qemu_mutex_lock_iothread();
>>>>>                 current_cpu = cpu;
>>>>>
>>>>>                 DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
>>>>> @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>                     hvf_set_reg(cpu, srt, val);
>>>>>                 }
>>>>>
>>>>> -            qemu_mutex_unlock_iothread();
>>>>> -
>>>>>                 advance_pc = true;
>>>>>                 break;
>>>>>             }
>>>>> @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
>>>>>             case EC_WFX_TRAP:
>>>>>                 if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
>>>>>                     (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>>>> -                uint64_t cval, ctl, val, diff, now;
>>>>> +                advance_pc = true;
>>>>>
>>>>> -                /* Set up a local timer for vtimer if necessary ... */
>>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
>>>>> -                assert_hvf_ok(r);
>>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
>>>>> +                uint64_t ctl;
>>>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
>>>>> +                                        &ctl);
>>>>>                     assert_hvf_ok(r);
>>>>>
>>>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
>>>>> -                diff = cval - val;
>>>>> -
>>>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
>>>>> -                      gt_cntfrq_period_ns(arm_cpu);
>>>>> -
>>>>> -                /* Timer disabled or masked, just wait for long */
>>>>>                     if (!(ctl & 1) || (ctl & 2)) {
>>>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
>>>>> -                           gt_cntfrq_period_ns(arm_cpu);
>>>>> +                    /* Timer disabled or masked, just wait for an IPI. */
>>>>> +                    hvf_wait_for_ipi(cpu, NULL);
>>>>> +                    break;
>>>>>                     }
>>>>>
>>>>> -                if (diff < INT64_MAX) {
>>>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
>>>>> -                    struct timespec *ts = &cpu->hvf->ts;
>>>>> -
>>>>> -                    *ts = (struct timespec){
>>>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
>>>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
>>>>> -                    };
>>>>> -
>>>>> -                    /*
>>>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
>>>>> -                     * time periods than 2ms.
>>>>> -                     */
>>>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
>>>>> -                        advance_pc = true;
>>>>> -                        break;
>>>>> -                    }
>>>>> -
>>>>> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
>>>>> -                    cpu->hvf->sleeping = true;
>>>>> -                    smp_mb();
>>>>> -
>>>>> -                    /* Bail out if we received an IRQ meanwhile */
>>>>> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
>>>>> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
>>>>> -                        cpu->hvf->sleeping = false;
>>>>> -                        break;
>>>>> -                    }
>>>>> -
>>>>> -                    /* nanosleep returns on signal, so we wake up on kick. */
>>>>> -                    nanosleep(ts, NULL);
>>>>> -
>>>>> -                    /* Out of sleep - either naturally or because of a kick */
>>>>> -                    cpu->hvf->sleeping = false;
>>>>> +                uint64_t cval;
>>>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
>>>>> +                                        &cval);
>>>>> +                assert_hvf_ok(r);
>>>>> +
>>>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
>>>> I think you touched based on it in a previous thread, but would you mind
>>>> to explain again why mach_absolute_time() is the right thing to check
>>>> cval against? If I read the headers correctly, the cnvt_off register
>>>> should be 0, so cntvct should be the reference time, no?
>>> In my experiments I've found that CNTPCT_EL0 and CNTVCT_EL0 are the
>>> same when read on the host (i.e. host CNTVOFF_EL2 = 0). When we look
>>> at the guest we see that CNTPCT_EL0 corresponds to
>>> mach_absolute_time() on the host and not host CNTPCT_EL0 (if you look
>>> at XNU kernel sources you will see that mach_absolute_time() reads
>>> CNTPCT_EL0 and adds a constant corresponding to the amount of time
>>> that the machine spends asleep) so I think that what's going on at the
>>> hypervisor level is that guest CNTPOFF_EL2 is being set to the same
>>> constant to make it correspond to mach_absolute_time().
>>
>> Yes, I can absolutely see how it's different from CNTPCT, but it should
>> be identical to CNTVCT_EL0 inside QEMU, no?
> Not necessarily. It's possible for the guest to observe a different
> CNTVCT_EL0 either because CNTPOFF_EL2 is set to a non-zero value or
> because the hypervisor is secretly adding the "time spent asleep"
> constant to CNTVOFF_EL2 on top of the value that we set it to via the
> API.


Yes, it could be doing the same thing with mach_absolute_time(). What 
I'm asking is whether calling that over just pulling CNTVCT directly 
gives you any benefit.


Alex
Peter Collingbourne Dec. 2, 2020, 1:57 a.m. UTC | #6
On Tue, Dec 1, 2020 at 5:54 PM Alexander Graf <agraf@csgraf.de> wrote:
>
>
> On 02.12.20 02:52, Peter Collingbourne wrote:
> > On Tue, Dec 1, 2020 at 5:39 PM Alexander Graf <agraf@csgraf.de> wrote:
> >>
> >> On 02.12.20 02:32, Peter Collingbourne wrote:
> >>> On Tue, Dec 1, 2020 at 3:24 PM Alexander Graf <agraf@csgraf.de> wrote:
> >>>> On 01.12.20 22:00, Peter Collingbourne wrote:
> >>>>> Sleep on WFx until the VTIMER is due but allow ourselves to be woken
> >>>>> up on IPI.
> >>>>>
> >>>>> Signed-off-by: Peter Collingbourne <pcc@google.com>
> >>>>> ---
> >>>>> v2:
> >>>>> - simplify locking further
> >>>>> - wait indefinitely on disabled or masked timers
> >>>>>
> >>>>>     accel/hvf/hvf-cpus.c     |   5 +-
> >>>>>     include/sysemu/hvf_int.h |   3 +-
> >>>>>     target/arm/hvf/hvf.c     | 116 ++++++++++++++-------------------------
> >>>>>     3 files changed, 43 insertions(+), 81 deletions(-)
> >>>>>
> >>>>> diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
> >>>>> index 4360f64671..b2c8fb57f6 100644
> >>>>> --- a/accel/hvf/hvf-cpus.c
> >>>>> +++ b/accel/hvf/hvf-cpus.c
> >>>>> @@ -344,9 +344,8 @@ static int hvf_init_vcpu(CPUState *cpu)
> >>>>>         sigact.sa_handler = dummy_signal;
> >>>>>         sigaction(SIG_IPI, &sigact, NULL);
> >>>>>
> >>>>> -    pthread_sigmask(SIG_BLOCK, NULL, &set);
> >>>>> -    sigdelset(&set, SIG_IPI);
> >>>>> -    pthread_sigmask(SIG_SETMASK, &set, NULL);
> >>>>> +    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
> >>>>> +    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
> >>>>>
> >>>>>     #ifdef __aarch64__
> >>>>>         r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
> >>>>> diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
> >>>>> index c56baa3ae8..13adf6ea77 100644
> >>>>> --- a/include/sysemu/hvf_int.h
> >>>>> +++ b/include/sysemu/hvf_int.h
> >>>>> @@ -62,8 +62,7 @@ extern HVFState *hvf_state;
> >>>>>     struct hvf_vcpu_state {
> >>>>>         uint64_t fd;
> >>>>>         void *exit;
> >>>>> -    struct timespec ts;
> >>>>> -    bool sleeping;
> >>>>> +    sigset_t unblock_ipi_mask;
> >>>>>     };
> >>>>>
> >>>>>     void assert_hvf_ok(hv_return_t ret);
> >>>>> diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
> >>>>> index 8fe10966d2..3321d48aa2 100644
> >>>>> --- a/target/arm/hvf/hvf.c
> >>>>> +++ b/target/arm/hvf/hvf.c
> >>>>> @@ -2,6 +2,7 @@
> >>>>>      * QEMU Hypervisor.framework support for Apple Silicon
> >>>>>
> >>>>>      * Copyright 2020 Alexander Graf <agraf@csgraf.de>
> >>>>> + * Copyright 2020 Google LLC
> >>>>>      *
> >>>>>      * This work is licensed under the terms of the GNU GPL, version 2 or later.
> >>>>>      * See the COPYING file in the top-level directory.
> >>>>> @@ -18,6 +19,7 @@
> >>>>>     #include "sysemu/hw_accel.h"
> >>>>>
> >>>>>     #include <Hypervisor/Hypervisor.h>
> >>>>> +#include <mach/mach_time.h>
> >>>>>
> >>>>>     #include "exec/address-spaces.h"
> >>>>>     #include "hw/irq.h"
> >>>>> @@ -320,18 +322,8 @@ int hvf_arch_init_vcpu(CPUState *cpu)
> >>>>>
> >>>>>     void hvf_kick_vcpu_thread(CPUState *cpu)
> >>>>>     {
> >>>>> -    if (cpu->hvf->sleeping) {
> >>>>> -        /*
> >>>>> -         * When sleeping, make sure we always send signals. Also, clear the
> >>>>> -         * timespec, so that an IPI that arrives between setting hvf->sleeping
> >>>>> -         * and the nanosleep syscall still aborts the sleep.
> >>>>> -         */
> >>>>> -        cpu->thread_kicked = false;
> >>>>> -        cpu->hvf->ts = (struct timespec){ };
> >>>>> -        cpus_kick_thread(cpu);
> >>>>> -    } else {
> >>>>> -        hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>>>> -    }
> >>>>> +    cpus_kick_thread(cpu);
> >>>>> +    hv_vcpus_exit(&cpu->hvf->fd, 1);
> >>>>>     }
> >>>>>
> >>>>>     static int hvf_inject_interrupts(CPUState *cpu)
> >>>>> @@ -349,6 +341,18 @@ static int hvf_inject_interrupts(CPUState *cpu)
> >>>>>         return 0;
> >>>>>     }
> >>>>>
> >>>>> +static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
> >>>>> +{
> >>>>> +    /*
> >>>>> +     * Use pselect to sleep so that other threads can IPI us while we're
> >>>>> +     * sleeping.
> >>>>> +     */
> >>>>> +    qatomic_mb_set(&cpu->thread_kicked, false);
> >>>>> +    qemu_mutex_unlock_iothread();
> >>>>> +    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
> >>>>> +    qemu_mutex_lock_iothread();
> >>>>> +}
> >>>>> +
> >>>>>     int hvf_vcpu_exec(CPUState *cpu)
> >>>>>     {
> >>>>>         ARMCPU *arm_cpu = ARM_CPU(cpu);
> >>>>> @@ -357,15 +361,11 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>         hv_return_t r;
> >>>>>         int ret = 0;
> >>>>>
> >>>>> -    qemu_mutex_unlock_iothread();
> >>>>> -
> >>>>>         do {
> >>>>>             bool advance_pc = false;
> >>>>>
> >>>>> -        qemu_mutex_lock_iothread();
> >>>>>             current_cpu = cpu;
> >>>>>             qemu_wait_io_event_common(cpu);
> >>>>> -        qemu_mutex_unlock_iothread();
> >>>>>
> >>>>>             flush_cpu_state(cpu);
> >>>>>
> >>>>> @@ -374,10 +374,10 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>             }
> >>>>>
> >>>>>             if (cpu->halted) {
> >>>>> -            qemu_mutex_lock_iothread();
> >>>>>                 return EXCP_HLT;
> >>>>>             }
> >>>>>
> >>>>> +        qemu_mutex_unlock_iothread();
> >>>>>             assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
> >>>>>
> >>>>>             /* handle VMEXIT */
> >>>>> @@ -385,15 +385,14 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>             uint64_t syndrome = hvf_exit->exception.syndrome;
> >>>>>             uint32_t ec = syn_get_ec(syndrome);
> >>>>>
> >>>>> +        qemu_mutex_lock_iothread();
> >>>>>             switch (exit_reason) {
> >>>>>             case HV_EXIT_REASON_EXCEPTION:
> >>>>>                 /* This is the main one, handle below. */
> >>>>>                 break;
> >>>>>             case HV_EXIT_REASON_VTIMER_ACTIVATED:
> >>>>> -            qemu_mutex_lock_iothread();
> >>>>>                 current_cpu = cpu;
> >>>>>                 qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
> >>>>> -            qemu_mutex_unlock_iothread();
> >>>>>                 continue;
> >>>>>             case HV_EXIT_REASON_CANCELED:
> >>>>>                 /* we got kicked, no exit to process */
> >>>>> @@ -413,7 +412,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>                 uint32_t srt = (syndrome >> 16) & 0x1f;
> >>>>>                 uint64_t val = 0;
> >>>>>
> >>>>> -            qemu_mutex_lock_iothread();
> >>>>>                 current_cpu = cpu;
> >>>>>
> >>>>>                 DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
> >>>>> @@ -446,8 +444,6 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>                     hvf_set_reg(cpu, srt, val);
> >>>>>                 }
> >>>>>
> >>>>> -            qemu_mutex_unlock_iothread();
> >>>>> -
> >>>>>                 advance_pc = true;
> >>>>>                 break;
> >>>>>             }
> >>>>> @@ -493,68 +489,40 @@ int hvf_vcpu_exec(CPUState *cpu)
> >>>>>             case EC_WFX_TRAP:
> >>>>>                 if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
> >>>>>                     (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>>>> -                uint64_t cval, ctl, val, diff, now;
> >>>>> +                advance_pc = true;
> >>>>>
> >>>>> -                /* Set up a local timer for vtimer if necessary ... */
> >>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
> >>>>> -                assert_hvf_ok(r);
> >>>>> -                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
> >>>>> +                uint64_t ctl;
> >>>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
> >>>>> +                                        &ctl);
> >>>>>                     assert_hvf_ok(r);
> >>>>>
> >>>>> -                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
> >>>>> -                diff = cval - val;
> >>>>> -
> >>>>> -                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
> >>>>> -                      gt_cntfrq_period_ns(arm_cpu);
> >>>>> -
> >>>>> -                /* Timer disabled or masked, just wait for long */
> >>>>>                     if (!(ctl & 1) || (ctl & 2)) {
> >>>>> -                    diff = (120 * NANOSECONDS_PER_SECOND) /
> >>>>> -                           gt_cntfrq_period_ns(arm_cpu);
> >>>>> +                    /* Timer disabled or masked, just wait for an IPI. */
> >>>>> +                    hvf_wait_for_ipi(cpu, NULL);
> >>>>> +                    break;
> >>>>>                     }
> >>>>>
> >>>>> -                if (diff < INT64_MAX) {
> >>>>> -                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
> >>>>> -                    struct timespec *ts = &cpu->hvf->ts;
> >>>>> -
> >>>>> -                    *ts = (struct timespec){
> >>>>> -                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
> >>>>> -                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
> >>>>> -                    };
> >>>>> -
> >>>>> -                    /*
> >>>>> -                     * Waking up easily takes 1ms, don't go to sleep for smaller
> >>>>> -                     * time periods than 2ms.
> >>>>> -                     */
> >>>>> -                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
> >>>>> -                        advance_pc = true;
> >>>>> -                        break;
> >>>>> -                    }
> >>>>> -
> >>>>> -                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
> >>>>> -                    cpu->hvf->sleeping = true;
> >>>>> -                    smp_mb();
> >>>>> -
> >>>>> -                    /* Bail out if we received an IRQ meanwhile */
> >>>>> -                    if (cpu->thread_kicked || (cpu->interrupt_request &
> >>>>> -                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
> >>>>> -                        cpu->hvf->sleeping = false;
> >>>>> -                        break;
> >>>>> -                    }
> >>>>> -
> >>>>> -                    /* nanosleep returns on signal, so we wake up on kick. */
> >>>>> -                    nanosleep(ts, NULL);
> >>>>> -
> >>>>> -                    /* Out of sleep - either naturally or because of a kick */
> >>>>> -                    cpu->hvf->sleeping = false;
> >>>>> +                uint64_t cval;
> >>>>> +                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
> >>>>> +                                        &cval);
> >>>>> +                assert_hvf_ok(r);
> >>>>> +
> >>>>> +                int64_t ticks_to_sleep = cval - mach_absolute_time();
> >>>> I think you touched based on it in a previous thread, but would you mind
> >>>> to explain again why mach_absolute_time() is the right thing to check
> >>>> cval against? If I read the headers correctly, the cnvt_off register
> >>>> should be 0, so cntvct should be the reference time, no?
> >>> In my experiments I've found that CNTPCT_EL0 and CNTVCT_EL0 are the
> >>> same when read on the host (i.e. host CNTVOFF_EL2 = 0). When we look
> >>> at the guest we see that CNTPCT_EL0 corresponds to
> >>> mach_absolute_time() on the host and not host CNTPCT_EL0 (if you look
> >>> at XNU kernel sources you will see that mach_absolute_time() reads
> >>> CNTPCT_EL0 and adds a constant corresponding to the amount of time
> >>> that the machine spends asleep) so I think that what's going on at the
> >>> hypervisor level is that guest CNTPOFF_EL2 is being set to the same
> >>> constant to make it correspond to mach_absolute_time().
> >>
> >> Yes, I can absolutely see how it's different from CNTPCT, but it should
> >> be identical to CNTVCT_EL0 inside QEMU, no?
> > Not necessarily. It's possible for the guest to observe a different
> > CNTVCT_EL0 either because CNTPOFF_EL2 is set to a non-zero value or
> > because the hypervisor is secretly adding the "time spent asleep"
> > constant to CNTVOFF_EL2 on top of the value that we set it to via the
> > API.
>
>
> Yes, it could be doing the same thing with mach_absolute_time(). What
> I'm asking is whether calling that over just pulling CNTVCT directly
> gives you any benefit.

If you run this program you will see that mach_absolute_time() may
give you a different result to CNTVCT.

pcc@pac-mini ~> cat time.c
#include <stdio.h>
#include <mach/mach_time.h>

int main() {
  uint64_t cntpct, cntvct;
  __asm__ __volatile__("mrs %0, cntpct_el0" : "=r"(cntpct));
  __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(cntvct));

  printf("cntpct = %lx cntvct = %lx mat = %lx\n", cntpct, cntvct,
mach_absolute_time());
}
pcc@pac-mini ~> ./time
cntpct = d714e2217d0 cntvct = d714e2217d0 mat = 9259a829290

Peter
diff mbox series

Patch

diff --git a/accel/hvf/hvf-cpus.c b/accel/hvf/hvf-cpus.c
index 4360f64671..b2c8fb57f6 100644
--- a/accel/hvf/hvf-cpus.c
+++ b/accel/hvf/hvf-cpus.c
@@ -344,9 +344,8 @@  static int hvf_init_vcpu(CPUState *cpu)
     sigact.sa_handler = dummy_signal;
     sigaction(SIG_IPI, &sigact, NULL);
 
-    pthread_sigmask(SIG_BLOCK, NULL, &set);
-    sigdelset(&set, SIG_IPI);
-    pthread_sigmask(SIG_SETMASK, &set, NULL);
+    pthread_sigmask(SIG_BLOCK, NULL, &cpu->hvf->unblock_ipi_mask);
+    sigdelset(&cpu->hvf->unblock_ipi_mask, SIG_IPI);
 
 #ifdef __aarch64__
     r = hv_vcpu_create(&cpu->hvf->fd, (hv_vcpu_exit_t **)&cpu->hvf->exit, NULL);
diff --git a/include/sysemu/hvf_int.h b/include/sysemu/hvf_int.h
index c56baa3ae8..13adf6ea77 100644
--- a/include/sysemu/hvf_int.h
+++ b/include/sysemu/hvf_int.h
@@ -62,8 +62,7 @@  extern HVFState *hvf_state;
 struct hvf_vcpu_state {
     uint64_t fd;
     void *exit;
-    struct timespec ts;
-    bool sleeping;
+    sigset_t unblock_ipi_mask;
 };
 
 void assert_hvf_ok(hv_return_t ret);
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 8fe10966d2..3321d48aa2 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -2,6 +2,7 @@ 
  * QEMU Hypervisor.framework support for Apple Silicon
 
  * Copyright 2020 Alexander Graf <agraf@csgraf.de>
+ * Copyright 2020 Google LLC
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -18,6 +19,7 @@ 
 #include "sysemu/hw_accel.h"
 
 #include <Hypervisor/Hypervisor.h>
+#include <mach/mach_time.h>
 
 #include "exec/address-spaces.h"
 #include "hw/irq.h"
@@ -320,18 +322,8 @@  int hvf_arch_init_vcpu(CPUState *cpu)
 
 void hvf_kick_vcpu_thread(CPUState *cpu)
 {
-    if (cpu->hvf->sleeping) {
-        /*
-         * When sleeping, make sure we always send signals. Also, clear the
-         * timespec, so that an IPI that arrives between setting hvf->sleeping
-         * and the nanosleep syscall still aborts the sleep.
-         */
-        cpu->thread_kicked = false;
-        cpu->hvf->ts = (struct timespec){ };
-        cpus_kick_thread(cpu);
-    } else {
-        hv_vcpus_exit(&cpu->hvf->fd, 1);
-    }
+    cpus_kick_thread(cpu);
+    hv_vcpus_exit(&cpu->hvf->fd, 1);
 }
 
 static int hvf_inject_interrupts(CPUState *cpu)
@@ -349,6 +341,18 @@  static int hvf_inject_interrupts(CPUState *cpu)
     return 0;
 }
 
+static void hvf_wait_for_ipi(CPUState *cpu, struct timespec *ts)
+{
+    /*
+     * Use pselect to sleep so that other threads can IPI us while we're
+     * sleeping.
+     */
+    qatomic_mb_set(&cpu->thread_kicked, false);
+    qemu_mutex_unlock_iothread();
+    pselect(0, 0, 0, 0, ts, &cpu->hvf->unblock_ipi_mask);
+    qemu_mutex_lock_iothread();
+}
+
 int hvf_vcpu_exec(CPUState *cpu)
 {
     ARMCPU *arm_cpu = ARM_CPU(cpu);
@@ -357,15 +361,11 @@  int hvf_vcpu_exec(CPUState *cpu)
     hv_return_t r;
     int ret = 0;
 
-    qemu_mutex_unlock_iothread();
-
     do {
         bool advance_pc = false;
 
-        qemu_mutex_lock_iothread();
         current_cpu = cpu;
         qemu_wait_io_event_common(cpu);
-        qemu_mutex_unlock_iothread();
 
         flush_cpu_state(cpu);
 
@@ -374,10 +374,10 @@  int hvf_vcpu_exec(CPUState *cpu)
         }
 
         if (cpu->halted) {
-            qemu_mutex_lock_iothread();
             return EXCP_HLT;
         }
 
+        qemu_mutex_unlock_iothread();
         assert_hvf_ok(hv_vcpu_run(cpu->hvf->fd));
 
         /* handle VMEXIT */
@@ -385,15 +385,14 @@  int hvf_vcpu_exec(CPUState *cpu)
         uint64_t syndrome = hvf_exit->exception.syndrome;
         uint32_t ec = syn_get_ec(syndrome);
 
+        qemu_mutex_lock_iothread();
         switch (exit_reason) {
         case HV_EXIT_REASON_EXCEPTION:
             /* This is the main one, handle below. */
             break;
         case HV_EXIT_REASON_VTIMER_ACTIVATED:
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             qemu_set_irq(arm_cpu->gt_timer_outputs[GTIMER_VIRT], 1);
-            qemu_mutex_unlock_iothread();
             continue;
         case HV_EXIT_REASON_CANCELED:
             /* we got kicked, no exit to process */
@@ -413,7 +412,6 @@  int hvf_vcpu_exec(CPUState *cpu)
             uint32_t srt = (syndrome >> 16) & 0x1f;
             uint64_t val = 0;
 
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
 
             DPRINTF("data abort: [pc=0x%llx va=0x%016llx pa=0x%016llx isv=%x "
@@ -446,8 +444,6 @@  int hvf_vcpu_exec(CPUState *cpu)
                 hvf_set_reg(cpu, srt, val);
             }
 
-            qemu_mutex_unlock_iothread();
-
             advance_pc = true;
             break;
         }
@@ -493,68 +489,40 @@  int hvf_vcpu_exec(CPUState *cpu)
         case EC_WFX_TRAP:
             if (!(syndrome & WFX_IS_WFE) && !(cpu->interrupt_request &
                 (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                uint64_t cval, ctl, val, diff, now;
+                advance_pc = true;
 
-                /* Set up a local timer for vtimer if necessary ... */
-                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0, &ctl);
-                assert_hvf_ok(r);
-                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0, &cval);
+                uint64_t ctl;
+                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CTL_EL0,
+                                        &ctl);
                 assert_hvf_ok(r);
 
-                asm volatile("mrs %0, cntvct_el0" : "=r"(val));
-                diff = cval - val;
-
-                now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
-                      gt_cntfrq_period_ns(arm_cpu);
-
-                /* Timer disabled or masked, just wait for long */
                 if (!(ctl & 1) || (ctl & 2)) {
-                    diff = (120 * NANOSECONDS_PER_SECOND) /
-                           gt_cntfrq_period_ns(arm_cpu);
+                    /* Timer disabled or masked, just wait for an IPI. */
+                    hvf_wait_for_ipi(cpu, NULL);
+                    break;
                 }
 
-                if (diff < INT64_MAX) {
-                    uint64_t ns = diff * gt_cntfrq_period_ns(arm_cpu);
-                    struct timespec *ts = &cpu->hvf->ts;
-
-                    *ts = (struct timespec){
-                        .tv_sec = ns / NANOSECONDS_PER_SECOND,
-                        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
-                    };
-
-                    /*
-                     * Waking up easily takes 1ms, don't go to sleep for smaller
-                     * time periods than 2ms.
-                     */
-                    if (!ts->tv_sec && (ts->tv_nsec < (SCALE_MS * 2))) {
-                        advance_pc = true;
-                        break;
-                    }
-
-                    /* Set cpu->hvf->sleeping so that we get a SIG_IPI signal. */
-                    cpu->hvf->sleeping = true;
-                    smp_mb();
-
-                    /* Bail out if we received an IRQ meanwhile */
-                    if (cpu->thread_kicked || (cpu->interrupt_request &
-                        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ))) {
-                        cpu->hvf->sleeping = false;
-                        break;
-                    }
-
-                    /* nanosleep returns on signal, so we wake up on kick. */
-                    nanosleep(ts, NULL);
-
-                    /* Out of sleep - either naturally or because of a kick */
-                    cpu->hvf->sleeping = false;
+                uint64_t cval;
+                r = hv_vcpu_get_sys_reg(cpu->hvf->fd, HV_SYS_REG_CNTV_CVAL_EL0,
+                                        &cval);
+                assert_hvf_ok(r);
+
+                int64_t ticks_to_sleep = cval - mach_absolute_time();
+                if (ticks_to_sleep < 0) {
+                    break;
                 }
 
-                advance_pc = true;
+                uint64_t seconds = ticks_to_sleep / arm_cpu->gt_cntfrq_hz;
+                uint64_t nanos =
+                    (ticks_to_sleep - arm_cpu->gt_cntfrq_hz * seconds) *
+                    1000000000 / arm_cpu->gt_cntfrq_hz;
+                struct timespec ts = { seconds, nanos };
+
+                hvf_wait_for_ipi(cpu, &ts);
             }
             break;
         case EC_AA64_HVC:
             cpu_synchronize_state(cpu);
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             if (arm_is_psci_call(arm_cpu, EXCP_HVC)) {
                 arm_handle_psci_call(arm_cpu);
@@ -562,11 +530,9 @@  int hvf_vcpu_exec(CPUState *cpu)
                 DPRINTF("unknown HVC! %016llx", env->xregs[0]);
                 env->xregs[0] = -1;
             }
-            qemu_mutex_unlock_iothread();
             break;
         case EC_AA64_SMC:
             cpu_synchronize_state(cpu);
-            qemu_mutex_lock_iothread();
             current_cpu = cpu;
             if (arm_is_psci_call(arm_cpu, EXCP_SMC)) {
                 arm_handle_psci_call(arm_cpu);
@@ -575,7 +541,6 @@  int hvf_vcpu_exec(CPUState *cpu)
                 env->xregs[0] = -1;
                 env->pc += 4;
             }
-            qemu_mutex_unlock_iothread();
             break;
         default:
             cpu_synchronize_state(cpu);
@@ -596,7 +561,6 @@  int hvf_vcpu_exec(CPUState *cpu)
         }
     } while (ret == 0);
 
-    qemu_mutex_lock_iothread();
     current_cpu = cpu;
 
     return ret;