Message ID | 1343188195-25239-1-git-send-email-Bharat.Bhushan@freescale.com (mailing list archive)
---|---
State | New, archived
On 25.07.2012, at 05:49, Bharat Bhushan wrote: > This patch adds the watchdog emulation in KVM. The watchdog > emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. > The kernel timer are used for watchdog emulation and emulates > h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU > if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how > it is configured. > > Signed-off-by: Liu Yu <yu.liu@freescale.com> > Signed-off-by: Scott Wood <scottwood@freescale.com> > [bharat.bhushan@freescale.com: reworked patch] > Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com> > --- > v6: > - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization > - Using spin_lock_irqsave() > - Using del_timer_sync(). > > v5: > - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG. > - Moved watchdog_next_timeout() in lock. > > arch/powerpc/include/asm/kvm_host.h | 3 + > arch/powerpc/include/asm/kvm_ppc.h | 4 + > arch/powerpc/include/asm/reg_booke.h | 7 ++ > arch/powerpc/kvm/book3s.c | 9 ++ > arch/powerpc/kvm/booke.c | 156 ++++++++++++++++++++++++++++++++++ > arch/powerpc/kvm/booke_emulate.c | 8 ++ > arch/powerpc/kvm/powerpc.c | 14 +++- > include/linux/kvm.h | 2 + > include/linux/kvm_host.h | 1 + > 9 files changed, 202 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index 50ea12f..43cac48 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { > ulong fault_esr; > ulong queued_dear; > ulong queued_esr; > + spinlock_t wdt_lock; > + struct timer_list wdt_timer; > u32 tlbcfg[4]; > u32 mmucfg; > u32 epr; > @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { > u8 osi_needed; > u8 osi_enabled; > u8 papr_enabled; > + u8 watchdog_enabled; > u8 sane; > u8 cpu_type; > u8 hcall_needed; > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index 0124937..823d563 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); > extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); > extern void kvmppc_decrementer_func(unsigned long data); > extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); > +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); > +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); > > /* Core-specific hooks */ > > @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, > struct kvm_interrupt *irq); > extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, > struct kvm_interrupt *irq); > +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); > +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); > > extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, > unsigned int op, int *advance); > diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h > index 2d916c4..e07e6af 100644 > --- a/arch/powerpc/include/asm/reg_booke.h > +++ b/arch/powerpc/include/asm/reg_booke.h > @@ -539,6 +539,13 @@ > #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ > #define TCR_ARE 0x00400000 /* Auto Reload Enable */ > > +#ifdef CONFIG_E500 > +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ > + (((tcr) & 0x1E0000) >> 15)) > +#else > +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) > +#endif > + > /* Bit 
definitions for the TSR. */ > #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ > #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index 3f2a836..e946665 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) > return 0; > } > > +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) > +{ > + return 0; > +} > + > +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > +} > + > int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) > { > int i; > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index d25a097..eadd86c 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, > clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); > } > > +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) > +{ > + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); > +} > + > +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) > +{ > + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); > +} > + These should be static, since we only need them in booke specific code. > static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > { > #ifdef CONFIG_KVM_BOOKE_HV > @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > msr_mask = MSR_CE | MSR_ME | MSR_DE; > int_class = INT_CLASS_NONCRIT; > break; > + case BOOKE_IRQPRIO_WATCHDOG: > case BOOKE_IRQPRIO_CRITICAL: > case BOOKE_IRQPRIO_DBELL_CRIT: > allowed = vcpu->arch.shared->msr & MSR_CE; > @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > return allowed; > } > > +/* > + * Return the number of jiffies until the next timeout. If the timeout is > + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA > + * because the larger value can break the timer APIs. > + */ > +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) > +{ > + u64 tb, wdt_tb, wdt_ticks = 0; > + u64 nr_jiffies = 0; > + u32 period = TCR_GET_WP(vcpu->arch.tcr); > + > + wdt_tb = 1ULL << (63 - period); > + tb = get_tb(); > + /* > + * The watchdog timeout will hapeen when TB bit corresponding > + * to watchdog will toggle from 0 to 1. > + */ > + if (tb & wdt_tb) > + wdt_ticks = wdt_tb; > + > + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); > + > + /* Convert timebase ticks to jiffies */ > + nr_jiffies = wdt_ticks; > + > + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) > + nr_jiffies++; > + > + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); > +} > + > +static void arm_next_watchdog(struct kvm_vcpu *vcpu) > +{ > + unsigned long nr_jiffies; > + unsigned long flags; > + > + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); > + nr_jiffies = watchdog_next_timeout(vcpu); > + /* > + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA > + * then do not run the watchdog timer as this can break timer APIs. 
> + */ > + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) > + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); > + else > + del_timer(&vcpu->arch.wdt_timer); > + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); > +} > + > +void kvmppc_watchdog_func(unsigned long data) > +{ > + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; > + u32 tsr, new_tsr; > + int final; > + > + do { > + new_tsr = tsr = vcpu->arch.tsr; > + final = 0; > + > + /* Time out event */ > + if (tsr & TSR_ENW) { > + if (tsr & TSR_WIS) > + final = 1; > + else > + new_tsr = tsr | TSR_WIS; > + } else { > + new_tsr = tsr | TSR_ENW; > + } > + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); > + > + if (new_tsr & TSR_WIS) { > + smp_wmb(); > + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > + kvm_vcpu_kick(vcpu); > + } > + > + /* > + * If this is final watchdog expiry and some action is required > + * then exit to userspace. > + */ > + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && > + vcpu->arch.watchdog_enabled) { > + smp_wmb(); > + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); > + kvm_vcpu_kick(vcpu); > + } > + > + /* > + * Stop running the watchdog timer after final expiration to > + * prevent the host from being flooded with timers if the > + * guest sets a short period. > + * Timers will resume when TSR/TCR is updated next time. > + */ > + if (!final) > + arm_next_watchdog(vcpu); > +} > + > static void update_timer_ints(struct kvm_vcpu *vcpu) > { > if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) > kvmppc_core_queue_dec(vcpu); > else > kvmppc_core_dequeue_dec(vcpu); > + > + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) > + kvmppc_core_queue_watchdog(vcpu); > + else > + kvmppc_core_dequeue_watchdog(vcpu); > } > > static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) > @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > goto out; > } > > + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && > + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { > + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; > + ret = 0; > + goto out; > + } I think we want to push this into a generic function that checks for requests. Check out https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738 for what I did there, namely kvmppc_check_requests. If your case we obviously want a return value. I don't mind which patch goes in first, so if you add a nice and generic version first, I can easily base my mmu notifier patches on top of yours. 
> + > kvm_guest_enter(); > > #ifdef CONFIG_PPC_FPU > @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > } > } > > + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && > + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { > + run->exit_reason = KVM_EXIT_WATCHDOG; > + r = RESUME_HOST | (r & RESUME_FLAG_NV); > + } > + > return r; > } > > @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) > return r; > } > > +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) > +{ > + /* setup watchdog timer once */ > + spin_lock_init(&vcpu->arch.wdt_lock); > + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, > + (unsigned long)vcpu); > + > + return 0; > +} > + > +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > + del_timer_sync(&vcpu->arch.wdt_timer); > +} > + > int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) > { > int i; > @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, > } > > if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { > + u32 old_tsr = vcpu->arch.tsr; > + > vcpu->arch.tsr = sregs->u.e.tsr; > + > + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) > + arm_next_watchdog(vcpu); Just curious. Is this potentially racy with the watchdog timer, since we're not accessing tsr atomically. This patch already looks a lot nicer than the previous versions! Scott, for the final patch I'd also like to get your ack. You know your way around the hardware way better than me :). Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
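For reference, a minimal sketch of the generic request-check helper Alex asks for above. The name kvmppc_check_requests follows his linked commit; the shape shown here (returning an exit reason, with the watchdog branch mirroring the check currently open-coded in kvmppc_vcpu_run()) is only an assumption about how the watchdog case could slot in, not code from the patch.

```c
/*
 * Sketch only: a central place for KVM_REQ_* handling on the way into
 * the guest.  Returns 0 if we can enter the guest, or an exit reason if
 * we must bounce to user space.  The watchdog branch mirrors the check
 * that this patch adds directly to kvmppc_vcpu_run().
 */
static int kvmppc_check_requests(struct kvm_vcpu *vcpu)
{
	int r = 0;

	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS)))
		r = KVM_EXIT_WATCHDOG;

	return r;
}

/* Possible caller in kvmppc_vcpu_run():
 *
 *	r = kvmppc_check_requests(vcpu);
 *	if (r) {
 *		kvm_run->exit_reason = r;
 *		ret = 0;
 *		goto out;
 *	}
 */
```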
On 07/31/2012 06:50 AM, Alexander Graf wrote: > > On 25.07.2012, at 05:49, Bharat Bhushan wrote: > >> This patch adds the watchdog emulation in KVM. The watchdog >> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. >> The kernel timer are used for watchdog emulation and emulates >> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU >> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how >> it is configured. >> >> Signed-off-by: Liu Yu <yu.liu@freescale.com> >> Signed-off-by: Scott Wood <scottwood@freescale.com> >> [bharat.bhushan@freescale.com: reworked patch] >> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com> >> --- >> v6: >> - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization >> - Using spin_lock_irqsave() >> - Using del_timer_sync(). >> >> v5: >> - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG. >> - Moved watchdog_next_timeout() in lock. >> >> arch/powerpc/include/asm/kvm_host.h | 3 + >> arch/powerpc/include/asm/kvm_ppc.h | 4 + >> arch/powerpc/include/asm/reg_booke.h | 7 ++ >> arch/powerpc/kvm/book3s.c | 9 ++ >> arch/powerpc/kvm/booke.c | 156 ++++++++++++++++++++++++++++++++++ >> arch/powerpc/kvm/booke_emulate.c | 8 ++ >> arch/powerpc/kvm/powerpc.c | 14 +++- >> include/linux/kvm.h | 2 + >> include/linux/kvm_host.h | 1 + >> 9 files changed, 202 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h >> index 50ea12f..43cac48 100644 >> --- a/arch/powerpc/include/asm/kvm_host.h >> +++ b/arch/powerpc/include/asm/kvm_host.h >> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { >> ulong fault_esr; >> ulong queued_dear; >> ulong queued_esr; >> + spinlock_t wdt_lock; >> + struct timer_list wdt_timer; >> u32 tlbcfg[4]; >> u32 mmucfg; >> u32 epr; >> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { >> u8 osi_needed; >> u8 osi_enabled; >> u8 papr_enabled; >> + u8 watchdog_enabled; >> u8 sane; >> u8 cpu_type; >> u8 hcall_needed; >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h >> index 0124937..823d563 100644 >> --- a/arch/powerpc/include/asm/kvm_ppc.h >> +++ b/arch/powerpc/include/asm/kvm_ppc.h >> @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); >> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); >> extern void kvmppc_decrementer_func(unsigned long data); >> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); >> +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); >> +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); >> >> /* Core-specific hooks */ >> >> @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, >> struct kvm_interrupt *irq); >> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> struct kvm_interrupt *irq); >> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); >> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); >> >> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, >> unsigned int op, int *advance); >> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h >> index 2d916c4..e07e6af 100644 >> --- a/arch/powerpc/include/asm/reg_booke.h >> +++ b/arch/powerpc/include/asm/reg_booke.h >> @@ -539,6 +539,13 @@ >> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ >> #define TCR_ARE 0x00400000 /* Auto Reload Enable */ >> >> +#ifdef CONFIG_E500 >> +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 
30) | \ >> + (((tcr) & 0x1E0000) >> 15)) >> +#else >> +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) >> +#endif >> + >> /* Bit definitions for the TSR. */ >> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ >> #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >> index 3f2a836..e946665 100644 >> --- a/arch/powerpc/kvm/book3s.c >> +++ b/arch/powerpc/kvm/book3s.c >> @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return 0; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >> { >> int i; >> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c >> index d25a097..eadd86c 100644 >> --- a/arch/powerpc/kvm/booke.c >> +++ b/arch/powerpc/kvm/booke.c >> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); >> } >> >> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); >> +} >> + >> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); >> +} >> + > > These should be static, since we only need them in booke specific code. > >> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) >> { >> #ifdef CONFIG_KVM_BOOKE_HV >> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >> msr_mask = MSR_CE | MSR_ME | MSR_DE; >> int_class = INT_CLASS_NONCRIT; >> break; >> + case BOOKE_IRQPRIO_WATCHDOG: >> case BOOKE_IRQPRIO_CRITICAL: >> case BOOKE_IRQPRIO_DBELL_CRIT: >> allowed = vcpu->arch.shared->msr & MSR_CE; >> @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >> return allowed; >> } >> >> +/* >> + * Return the number of jiffies until the next timeout. If the timeout is >> + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA >> + * because the larger value can break the timer APIs. >> + */ >> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) >> +{ >> + u64 tb, wdt_tb, wdt_ticks = 0; >> + u64 nr_jiffies = 0; >> + u32 period = TCR_GET_WP(vcpu->arch.tcr); >> + >> + wdt_tb = 1ULL << (63 - period); >> + tb = get_tb(); >> + /* >> + * The watchdog timeout will hapeen when TB bit corresponding >> + * to watchdog will toggle from 0 to 1. >> + */ >> + if (tb & wdt_tb) >> + wdt_ticks = wdt_tb; >> + >> + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); >> + >> + /* Convert timebase ticks to jiffies */ >> + nr_jiffies = wdt_ticks; >> + >> + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) >> + nr_jiffies++; >> + >> + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); >> +} >> + >> +static void arm_next_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + unsigned long nr_jiffies; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); >> + nr_jiffies = watchdog_next_timeout(vcpu); >> + /* >> + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA >> + * then do not run the watchdog timer as this can break timer APIs. 
>> + */ >> + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) >> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); >> + else >> + del_timer(&vcpu->arch.wdt_timer); >> + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); >> +} >> + >> +void kvmppc_watchdog_func(unsigned long data) >> +{ >> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; >> + u32 tsr, new_tsr; >> + int final; >> + >> + do { >> + new_tsr = tsr = vcpu->arch.tsr; >> + final = 0; >> + >> + /* Time out event */ >> + if (tsr & TSR_ENW) { >> + if (tsr & TSR_WIS) >> + final = 1; >> + else >> + new_tsr = tsr | TSR_WIS; >> + } else { >> + new_tsr = tsr | TSR_ENW; >> + } >> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); >> + >> + if (new_tsr & TSR_WIS) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * If this is final watchdog expiry and some action is required >> + * then exit to userspace. >> + */ >> + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && >> + vcpu->arch.watchdog_enabled) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * Stop running the watchdog timer after final expiration to >> + * prevent the host from being flooded with timers if the >> + * guest sets a short period. >> + * Timers will resume when TSR/TCR is updated next time. >> + */ >> + if (!final) >> + arm_next_watchdog(vcpu); >> +} >> + >> static void update_timer_ints(struct kvm_vcpu *vcpu) >> { >> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) >> kvmppc_core_queue_dec(vcpu); >> else >> kvmppc_core_dequeue_dec(vcpu); >> + >> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) >> + kvmppc_core_queue_watchdog(vcpu); >> + else >> + kvmppc_core_dequeue_watchdog(vcpu); >> } >> >> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) >> @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) >> goto out; >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; >> + ret = 0; >> + goto out; >> + } > > I think we want to push this into a generic function that checks for requests. Check out > > https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738 > > for what I did there, namely kvmppc_check_requests. If your case we obviously want a return value. I don't mind which patch goes in first, so if you add a nice and generic version first, I can easily base my mmu notifier patches on top of yours. 
> >> + >> kvm_guest_enter(); >> >> #ifdef CONFIG_PPC_FPU >> @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >> } >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + run->exit_reason = KVM_EXIT_WATCHDOG; >> + r = RESUME_HOST | (r & RESUME_FLAG_NV); >> + } >> + >> return r; >> } >> >> @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return r; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + /* setup watchdog timer once */ >> + spin_lock_init(&vcpu->arch.wdt_lock); >> + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, >> + (unsigned long)vcpu); >> + >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> + del_timer_sync(&vcpu->arch.wdt_timer); >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >> { >> int i; >> @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, >> } >> >> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { >> + u32 old_tsr = vcpu->arch.tsr; >> + >> vcpu->arch.tsr = sregs->u.e.tsr; >> + >> + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) >> + arm_next_watchdog(vcpu); > > Just curious. Is this potentially racy with the watchdog timer, since we're not accessing tsr atomically. Userspace updating TSR is racy (not just with the watchdog). That's why userspace needs to set a special flag for it to happen. It's meant for reset, migration, etc. rather than normal runtime operations. This could be a problem if we want QEMU to clear the watchdog when resuming from debug halt -- we don't want to miss a decrementer that happens around the same time. Maybe we should have separate virtual registers (via one-reg) for each timer source. The only new raciness I can see is if we race with a watchdog timer that determines itself to be the final expiration, and thus doesn't run arm_next_watchdog itself. In practice, I think you'd need three watchdog timers to go off between during the above code, so it's not very realistic, but I suppose you could use a cmpxchg to avoid it. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
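To make Scott's closing cmpxchg suggestion concrete, here is one way it could look in the set_sregs_base() path. This is only an illustration of the idea, not code from the patch: it assumes the timer callback keeps using its own cmpxchg loop, so the user-space write becomes atomic against it and old_tsr is the value that was really replaced.

```c
/*
 * Illustration of "use a cmpxchg to avoid it": the user-space TSR write
 * retries until it swaps against the current value, so the ENW/WIS
 * change test below cannot be fooled by a watchdog timer firing between
 * the read and the write.
 */
if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
	u32 old_tsr, new_tsr = sregs->u.e.tsr;

	do {
		old_tsr = vcpu->arch.tsr;
	} while (cmpxchg(&vcpu->arch.tsr, old_tsr, new_tsr) != old_tsr);

	if ((old_tsr ^ new_tsr) & (TSR_ENW | TSR_WIS))
		arm_next_watchdog(vcpu);

	update_timer_ints(vcpu);
}
```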
On 31.07.2012, at 23:24, Scott Wood wrote: > On 07/31/2012 06:50 AM, Alexander Graf wrote: >> >> On 25.07.2012, at 05:49, Bharat Bhushan wrote: >> >>> This patch adds the watchdog emulation in KVM. The watchdog >>> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. >>> The kernel timer are used for watchdog emulation and emulates >>> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU >>> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how >>> it is configured. >>> >>> Signed-off-by: Liu Yu <yu.liu@freescale.com> >>> Signed-off-by: Scott Wood <scottwood@freescale.com> >>> [bharat.bhushan@freescale.com: reworked patch] >>> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com> >>> --- >>> v6: >>> - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization >>> - Using spin_lock_irqsave() >>> - Using del_timer_sync(). >>> >>> v5: >>> - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG. >>> - Moved watchdog_next_timeout() in lock. >>> >>> arch/powerpc/include/asm/kvm_host.h | 3 + >>> arch/powerpc/include/asm/kvm_ppc.h | 4 + >>> arch/powerpc/include/asm/reg_booke.h | 7 ++ >>> arch/powerpc/kvm/book3s.c | 9 ++ >>> arch/powerpc/kvm/booke.c | 156 ++++++++++++++++++++++++++++++++++ >>> arch/powerpc/kvm/booke_emulate.c | 8 ++ >>> arch/powerpc/kvm/powerpc.c | 14 +++- >>> include/linux/kvm.h | 2 + >>> include/linux/kvm_host.h | 1 + >>> 9 files changed, 202 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h >>> index 50ea12f..43cac48 100644 >>> --- a/arch/powerpc/include/asm/kvm_host.h >>> +++ b/arch/powerpc/include/asm/kvm_host.h >>> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { >>> ulong fault_esr; >>> ulong queued_dear; >>> ulong queued_esr; >>> + spinlock_t wdt_lock; >>> + struct timer_list wdt_timer; >>> u32 tlbcfg[4]; >>> u32 mmucfg; >>> u32 epr; >>> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { >>> u8 osi_needed; >>> u8 osi_enabled; >>> u8 papr_enabled; >>> + u8 watchdog_enabled; >>> u8 sane; >>> u8 cpu_type; >>> u8 hcall_needed; >>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h >>> index 0124937..823d563 100644 >>> --- a/arch/powerpc/include/asm/kvm_ppc.h >>> +++ b/arch/powerpc/include/asm/kvm_ppc.h >>> @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); >>> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); >>> extern void kvmppc_decrementer_func(unsigned long data); >>> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); >>> +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); >>> +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); >>> >>> /* Core-specific hooks */ >>> >>> @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, >>> struct kvm_interrupt *irq); >>> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >>> struct kvm_interrupt *irq); >>> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); >>> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); >>> >>> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, >>> unsigned int op, int *advance); >>> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h >>> index 2d916c4..e07e6af 100644 >>> --- a/arch/powerpc/include/asm/reg_booke.h >>> +++ b/arch/powerpc/include/asm/reg_booke.h >>> @@ -539,6 +539,13 @@ >>> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ >>> 
#define TCR_ARE 0x00400000 /* Auto Reload Enable */ >>> >>> +#ifdef CONFIG_E500 >>> +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ >>> + (((tcr) & 0x1E0000) >> 15)) >>> +#else >>> +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) >>> +#endif >>> + >>> /* Bit definitions for the TSR. */ >>> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ >>> #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ >>> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >>> index 3f2a836..e946665 100644 >>> --- a/arch/powerpc/kvm/book3s.c >>> +++ b/arch/powerpc/kvm/book3s.c >>> @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >>> return 0; >>> } >>> >>> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >>> +{ >>> + return 0; >>> +} >>> + >>> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >>> +{ >>> +} >>> + >>> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >>> { >>> int i; >>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c >>> index d25a097..eadd86c 100644 >>> --- a/arch/powerpc/kvm/booke.c >>> +++ b/arch/powerpc/kvm/booke.c >>> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >>> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); >>> } >>> >>> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) >>> +{ >>> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); >>> +} >>> + >>> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) >>> +{ >>> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); >>> +} >>> + >> >> These should be static, since we only need them in booke specific code. >> >>> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) >>> { >>> #ifdef CONFIG_KVM_BOOKE_HV >>> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >>> msr_mask = MSR_CE | MSR_ME | MSR_DE; >>> int_class = INT_CLASS_NONCRIT; >>> break; >>> + case BOOKE_IRQPRIO_WATCHDOG: >>> case BOOKE_IRQPRIO_CRITICAL: >>> case BOOKE_IRQPRIO_DBELL_CRIT: >>> allowed = vcpu->arch.shared->msr & MSR_CE; >>> @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >>> return allowed; >>> } >>> >>> +/* >>> + * Return the number of jiffies until the next timeout. If the timeout is >>> + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA >>> + * because the larger value can break the timer APIs. >>> + */ >>> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) >>> +{ >>> + u64 tb, wdt_tb, wdt_ticks = 0; >>> + u64 nr_jiffies = 0; >>> + u32 period = TCR_GET_WP(vcpu->arch.tcr); >>> + >>> + wdt_tb = 1ULL << (63 - period); >>> + tb = get_tb(); >>> + /* >>> + * The watchdog timeout will hapeen when TB bit corresponding >>> + * to watchdog will toggle from 0 to 1. 
>>> + */ >>> + if (tb & wdt_tb) >>> + wdt_ticks = wdt_tb; >>> + >>> + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); >>> + >>> + /* Convert timebase ticks to jiffies */ >>> + nr_jiffies = wdt_ticks; >>> + >>> + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) >>> + nr_jiffies++; >>> + >>> + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); >>> +} >>> + >>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu) >>> +{ >>> + unsigned long nr_jiffies; >>> + unsigned long flags; >>> + >>> + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); >>> + nr_jiffies = watchdog_next_timeout(vcpu); >>> + /* >>> + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA >>> + * then do not run the watchdog timer as this can break timer APIs. >>> + */ >>> + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) >>> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); >>> + else >>> + del_timer(&vcpu->arch.wdt_timer); >>> + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); >>> +} >>> + >>> +void kvmppc_watchdog_func(unsigned long data) >>> +{ >>> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; >>> + u32 tsr, new_tsr; >>> + int final; >>> + >>> + do { >>> + new_tsr = tsr = vcpu->arch.tsr; >>> + final = 0; >>> + >>> + /* Time out event */ >>> + if (tsr & TSR_ENW) { >>> + if (tsr & TSR_WIS) >>> + final = 1; >>> + else >>> + new_tsr = tsr | TSR_WIS; >>> + } else { >>> + new_tsr = tsr | TSR_ENW; >>> + } >>> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); >>> + >>> + if (new_tsr & TSR_WIS) { >>> + smp_wmb(); >>> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); >>> + kvm_vcpu_kick(vcpu); >>> + } >>> + >>> + /* >>> + * If this is final watchdog expiry and some action is required >>> + * then exit to userspace. >>> + */ >>> + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && >>> + vcpu->arch.watchdog_enabled) { >>> + smp_wmb(); >>> + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); >>> + kvm_vcpu_kick(vcpu); >>> + } >>> + >>> + /* >>> + * Stop running the watchdog timer after final expiration to >>> + * prevent the host from being flooded with timers if the >>> + * guest sets a short period. >>> + * Timers will resume when TSR/TCR is updated next time. >>> + */ >>> + if (!final) >>> + arm_next_watchdog(vcpu); >>> +} >>> + >>> static void update_timer_ints(struct kvm_vcpu *vcpu) >>> { >>> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) >>> kvmppc_core_queue_dec(vcpu); >>> else >>> kvmppc_core_dequeue_dec(vcpu); >>> + >>> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) >>> + kvmppc_core_queue_watchdog(vcpu); >>> + else >>> + kvmppc_core_dequeue_watchdog(vcpu); >>> } >>> >>> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) >>> @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) >>> goto out; >>> } >>> >>> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >>> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >>> + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; >>> + ret = 0; >>> + goto out; >>> + } >> >> I think we want to push this into a generic function that checks for requests. Check out >> >> https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738 >> >> for what I did there, namely kvmppc_check_requests. If your case we obviously want a return value. I don't mind which patch goes in first, so if you add a nice and generic version first, I can easily base my mmu notifier patches on top of yours. 
>> >>> + >>> kvm_guest_enter(); >>> >>> #ifdef CONFIG_PPC_FPU >>> @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >>> } >>> } >>> >>> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >>> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >>> + run->exit_reason = KVM_EXIT_WATCHDOG; >>> + r = RESUME_HOST | (r & RESUME_FLAG_NV); >>> + } >>> + >>> return r; >>> } >>> >>> @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >>> return r; >>> } >>> >>> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >>> +{ >>> + /* setup watchdog timer once */ >>> + spin_lock_init(&vcpu->arch.wdt_lock); >>> + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, >>> + (unsigned long)vcpu); >>> + >>> + return 0; >>> +} >>> + >>> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >>> +{ >>> + del_timer_sync(&vcpu->arch.wdt_timer); >>> +} >>> + >>> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >>> { >>> int i; >>> @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, >>> } >>> >>> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { >>> + u32 old_tsr = vcpu->arch.tsr; >>> + >>> vcpu->arch.tsr = sregs->u.e.tsr; >>> + >>> + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) >>> + arm_next_watchdog(vcpu); >> >> Just curious. Is this potentially racy with the watchdog timer, since we're not accessing tsr atomically. > > Userspace updating TSR is racy (not just with the watchdog). > That's why userspace needs to set a special flag for it to happen. It's > meant for reset, migration, etc. rather than normal runtime operations. > This could be a problem if we want QEMU to clear the watchdog when > resuming from debug halt -- we don't want to miss a decrementer that > happens around the same time. Maybe we should have separate virtual > registers (via one-reg) for each timer source. Well, user space always comes in via the ioctl path, which in turn does vcpu_load(vcpu), thus should always be running as the vcpu context itself. It boils down to the original thing I was saying back when you introduced asynchronous TSR updates. Why don't we just make TSR updates kvm requests? That way we can ensure that whoever deals with the vcpu is the one in charge of it, right? No races left to worry about. Alex > The only new raciness I can see is if we race with a watchdog timer that > determines itself to be the final expiration, and thus doesn't run > arm_next_watchdog itself. In practice, I think you'd need three > watchdog timers to go off between during the above code, so it's not > very realistic, but I suppose you could use a cmpxchg to avoid it. > > -Scott > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
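The timebase arithmetic quoted above (the watchdog fires when timebase bit 63 - WP toggles from 0 to 1) can be tried standalone. The following is a user-space illustration with made-up input values, not kernel code; it reproduces only the tick calculation from watchdog_next_timeout(), without the jiffies conversion or the NEXT_TIMER_MAX_DELTA clamp.

```c
/*
 * Standalone illustration of the watchdog_next_timeout() tick math.
 * The watch bit is timebase bit (63 - period), counting from the
 * least-significant bit; the timeout is the distance until that bit
 * next goes from 0 to 1.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t ticks_to_next_watchdog(uint64_t tb, uint32_t period)
{
	uint64_t wdt_tb = 1ULL << (63 - period);
	uint64_t wdt_ticks = 0;

	/* If the watch bit is already 1, it must first clear, then set. */
	if (tb & wdt_tb)
		wdt_ticks = wdt_tb;

	/* Ticks until the watch bit next becomes 1. */
	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
	return wdt_ticks;
}

int main(void)
{
	uint64_t tb = 0x123456789ULL;	/* made-up timebase value */
	uint32_t period = 33;		/* made-up TCR_GET_WP() result */

	printf("ticks until expiry: %llu\n",
	       (unsigned long long)ticks_to_next_watchdog(tb, period));
	return 0;
}
```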
On 07/31/2012 06:59 PM, Alexander Graf wrote:
> Well, user space always comes in via the ioctl path, which in turn
> does vcpu_load(vcpu), thus should always be running as the vcpu
> context itself.

Right, it's the timers you're racing with.

> It boils down to the original thing I was saying back when you
> introduced asynchronous TSR updates.

Right.

> Why don't we just make TSR
> updates kvm requests?

Maybe we should.

-Scott
On 01.08.2012, at 04:13, Scott Wood wrote:

> On 07/31/2012 06:59 PM, Alexander Graf wrote:
>> Well, user space always comes in via the ioctl path, which in turn
>> does vcpu_load(vcpu), thus should always be running as the vcpu
>> context itself.
>
> Right, it's the timers you're racing with.
>
>> It boils down to the original thing I was saying back when you
>> introduced asynchronous TSR updates.
>
> Right.
>
>> Why don't we just make TSR
>> updates kvm requests?
>
> Maybe we should.

I prefer to have my code easy to understand. And every time I see cmpxchg and spin locks my readability skills take a serious hit :).

The big question is whether we care for the scope of the watchdog patch. Changing it to requests is a purely internal rework. So we could leave it with the (unlikely) race for now, agree on the fact that we won't introduce any other direct TSR accessors and move the code to be request based soon'ish.

Alex
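For context, the request-based rework being deferred here could look roughly like the sketch below: user-space-initiated TSR writes are staged and serviced from the vcpu thread via a request. KVM_REQ_TSR_UPDATE, arch.pending_tsr and both helper names are hypothetical; this is a sketch of the idea, not a description of what was eventually merged.

```c
/*
 * Hypothetical sketch of routing TSR updates through a vcpu request so
 * that user-space-initiated writes to vcpu->arch.tsr happen in vcpu
 * context.  KVM_REQ_TSR_UPDATE and arch.pending_tsr do not exist in
 * this patch.
 */
static void kvmppc_queue_tsr_update(struct kvm_vcpu *vcpu, u32 new_tsr)
{
	vcpu->arch.pending_tsr = new_tsr;
	smp_wmb();
	kvm_make_request(KVM_REQ_TSR_UPDATE, vcpu);
	kvm_vcpu_kick(vcpu);
}

/* Serviced in vcpu context, e.g. from a generic request-check path. */
static void kvmppc_do_tsr_update(struct kvm_vcpu *vcpu)
{
	u32 old_tsr = vcpu->arch.tsr;

	vcpu->arch.tsr = vcpu->arch.pending_tsr;

	if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
		arm_next_watchdog(vcpu);

	update_timer_ints(vcpu);
}
```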
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 50ea12f..43cac48 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { ulong fault_esr; ulong queued_dear; ulong queued_esr; + spinlock_t wdt_lock; + struct timer_list wdt_timer; u32 tlbcfg[4]; u32 mmucfg; u32 epr; @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { u8 osi_needed; u8 osi_enabled; u8 papr_enabled; + u8 watchdog_enabled; u8 sane; u8 cpu_type; u8 hcall_needed; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 0124937..823d563 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(unsigned long data); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); /* Core-specific hooks */ @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int op, int *advance); diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 2d916c4..e07e6af 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -539,6 +539,13 @@ #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ #define TCR_ARE 0x00400000 /* Auto Reload Enable */ +#ifdef CONFIG_E500 +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ + (((tcr) & 0x1E0000) >> 15)) +#else +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) +#endif + /* Bit definitions for the TSR. 
*/ #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3f2a836..e946665 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d25a097..eadd86c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) { #ifdef CONFIG_KVM_BOOKE_HV @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, msr_mask = MSR_CE | MSR_ME | MSR_DE; int_class = INT_CLASS_NONCRIT; break; + case BOOKE_IRQPRIO_WATCHDOG: case BOOKE_IRQPRIO_CRITICAL: case BOOKE_IRQPRIO_DBELL_CRIT: allowed = vcpu->arch.shared->msr & MSR_CE; @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } +/* + * Return the number of jiffies until the next timeout. If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ + u64 tb, wdt_tb, wdt_ticks = 0; + u64 nr_jiffies = 0; + u32 period = TCR_GET_WP(vcpu->arch.tcr); + + wdt_tb = 1ULL << (63 - period); + tb = get_tb(); + /* + * The watchdog timeout will hapeen when TB bit corresponding + * to watchdog will toggle from 0 to 1. + */ + if (tb & wdt_tb) + wdt_ticks = wdt_tb; + + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + + /* Convert timebase ticks to jiffies */ + nr_jiffies = wdt_ticks; + + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) + nr_jiffies++; + + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ + unsigned long nr_jiffies; + unsigned long flags; + + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); + nr_jiffies = watchdog_next_timeout(vcpu); + /* + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * then do not run the watchdog timer as this can break timer APIs. 
+ */ + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); + else + del_timer(&vcpu->arch.wdt_timer); + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + u32 tsr, new_tsr; + int final; + + do { + new_tsr = tsr = vcpu->arch.tsr; + final = 0; + + /* Time out event */ + if (tsr & TSR_ENW) { + if (tsr & TSR_WIS) + final = 1; + else + new_tsr = tsr | TSR_WIS; + } else { + new_tsr = tsr | TSR_ENW; + } + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + + if (new_tsr & TSR_WIS) { + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * If this is final watchdog expiry and some action is required + * then exit to userspace. + */ + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && + vcpu->arch.watchdog_enabled) { + smp_wmb(); + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * Stop running the watchdog timer after final expiration to + * prevent the host from being flooded with timers if the + * guest sets a short period. + * Timers will resume when TSR/TCR is updated next time. + */ + if (!final) + arm_next_watchdog(vcpu); +} + static void update_timer_ints(struct kvm_vcpu *vcpu) { if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) kvmppc_core_queue_dec(vcpu); else kvmppc_core_dequeue_dec(vcpu); + + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) + kvmppc_core_queue_watchdog(vcpu); + else + kvmppc_core_dequeue_watchdog(vcpu); } static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) goto out; } + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; + ret = 0; + goto out; + } + kvm_guest_enter(); #ifdef CONFIG_PPC_FPU @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } } + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { + run->exit_reason = KVM_EXIT_WATCHDOG; + r = RESUME_HOST | (r & RESUME_FLAG_NV); + } + return r; } @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + /* setup watchdog timer once */ + spin_lock_init(&vcpu->arch.wdt_lock); + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, + (unsigned long)vcpu); + + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + del_timer_sync(&vcpu->arch.wdt_timer); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { + u32 old_tsr = vcpu->arch.tsr; + vcpu->arch.tsr = sregs->u.e.tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } @@ -1267,6 +1414,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) { vcpu->arch.tcr = new_tcr; + arm_next_watchdog(vcpu); update_timer_ints(vcpu); } @@ -1281,6 +1429,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) { clear_bits(tsr_bits, &vcpu->arch.tsr); + + /* + * We may have stopped the watchdog due to + * being stuck 
on final expiration. + */ + if (tsr_bits & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 12834bb..5a66ade 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) kvmppc_clr_tsr_bits(vcpu, spr_val); break; case SPRN_TCR: + /* + * WRC is a 2-bit field that is supposed to preserve its + * value once written to non-zero. + */ + if (vcpu->arch.tcr & TCR_WRC_MASK) { + spr_val &= ~TCR_WRC_MASK; + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; + } kvmppc_set_tcr(vcpu, spr_val); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 87f4dc8..685829a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -220,6 +220,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_SREGS: + case KVM_CAP_PPC_BOOKE_WATCHDOG: #else case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_HIOR: @@ -378,6 +379,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + int ret; + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; @@ -386,13 +389,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); #endif - - return 0; + ret = kvmppc_subarch_vcpu_init(vcpu); + return ret; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -637,6 +641,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_WATCHDOG: + r = 0; + vcpu->arch.watchdog_enabled = true; + break; +#endif #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: { struct kvm_config_tlb cfg; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa..9ff5aa5 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -163,6 +163,7 @@ struct kvm_pit_config { #define KVM_EXIT_OSI 18 #define KVM_EXIT_PAPR_HCALL 19 #define KVM_EXIT_S390_UCONTROL 20 +#define KVM_EXIT_WATCHDOG 21 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 +#define KVM_CAP_PPC_BOOKE_WATCHDOG 81 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 27ac8a4..41d32be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -69,6 +69,7 @@ #define KVM_REQ_IMMEDIATE_EXIT 15 #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 +#define KVM_REQ_WATCHDOG 18 #define KVM_USERSPACE_IRQ_SOURCE_ID 0
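To show how user space is expected to consume this interface, here is a hedged sketch using raw KVM ioctls rather than actual QEMU code: enable the capability on each vcpu, then act on KVM_EXIT_WATCHDOG (reset, shutdown, etc. per configuration). It assumes headers that already carry this patch, since KVM_CAP_PPC_BOOKE_WATCHDOG and KVM_EXIT_WATCHDOG are introduced here; the reset policy itself is left as a comment.

```c
/*
 * User-space sketch (raw KVM ioctls, not actual QEMU code).  Requires
 * <linux/kvm.h> from a kernel with this patch applied, since it defines
 * KVM_CAP_PPC_BOOKE_WATCHDOG and KVM_EXIT_WATCHDOG.
 */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_booke_watchdog(int vcpu_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_BOOKE_WATCHDOG;
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}

/* run is the kvm_run structure mmap'ed from vcpu_fd. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return;

		switch (run->exit_reason) {
		case KVM_EXIT_WATCHDOG:
			/* Final expiry with TCR.WRC non-zero: reset or
			 * shut down the guest per configured policy. */
			return;
		default:
			/* handle other exit reasons */
			break;
		}
	}
}
```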