Message ID | 201409111306169866254@sangfor.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 2014-09-11 07:06, Zhang Haoyu wrote: > Currently, we call ioapic_service() immediately when we find the irq is still > active during eoi broadcast. But for real hardware, there's some dealy between > the EOI writing and irq delivery (system bus latency?). So we need to emulate > this behavior. Otherwise, for a guest who haven't register a proper irq handler > , it would stay in the interrupt routine as this irq would be re-injected > immediately after guest enables interrupt. This would lead guest can't move > forward and may miss the possibility to get proper irq handler registered (one > example is windows guest resuming from hibernation). > > As there's no way to differ the unhandled irq from new raised ones, this patch > solve this problems by scheduling a delayed work when the count of irq injected > during eoi broadcast exceeds a threshold value. After this patch, the guest can > move a little forward when there's no suitable irq handler in case it may > register one very soon and for guest who has a bad irq detection routine ( such > as note_interrupt() in linux ), this bad irq would be recognized soon as in the > past. > > Cc: Michael S. Tsirkin <mst@redhat.com> > Signed-off-by: Jason Wang <jasowang@redhat.com> > Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com> > --- > include/trace/events/kvm.h | 20 ++++++++++++++++++ > virt/kvm/ioapic.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- > virt/kvm/ioapic.h | 6 ++++++ > 3 files changed, 75 insertions(+), 2 deletions(-) > > diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h > index 908925a..b05f688 100644 > --- a/include/trace/events/kvm.h > +++ b/include/trace/events/kvm.h > @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq, > __entry->coalesced ? " (coalesced)" : "") > ); > > +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, > + TP_PROTO(__u64 e), > + TP_ARGS(e), > + > + TP_STRUCT__entry( > + __field( __u64, e ) > + ), > + > + TP_fast_assign( > + __entry->e = e; > + ), > + > + TP_printk("dst %x vec=%u (%s|%s|%s%s)", > + (u8)(__entry->e >> 56), (u8)__entry->e, > + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), > + (__entry->e & (1<<11)) ? "logical" : "physical", > + (__entry->e & (1<<15)) ? "level" : "edge", > + (__entry->e & (1<<16)) ? "|masked" : "") > +); > + > TRACE_EVENT(kvm_msi_set_irq, > TP_PROTO(__u64 address, __u64 data), > TP_ARGS(address, data), > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c > index e8ce34c..a36c4c4 100644 > --- a/virt/kvm/ioapic.c > +++ b/virt/kvm/ioapic.c > @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) > spin_unlock(&ioapic->lock); > } > > +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) > +{ > + int i; > + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, > + eoi_inject.work); > + spin_lock(&ioapic->lock); > + for (i = 0; i < IOAPIC_NUM_PINS; i++) { > + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; > + > + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) > + continue; > + > + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) > + ioapic_service(ioapic, i, false); > + } > + spin_unlock(&ioapic->lock); > +} > + > static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, > struct kvm_ioapic *ioapic, int vector, int trigger_mode) > { > @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, > > ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); > ent->fields.remote_irr = 0; > - if (ioapic->irr & (1 << i)) > - ioapic_service(ioapic, i, false); > + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { The mask check is new - why now? You don't check it in the work handler as well. > + ++ioapic->irq_eoi[i]; > + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { > + /* > + * Real hardware does not deliver the irq so > + * immediately during eoi broadcast, so we need > + * to emulate this behavior. Otherwise, for > + * guests who has not registered handler of a > + * level irq, this irq would be injected > + * immediately after guest enables interrupt > + * (which happens usually at the end of the > + * common interrupt routine). This would lead > + * guest can't move forward and may miss the > + * possibility to get proper irq handler > + * registered. So we need to give some breath to > + * guest. TODO: 1 is too long? TODO should be clarify. Maybe also translate the "1" into something HZ-derived. > + */ > + schedule_delayed_work(&ioapic->eoi_inject, 1); > + ioapic->irq_eoi[i] = 0; > + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); > + } > + else { I think it's the desired kernel style to do "} else {" here, but I'm not totally sure if that rule exists. > + ioapic_service(ioapic, i, false); > + } > + else { I strongly suspect this version of the patch wasn't built... ;) > + ioapic->irq_eoi[i] = 0; > + } > } > } > > @@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) > { > int i; > > + cancel_delayed_work_sync(&ioapic->eoi_inject); > for (i = 0; i < IOAPIC_NUM_PINS; i++) > ioapic->redirtbl[i].fields.mask = 1; > ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > ioapic->ioregsel = 0; > ioapic->irr = 0; > ioapic->id = 0; > + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); > rtc_irq_eoi_tracking_reset(ioapic); > update_handled_vectors(ioapic); > } > @@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm) > if (!ioapic) > return -ENOMEM; > spin_lock_init(&ioapic->lock); > + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); > kvm->arch.vioapic = ioapic; > kvm_ioapic_reset(ioapic); > kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); > @@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) > { > struct kvm_ioapic *ioapic = kvm->arch.vioapic; > > + cancel_delayed_work_sync(&ioapic->eoi_inject); > if (ioapic) { > kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); > kvm->arch.vioapic = NULL; > diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h > index 90d43e9..260ac0a 100644 > --- a/virt/kvm/ioapic.h > +++ b/virt/kvm/ioapic.h > @@ -34,6 +34,10 @@ struct kvm_vcpu; > #define IOAPIC_INIT 0x5 > #define IOAPIC_EXTINT 0x7 > > +/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached, > + which can be considered as interrupt storm happened */ > +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 > + > #ifdef CONFIG_X86 > #define RTC_GSI 8 > #else > @@ -59,6 +63,8 @@ struct kvm_ioapic { > spinlock_t lock; > DECLARE_BITMAP(handled_vectors, 256); > struct rtc_status rtc_status; > + struct delayed_work eoi_inject; > + u32 irq_eoi[IOAPIC_NUM_PINS]; > }; > > #ifdef DEBUG > The general approach looks sane to me. I was first concerned the delay could affect non-broken guests as well, but the loop counter prevents this effectively. Jan
>> Currently, we call ioapic_service() immediately when we find the irq is still >> active during eoi broadcast. But for real hardware, there's some dealy between >> the EOI writing and irq delivery (system bus latency?). So we need to emulate >> this behavior. Otherwise, for a guest who haven't register a proper irq handler >> , it would stay in the interrupt routine as this irq would be re-injected >> immediately after guest enables interrupt. This would lead guest can't move >> forward and may miss the possibility to get proper irq handler registered (one >> example is windows guest resuming from hibernation). >> >> As there's no way to differ the unhandled irq from new raised ones, this patch >> solve this problems by scheduling a delayed work when the count of irq injected >> during eoi broadcast exceeds a threshold value. After this patch, the guest can >> move a little forward when there's no suitable irq handler in case it may >> register one very soon and for guest who has a bad irq detection routine ( such >> as note_interrupt() in linux ), this bad irq would be recognized soon as in the >> past. >> >> Cc: Michael S. Tsirkin <mst@redhat.com> >> Signed-off-by: Jason Wang <jasowang@redhat.com> >> Signed-off-by: Zhang Haoyu <zhanghy@sangfor.com> >> --- >> include/trace/events/kvm.h | 20 ++++++++++++++++++ >> virt/kvm/ioapic.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- >> virt/kvm/ioapic.h | 6 ++++++ >> 3 files changed, 75 insertions(+), 2 deletions(-) >> >> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h >> index 908925a..b05f688 100644 >> --- a/include/trace/events/kvm.h >> +++ b/include/trace/events/kvm.h >> @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq, >> __entry->coalesced ? " (coalesced)" : "") >> ); >> >> +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, >> + TP_PROTO(__u64 e), >> + TP_ARGS(e), >> + >> + TP_STRUCT__entry( >> + __field( __u64, e ) >> + ), >> + >> + TP_fast_assign( >> + __entry->e = e; >> + ), >> + >> + TP_printk("dst %x vec=%u (%s|%s|%s%s)", >> + (u8)(__entry->e >> 56), (u8)__entry->e, >> + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), >> + (__entry->e & (1<<11)) ? "logical" : "physical", >> + (__entry->e & (1<<15)) ? "level" : "edge", >> + (__entry->e & (1<<16)) ? "|masked" : "") >> +); >> + >> TRACE_EVENT(kvm_msi_set_irq, >> TP_PROTO(__u64 address, __u64 data), >> TP_ARGS(address, data), >> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c >> index e8ce34c..a36c4c4 100644 >> --- a/virt/kvm/ioapic.c >> +++ b/virt/kvm/ioapic.c >> @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) >> spin_unlock(&ioapic->lock); >> } >> >> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) >> +{ >> + int i; >> + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, >> + eoi_inject.work); >> + spin_lock(&ioapic->lock); >> + for (i = 0; i < IOAPIC_NUM_PINS; i++) { >> + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; >> + >> + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) >> + continue; >> + >> + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) >> + ioapic_service(ioapic, i, false); >> + } >> + spin_unlock(&ioapic->lock); >> +} >> + >> static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, >> struct kvm_ioapic *ioapic, int vector, int trigger_mode) >> { >> @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, >> >> ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); >> ent->fields.remote_irr = 0; >> - if (ioapic->irr & (1 << i)) >> - ioapic_service(ioapic, i, false); >> + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { > >The mask check is new - why now? You don't check it in the work handler >as well. > The mask check is to avoid incrementing ioapic->irq_eoi[i] when this irq is masked, the count should be zeroed, but needless to check it in the work handler, the check will be performed in ioapic_service(). >> + ++ioapic->irq_eoi[i]; >> + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { >> + /* >> + * Real hardware does not deliver the irq so >> + * immediately during eoi broadcast, so we need >> + * to emulate this behavior. Otherwise, for >> + * guests who has not registered handler of a >> + * level irq, this irq would be injected >> + * immediately after guest enables interrupt >> + * (which happens usually at the end of the >> + * common interrupt routine). This would lead >> + * guest can't move forward and may miss the >> + * possibility to get proper irq handler >> + * registered. So we need to give some breath to >> + * guest. TODO: 1 is too long? > >TODO should be clarify. Maybe also translate the "1" into something >HZ-derived. > I will delete the TODO comment, now the risk of "1 is too long" can be ignored, because "if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT)" cannot be true except interrupt storm happened from the the angle of probability. >> + */ >> + schedule_delayed_work(&ioapic->eoi_inject, 1); >> + ioapic->irq_eoi[i] = 0; >> + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); >> + } >> + else { > >I think it's the desired kernel style to do "} else {" here, but I'm not >totally sure if that rule exists. > I read CodingStyle, you are right. >> + ioapic_service(ioapic, i, false); >> + } >> + else { > >I strongly suspect this version of the patch wasn't built... ;) > I backported it on my host linux-3.10, and built it okay. And this time I build it on upstream, failed, no matched "}" for if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT). So sorry for my careless. I will post a new patch. >> + ioapic->irq_eoi[i] = 0; >> + } >> } >> } >> >> @@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) >> { >> int i; >> >> + cancel_delayed_work_sync(&ioapic->eoi_inject); >> for (i = 0; i < IOAPIC_NUM_PINS; i++) >> ioapic->redirtbl[i].fields.mask = 1; >> ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; >> ioapic->ioregsel = 0; >> ioapic->irr = 0; >> ioapic->id = 0; >> + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); >> rtc_irq_eoi_tracking_reset(ioapic); >> update_handled_vectors(ioapic); >> } >> @@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm) >> if (!ioapic) >> return -ENOMEM; >> spin_lock_init(&ioapic->lock); >> + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); >> kvm->arch.vioapic = ioapic; >> kvm_ioapic_reset(ioapic); >> kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); >> @@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) >> { >> struct kvm_ioapic *ioapic = kvm->arch.vioapic; >> >> + cancel_delayed_work_sync(&ioapic->eoi_inject); >> if (ioapic) { >> kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); >> kvm->arch.vioapic = NULL; >> diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h >> index 90d43e9..260ac0a 100644 >> --- a/virt/kvm/ioapic.h >> +++ b/virt/kvm/ioapic.h >> @@ -34,6 +34,10 @@ struct kvm_vcpu; >> #define IOAPIC_INIT 0x5 >> #define IOAPIC_EXTINT 0x7 >> >> +/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached, >> + which can be considered as interrupt storm happened */ >> +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 >> + >> #ifdef CONFIG_X86 >> #define RTC_GSI 8 >> #else >> @@ -59,6 +63,8 @@ struct kvm_ioapic { >> spinlock_t lock; >> DECLARE_BITMAP(handled_vectors, 256); >> struct rtc_status rtc_status; >> + struct delayed_work eoi_inject; >> + u32 irq_eoi[IOAPIC_NUM_PINS]; >> }; >> >> #ifdef DEBUG >> > >The general approach looks sane to me. I was first concerned the delay >could affect non-broken guests as well, but the loop counter prevents >this effectively. > >Jan -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 908925a..b05f688 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, + TP_PROTO(__u64 e), + TP_ARGS(e), + + TP_STRUCT__entry( + __field( __u64, e ) + ), + + TP_fast_assign( + __entry->e = e; + ), + + TP_printk("dst %x vec=%u (%s|%s|%s%s)", + (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "") +); + TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), TP_ARGS(address, data), diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index e8ce34c..a36c4c4 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) spin_unlock(&ioapic->lock); } +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) +{ + int i; + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, + eoi_inject.work); + spin_lock(&ioapic->lock); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) + continue; + + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) + ioapic_service(ioapic, i, false); + } + spin_unlock(&ioapic->lock); +} + static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (ioapic->irr & (1 << i)) - ioapic_service(ioapic, i, false); + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { + ++ioapic->irq_eoi[i]; + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the irq so + * immediately during eoi broadcast, so we need + * to emulate this behavior. Otherwise, for + * guests who has not registered handler of a + * level irq, this irq would be injected + * immediately after guest enables interrupt + * (which happens usually at the end of the + * common interrupt routine). This would lead + * guest can't move forward and may miss the + * possibility to get proper irq handler + * registered. So we need to give some breath to + * guest. TODO: 1 is too long? + */ + schedule_delayed_work(&ioapic->eoi_inject, 1); + ioapic->irq_eoi[i] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); + } + else { + ioapic_service(ioapic, i, false); + } + else { + ioapic->irq_eoi[i] = 0; + } } } @@ -565,12 +608,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; + cancel_delayed_work_sync(&ioapic->eoi_inject); for (i = 0; i < IOAPIC_NUM_PINS; i++) ioapic->redirtbl[i].fields.mask = 1; ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); update_handled_vectors(ioapic); } @@ -589,6 +634,7 @@ int kvm_ioapic_init(struct kvm *kvm) if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); @@ -609,6 +655,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + cancel_delayed_work_sync(&ioapic->eoi_inject); if (ioapic) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 90d43e9..260ac0a 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -34,6 +34,10 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 +/* max successive count of the same irq during ioapic eoi broadcast, if this limitation reached, + which can be considered as interrupt storm happened */ +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 + #ifdef CONFIG_X86 #define RTC_GSI 8 #else @@ -59,6 +63,8 @@ struct kvm_ioapic { spinlock_t lock; DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; + struct delayed_work eoi_inject; + u32 irq_eoi[IOAPIC_NUM_PINS]; }; #ifdef DEBUG