@@ -478,6 +478,8 @@ void pit_init(struct domain *d, unsigned long cpu_khz)
if ( !has_vpit(d) )
return;
+ spin_lock_init(&pit->pt0.lock);
+
spin_lock_init(&pit->lock);
if ( is_hvm_domain(d) )
@@ -734,6 +734,7 @@ static void hpet_set(HPETState *h)
h->hpet.timers[i].cmp = ~0ULL;
h->hpet.comparator64[i] = ~0ULL;
h->pt[i].source = PTSRC_isa;
+ spin_lock_init(&h->pt[i].lock);
}
}
@@ -658,8 +658,6 @@ int hvm_domain_initialise(struct domain *d)
/* need link to containing domain */
d->arch.hvm.pl_time->domain = d;
- rwlock_init(&d->arch.hvm.pl_time->pt_migrate);
-
/* Set the default IO Bitmap. */
if ( is_hardware_domain(d) )
{
@@ -842,6 +842,7 @@ void rtc_init(struct domain *d)
}
spin_lock_init(&s->lock);
+ spin_lock_init(&s->pt.lock);
init_timer(&s->update_timer, rtc_update_timer, s, smp_processor_id());
init_timer(&s->update_timer2, rtc_update_timer2, s, smp_processor_id());
@@ -1626,6 +1626,7 @@ int vlapic_init(struct vcpu *v)
vlapic_reset(vlapic);
spin_lock_init(&vlapic->esr_lock);
+ spin_lock_init(&vlapic->pt.lock);
tasklet_init(&vlapic->init_sipi.tasklet, vlapic_init_sipi_action, v);
@@ -153,32 +153,16 @@ static int pt_irq_masked(struct periodic_time *pt)
return 1;
}
-static void pt_vcpu_lock(struct vcpu *v)
-{
- read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
- spin_lock(&v->arch.hvm.tm_lock);
-}
-
-static void pt_vcpu_unlock(struct vcpu *v)
-{
- spin_unlock(&v->arch.hvm.tm_lock);
- read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
-}
-
static void pt_lock(struct periodic_time *pt)
{
- /*
- * We cannot use pt_vcpu_lock here, because we need to acquire the
- * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
- * else we might be using a stale value of pt->vcpu.
- */
- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+ spin_lock(&pt->lock);
spin_lock(&pt->vcpu->arch.hvm.tm_lock);
}
static void pt_unlock(struct periodic_time *pt)
{
- pt_vcpu_unlock(pt->vcpu);
+ spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+ spin_unlock(&pt->lock);
}
static void pt_process_missed_ticks(struct periodic_time *pt)
@@ -228,7 +212,7 @@ void pt_save_timer(struct vcpu *v)
if ( v->pause_flags & VPF_blocked )
return;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
list_for_each_entry ( pt, head, list )
if ( !pt->do_not_freeze )
@@ -236,7 +220,7 @@ void pt_save_timer(struct vcpu *v)
pt_freeze_time(v);
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
}
void pt_restore_timer(struct vcpu *v)
@@ -244,7 +228,7 @@ void pt_restore_timer(struct vcpu *v)
struct list_head *head = &v->arch.hvm.tm_list;
struct periodic_time *pt;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
list_for_each_entry ( pt, head, list )
{
@@ -257,7 +241,7 @@ void pt_restore_timer(struct vcpu *v)
pt_thaw_time(v);
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
}
static void pt_timer_fn(void *data)
@@ -318,7 +302,7 @@ int pt_update_irq(struct vcpu *v)
int irq, pt_vector = -1;
bool level;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
earliest_pt = NULL;
max_lag = -1ULL;
@@ -348,7 +332,7 @@ int pt_update_irq(struct vcpu *v)
if ( earliest_pt == NULL )
{
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
return -1;
}
@@ -356,7 +340,7 @@ int pt_update_irq(struct vcpu *v)
irq = earliest_pt->irq;
level = earliest_pt->level;
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
switch ( earliest_pt->source )
{
@@ -403,7 +387,7 @@ int pt_update_irq(struct vcpu *v)
time_cb *cb = NULL;
void *cb_priv = NULL;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
/* Make sure the timer is still on the list. */
list_for_each_entry ( pt, &v->arch.hvm.tm_list, list )
if ( pt == earliest_pt )
@@ -413,7 +397,7 @@ int pt_update_irq(struct vcpu *v)
cb_priv = pt->priv;
break;
}
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
if ( cb != NULL )
cb(v, cb_priv);
@@ -450,12 +434,12 @@ void pt_intr_post(struct vcpu *v, struct hvm_intack intack)
if ( intack.source == hvm_intsrc_vector )
return;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
pt = is_pt_irq(v, intack);
if ( pt == NULL )
{
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
return;
}
@@ -464,7 +448,7 @@ void pt_intr_post(struct vcpu *v, struct hvm_intack intack)
cb = pt->cb;
cb_priv = pt->priv;
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
if ( cb != NULL )
cb(v, cb_priv);
@@ -475,12 +459,34 @@ void pt_migrate(struct vcpu *v)
struct list_head *head = &v->arch.hvm.tm_list;
struct periodic_time *pt;
- pt_vcpu_lock(v);
+ spin_lock(&v->arch.hvm.tm_lock);
list_for_each_entry ( pt, head, list )
migrate_timer(&pt->timer, v->processor);
- pt_vcpu_unlock(v);
+ spin_unlock(&v->arch.hvm.tm_lock);
+}
+
+/*
+ * Take @pt off its vCPU's timer list, discard its pending interrupt count
+ * and stop the underlying timer.
+ *
+ * @locked: true when the caller already holds the locks that pt_lock() would
+ *          take (as create_periodic_time() does); false to have them taken
+ *          and released here via pt_lock()/pt_unlock().
+ */
+static void __destroy_periodic_time(struct periodic_time *pt, bool locked)
+{
+    /* Was this structure previously initialised by create_periodic_time()? */
+    if ( pt->vcpu == NULL )
+        return;
+
+    if ( !locked )
+        pt_lock(pt);
+    if ( pt->on_list )
+        list_del(&pt->list);
+    pt->on_list = 0;
+    pt->pending_intr_nr = 0;
+    if ( !locked )
+        pt_unlock(pt);
+
+    /*
+     * pt_timer_fn() can run until this kill_timer() returns. We must do this
+     * outside pt_lock() otherwise we can deadlock with pt_timer_fn().
+     *
+     * NOTE(review): with locked == true the caller still holds the locks
+     * that pt_lock() takes, so this call does not meet the requirement
+     * above. The kill_timer() for that path presumably has to move to the
+     * caller, after it has dropped those locks -- confirm against
+     * pt_timer_fn().
+     */
+    kill_timer(&pt->timer);
+}
void create_periodic_time(
@@ -497,9 +503,16 @@ void create_periodic_time(
return;
}
- destroy_periodic_time(pt);
+ spin_lock(&pt->lock);
- write_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
+ if ( pt->vcpu )
+ {
+ spin_lock(&pt->vcpu->arch.hvm.tm_lock);
+
+ __destroy_periodic_time(pt, true);
+
+ spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+ }
pt->pending_intr_nr = 0;
pt->do_not_freeze = 0;
@@ -543,33 +556,22 @@ void create_periodic_time(
pt->cb = cb;
pt->priv = data;
+ spin_lock(&pt->vcpu->arch.hvm.tm_lock);
+
pt->on_list = 1;
list_add(&pt->list, &v->arch.hvm.tm_list);
+ spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+
init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
set_timer(&pt->timer, pt->scheduled);
- write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
+ spin_unlock(&pt->lock);
}
void destroy_periodic_time(struct periodic_time *pt)
{
- /* Was this structure previously initialised by create_periodic_time()? */
- if ( pt->vcpu == NULL )
- return;
-
- pt_lock(pt);
- if ( pt->on_list )
- list_del(&pt->list);
- pt->on_list = 0;
- pt->pending_intr_nr = 0;
- pt_unlock(pt);
-
- /*
- * pt_timer_fn() can run until this kill_timer() returns. We must do this
- * outside pt_lock() otherwise we can deadlock with pt_timer_fn().
- */
- kill_timer(&pt->timer);
+ __destroy_periodic_time(pt, false);
}
static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
@@ -579,15 +581,25 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
if ( pt->vcpu == NULL )
return;
- write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+ spin_lock(&pt->lock);
+ spin_lock(&pt->vcpu->arch.hvm.tm_lock);
+
+ if ( pt->on_list )
+ list_del(&pt->list);
+
+ spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+
pt->vcpu = v;
+
+ spin_lock(&pt->vcpu->arch.hvm.tm_lock);
if ( pt->on_list )
{
- list_del(&pt->list);
list_add(&pt->list, &v->arch.hvm.tm_list);
migrate_timer(&pt->timer, v->processor);
}
- write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+
+ spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+ spin_unlock(&pt->lock);
}
void pt_adjust_global_vcpu_target(struct vcpu *v)
@@ -49,6 +49,8 @@ struct periodic_time {
u64 last_plt_gtime; /* platform time when last IRQ is injected */
struct timer timer; /* ac_timer */
time_cb *cb;
+ spinlock_t lock; /* protects vcpu field during PT migration. */
+ /* Needs to be taken before VCPU's tm_lock. */
void *priv; /* point back to platform time source */
};
@@ -128,13 +130,6 @@ struct pl_time { /* platform time */
struct RTCState vrtc;
struct HPETState vhpet;
struct PMTState vpmt;
- /*
- * rwlock to prevent periodic_time vCPU migration. Take the lock in read
- * mode in order to prevent the vcpu field of periodic_time from changing.
- * Lock must be taken in write mode when changes to the vcpu field are
- * performed, as it allows exclusive access to all the timers of a domain.
- */
- rwlock_t pt_migrate;
/* guest_time = Xen sys time + stime_offset */
int64_t stime_offset;
/* Ensures monotonicity in appropriate timer modes. */
Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between vCPUs")
addressed XSA-336 by introducing a per-domain rwlock that was intended to
protect periodic timers during VCPU migration. Since such migration is an
infrequent event, no performance impact was expected.

Unfortunately this turned out not to be the case: on a fairly large guest
(92 VCPUs) we've observed as much as 40% TPCC performance regression with
some guest kernels. Further investigation pointed to the pt_migrate read lock
taken in pt_update_irq() as the largest contributor to this regression. With
a large number of VCPUs and a large number of VMEXITs (from where
pt_update_irq() is always called) the update of an atomic in read_lock() is
thought to be the main cause.

Stephen Brennan examined the locking pattern and suggested using a per-timer
lock instead. This lock will need to be held whenever there is a chance that
the pt->vcpu field may change (thus avoiding the XSA-336 condition).

Suggested-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 xen/arch/x86/emul-i8254.c | 2 +
 xen/arch/x86/hvm/hpet.c | 1 +
 xen/arch/x86/hvm/hvm.c | 2 -
 xen/arch/x86/hvm/rtc.c | 1 +
 xen/arch/x86/hvm/vlapic.c | 1 +
 xen/arch/x86/hvm/vpt.c | 122 +++++++++++++++++++++++-------------------
 xen/include/asm-x86/hvm/vpt.h | 9 +---
 7 files changed, 74 insertions(+), 64 deletions(-)