--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1055,7 +1055,7 @@ int xenmem_add_to_physmap_one(
switch ( space )
{
case XENMAPSPACE_grant_table:
- write_lock(&d->grant_table->lock);
+ grant_percpu_write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -1085,7 +1085,7 @@ int xenmem_add_to_physmap_one(
t = p2m_ram_rw;
- write_unlock(&d->grant_table->lock);
+ grant_percpu_write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_shared_info:
if ( idx != 0 )
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4654,7 +4654,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->shared_info);
break;
case XENMAPSPACE_grant_table:
- write_lock(&d->grant_table->lock);
+ grant_percpu_write_lock(&d->grant_table->lock);
if ( d->grant_table->gt_version == 0 )
d->grant_table->gt_version = 1;
@@ -4676,7 +4676,7 @@ int xenmem_add_to_physmap_one(
mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
}
- write_unlock(&d->grant_table->lock);
+ grant_percpu_write_unlock(&d->grant_table->lock);
break;
case XENMAPSPACE_gmfn_range:
case XENMAPSPACE_gmfn:
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -178,6 +178,8 @@ struct active_grant_entry {
#define _active_entry(t, e) \
((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
+DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
+
static inline void gnttab_flush_tlb(const struct domain *d)
{
if ( !paging_mode_external(d) )
@@ -208,7 +210,13 @@ active_entry_acquire(struct grant_table *t, grant_ref_t e)
{
struct active_grant_entry *act;
- ASSERT(rw_is_locked(&t->lock));
+ /*
+ * The grant table for the active entry should be locked, but the
+ * percpu rwlock cannot be checked for a read lock without race
+ * conditions or high overhead, so we cannot use an ASSERT:
+ *
+ * ASSERT(rw_is_locked(&t->lock));
+ */
act = &_active_entry(t, e);
spin_lock(&act->lock);
@@ -270,23 +278,23 @@ double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
*/
if ( lgt < rgt )
{
- write_lock(&lgt->lock);
- write_lock(&rgt->lock);
+ grant_percpu_write_lock(&lgt->lock);
+ grant_percpu_write_lock(&rgt->lock);
}
else
{
if ( lgt != rgt )
- write_lock(&rgt->lock);
- write_lock(&lgt->lock);
+ grant_percpu_write_lock(&rgt->lock);
+ grant_percpu_write_lock(&lgt->lock);
}
}
static inline void
double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
{
- write_unlock(&lgt->lock);
+ grant_percpu_write_unlock(&lgt->lock);
if ( lgt != rgt )
- write_unlock(&rgt->lock);
+ grant_percpu_write_unlock(&rgt->lock);
}
static inline int
@@ -659,8 +667,14 @@ static int grant_map_exists(const struct domain *ld,
unsigned int *ref_count)
{
unsigned int ref, max_iter;
-
- ASSERT(rw_is_locked(&rgt->lock));
+
+ /*
+ * The remote grant table should be locked, but the percpu rwlock
+ * cannot be checked for a read lock without race conditions or high
+ * overhead, so we cannot use an ASSERT:
+ *
+ * ASSERT(rw_is_locked(&rgt->lock));
+ */
max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
nr_grant_entries(rgt));
@@ -703,12 +717,12 @@ static unsigned int mapkind(
* Must have the local domain's grant table write lock when
* iterating over its maptrack entries.
*/
- ASSERT(rw_is_write_locked(&lgt->lock));
+ ASSERT(percpu_rw_is_write_locked(&lgt->lock));
/*
* Must have the remote domain's grant table write lock while
* counting its active entries.
*/
- ASSERT(rw_is_write_locked(&rd->grant_table->lock));
+ ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock));
for ( handle = 0; !(kind & MAPKIND_WRITE) &&
handle < lgt->maptrack_limit; handle++ )
@@ -796,7 +810,7 @@ __gnttab_map_grant_ref(
}
rgt = rd->grant_table;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
/* Bounds check on the grant ref */
if ( unlikely(op->ref >= nr_grant_entries(rgt)))
@@ -859,7 +873,7 @@ __gnttab_map_grant_ref(
cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
/* pg may be set, with a refcount included, from __get_paged_frame */
if ( !pg )
@@ -1006,7 +1020,7 @@ __gnttab_map_grant_ref(
put_page(pg);
}
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
act = active_entry_acquire(rgt, op->ref);
@@ -1029,7 +1043,7 @@ __gnttab_map_grant_ref(
active_entry_release(act);
unlock_out:
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
op->status = rc;
put_maptrack_handle(lgt, handle);
rcu_unlock_domain(rd);
@@ -1080,18 +1094,18 @@ __gnttab_unmap_common(
op->map = &maptrack_entry(lgt, op->handle);
- read_lock(&lgt->lock);
+ grant_percpu_read_lock(&lgt->lock);
if ( unlikely(!read_atomic(&op->map->flags)) )
{
- read_unlock(&lgt->lock);
+ grant_percpu_read_unlock(&lgt->lock);
gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
op->status = GNTST_bad_handle;
return;
}
dom = op->map->domid;
- read_unlock(&lgt->lock);
+ grant_percpu_read_unlock(&lgt->lock);
if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
{
@@ -1113,7 +1127,7 @@ __gnttab_unmap_common(
rgt = rd->grant_table;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
op->flags = read_atomic(&op->map->flags);
if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
@@ -1165,7 +1179,7 @@ __gnttab_unmap_common(
act_release_out:
active_entry_release(act);
unmap_out:
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
{
@@ -1220,7 +1234,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
rcu_lock_domain(rd);
rgt = rd->grant_table;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
if ( rgt->gt_version == 0 )
goto unlock_out;
@@ -1286,7 +1300,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
act_release_out:
active_entry_release(act);
unlock_out:
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
if ( put_handle )
{
@@ -1585,7 +1599,7 @@ gnttab_setup_table(
}
gt = d->grant_table;
- write_lock(&gt->lock);
+ grant_percpu_write_lock(&gt->lock);
if ( gt->gt_version == 0 )
gt->gt_version = 1;
@@ -1613,7 +1627,7 @@ gnttab_setup_table(
}
out3:
- write_unlock(&gt->lock);
+ grant_percpu_write_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -1655,13 +1669,13 @@ gnttab_query_size(
goto query_out_unlock;
}
- read_lock(&d->grant_table->lock);
+ grant_percpu_read_lock(&d->grant_table->lock);
op.nr_frames = nr_grant_frames(d->grant_table);
op.max_nr_frames = max_grant_frames;
op.status = GNTST_okay;
- read_unlock(&d->grant_table->lock);
+ grant_percpu_read_unlock(&d->grant_table->lock);
query_out_unlock:
@@ -1687,7 +1701,7 @@ gnttab_prepare_for_transfer(
union grant_combo scombo, prev_scombo, new_scombo;
int retries = 0;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
if ( unlikely(ref >= nr_grant_entries(rgt)) )
{
@@ -1730,11 +1744,11 @@ gnttab_prepare_for_transfer(
scombo = prev_scombo;
}
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
return 1;
fail:
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
return 0;
}
@@ -1925,7 +1939,7 @@ gnttab_transfer(
TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
/* Tell the guest about its new page frame. */
- read_lock(&e->grant_table->lock);
+ grant_percpu_read_lock(&e->grant_table->lock);
act = active_entry_acquire(e->grant_table, gop.ref);
if ( e->grant_table->gt_version == 1 )
@@ -1949,7 +1963,7 @@ gnttab_transfer(
GTF_transfer_completed;
active_entry_release(act);
- read_unlock(&e->grant_table->lock);
+ grant_percpu_read_unlock(&e->grant_table->lock);
rcu_unlock_domain(e);
@@ -1987,7 +2001,7 @@ __release_grant_for_copy(
released_read = 0;
released_write = 0;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
act = active_entry_acquire(rgt, gref);
sha = shared_entry_header(rgt, gref);
@@ -2029,7 +2043,7 @@ __release_grant_for_copy(
}
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
if ( td != rd )
{
@@ -2086,7 +2100,7 @@ __acquire_grant_for_copy(
*page = NULL;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
if ( unlikely(gref >= nr_grant_entries(rgt)) )
PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
@@ -2168,20 +2182,20 @@ __acquire_grant_for_copy(
* here and reacquire
*/
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
readonly, &grant_frame, page,
&trans_page_off, &trans_length, 0);
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
act = active_entry_acquire(rgt, gref);
if ( rc != GNTST_okay ) {
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
return rc;
}
@@ -2194,7 +2208,7 @@ __acquire_grant_for_copy(
__fixup_status_for_copy_pin(act, status);
rcu_unlock_domain(td);
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
put_page(*page);
return __acquire_grant_for_copy(rd, gref, ldom, readonly,
frame, page, page_off, length,
@@ -2258,7 +2272,7 @@ __acquire_grant_for_copy(
*frame = act->frame;
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
return rc;
unlock_out_clear:
@@ -2273,7 +2287,7 @@ __acquire_grant_for_copy(
active_entry_release(act);
gt_unlock_out:
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
return rc;
}
@@ -2589,7 +2603,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
if ( gt->gt_version == op.version )
goto out;
- write_lock(&gt->lock);
+ grant_percpu_write_lock(&gt->lock);
/*
* Make sure that the grant table isn't currently in use when we
* change the version number, except for the first 8 entries which
@@ -2702,7 +2716,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
gt->gt_version = op.version;
out_unlock:
- write_unlock(&gt->lock);
+ grant_percpu_write_unlock(&gt->lock);
out:
op.version = gt->gt_version;
@@ -2758,7 +2772,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
op.status = GNTST_okay;
- read_lock(&gt->lock);
+ grant_percpu_read_lock(&gt->lock);
for ( i = 0; i < op.nr_frames; i++ )
{
@@ -2767,7 +2781,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
op.status = GNTST_bad_virt_addr;
}
- read_unlock(&gt->lock);
+ grant_percpu_read_unlock(&gt->lock);
out2:
rcu_unlock_domain(d);
out1:
@@ -2817,7 +2831,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
struct active_grant_entry *act_b = NULL;
s16 rc = GNTST_okay;
- write_lock(&gt->lock);
+ grant_percpu_write_lock(&gt->lock);
/* Bounds check on the grant refs */
if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
@@ -2865,7 +2879,7 @@ out:
active_entry_release(act_b);
if ( act_a != NULL )
active_entry_release(act_a);
- write_unlock(&gt->lock);
+ grant_percpu_write_unlock(&gt->lock);
rcu_unlock_domain(d);
@@ -2936,12 +2950,12 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
if ( d != owner )
{
- read_lock(&owner->grant_table->lock);
+ grant_percpu_read_lock(&owner->grant_table->lock);
ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
if ( ret != 0 )
{
- read_unlock(&owner->grant_table->lock);
+ grant_percpu_read_unlock(&owner->grant_table->lock);
rcu_unlock_domain(d);
put_page(page);
return ret;
@@ -2961,7 +2975,7 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
ret = 0;
if ( d != owner )
- read_unlock(&owner->grant_table->lock);
+ grant_percpu_read_unlock(&owner->grant_table->lock);
unmap_domain_page(v);
put_page(page);
@@ -3180,7 +3194,7 @@ grant_table_create(
goto no_mem_0;
/* Simple stuff. */
- rwlock_init(&t->lock);
+ percpu_rwlock_resource_init(&t->lock, grant_rwlock);
spin_lock_init(&t->maptrack_lock);
t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
@@ -3282,7 +3296,7 @@ gnttab_release_mappings(
}
rgt = rd->grant_table;
- read_lock(&rgt->lock);
+ grant_percpu_read_lock(&rgt->lock);
act = active_entry_acquire(rgt, ref);
sha = shared_entry_header(rgt, ref);
@@ -3343,7 +3357,7 @@ gnttab_release_mappings(
gnttab_clear_flag(_GTF_reading, status);
active_entry_release(act);
- read_unlock(&rgt->lock);
+ grant_percpu_read_unlock(&rgt->lock);
rcu_unlock_domain(rd);
@@ -3360,7 +3374,7 @@ void grant_table_warn_active_grants(struct domain *d)
#define WARN_GRANT_MAX 10
- read_lock(&gt->lock);
+ grant_percpu_read_lock(&gt->lock);
for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
{
@@ -3382,7 +3396,7 @@ void grant_table_warn_active_grants(struct domain *d)
printk(XENLOG_G_DEBUG "Dom%d has too many (%d) active grants to report\n",
d->domain_id, nr_active);
- read_unlock(&gt->lock);
+ grant_percpu_read_unlock(&gt->lock);
#undef WARN_GRANT_MAX
}
@@ -3432,7 +3446,7 @@ static void gnttab_usage_print(struct domain *rd)
printk(" -------- active -------- -------- shared --------\n");
printk("[ref] localdom mfn pin localdom gmfn flags\n");
- read_lock(&gt->lock);
+ grant_percpu_read_lock(&gt->lock);
for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
{
@@ -3475,7 +3489,7 @@ static void gnttab_usage_print(struct domain *rd)
active_entry_release(act);
}
- read_unlock(&gt->lock);
+ grant_percpu_read_unlock(&gt->lock);
if ( first )
printk("grant-table for remote domain:%5d ... "
@@ -51,13 +51,35 @@
/* The maximum size of a grant table. */
extern unsigned int max_grant_frames;
+DECLARE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
+
+static inline void grant_percpu_read_lock(percpu_rwlock_t *lock)
+{
+ percpu_read_lock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_read_unlock(percpu_rwlock_t *lock)
+{
+ percpu_read_unlock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_write_lock(percpu_rwlock_t *lock)
+{
+ percpu_write_lock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_write_unlock(percpu_rwlock_t *lock)
+{
+ percpu_write_unlock(grant_rwlock, lock);
+}
+
/* Per-domain grant information. */
struct grant_table {
/*
* Lock protecting updates to grant table state (version, active
* entry list, etc.)
*/
- rwlock_t lock;
+ percpu_rwlock_t lock;
/* Table size. Number of frames shared with guest */
unsigned int nr_grant_frames;
/* Shared grant table (see include/public/grant_table.h). */
The per domain grant table read lock suffers from significant contention
when performing multi-queue block or network IO, due to the parallel
grant map/unmap/copy operations occurring on the DomU's grant table.

On multi-socket systems, the contention causes the locked compare-and-swap
operation used to take the read lock to fail frequently, which results in
a tight loop of retries of the compare-and-swap operation. As the
coherency fabric can only support a specific rate of compare-and-swap
operations for a particular data location, taking the read lock itself
becomes a bottleneck for grant operations.

Standard rwlock performance of a single VIF VM-VM transfer with 16 queues
configured was limited to approximately 15 Gbit/s on a 2-socket Haswell-EP
host. Percpu rwlock performance with the same configuration is
approximately 48 Gbit/s.

Oprofile was used to determine the initial overhead of the read-write
locks and to confirm that the overhead was dramatically reduced by the
percpu rwlocks.

Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
--
Changes since v2:
- Switched to using wrappers for taking percpu rwlock
- Added percpu structure pointer to percpu rwlock initialisation
- Added comment on removal of ASSERTs for grant table rw_is_locked()

Changes since v1:
- Used new macros provided in updated percpu rwlock v2 patch
- Converted grant table rwlock_t to percpu_rwlock_t
- Patched a missed grant table rwlock_t usage site
---
 xen/arch/arm/mm.c             |   4 +-
 xen/arch/x86/mm.c             |   4 +-
 xen/common/grant_table.c      | 126 +++++++++++++++++++++++-------------
 xen/include/xen/grant_table.h |  24 +++++++-
 4 files changed, 97 insertions(+), 61 deletions(-)
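
For readers unfamiliar with the technique, the sketch below models why a
percpu rwlock removes the bottleneck described above. It is a simplified,
self-contained model using C11 atomics, not Xen's percpu_rwlock
implementation; NR_CPUS, the model_* names, and the flat per-CPU reader
array are assumptions invented for this sketch (Xen's version instead
tracks a per-CPU owner pointer and falls back to an embedded conventional
rwlock while a writer is activating). The key property is that the read
path performs atomic operations only on the current CPU's own slot, so
readers on different sockets no longer compete to compare-and-swap the
same cache line:

/*
 * Simplified model of a percpu rwlock -- illustration only, not Xen's
 * implementation.  All names here are made up for this sketch.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 64

typedef struct {
    /* Set while a writer wants or holds the lock. */
    atomic_bool writer_active;
    /* One reader count per CPU; padded to a cache line in practice. */
    atomic_int readers[NR_CPUS];
} model_percpu_rwlock_t;

static void model_read_lock(model_percpu_rwlock_t *l, unsigned int cpu)
{
    for ( ;; )
    {
        /* Publish this reader on its own cache line: no shared cmpxchg. */
        atomic_fetch_add(&l->readers[cpu], 1);
        if ( !atomic_load(&l->writer_active) )
            return;                 /* fast path: no writer, lock taken */
        /* A writer is active: back off and wait for it to finish. */
        atomic_fetch_sub(&l->readers[cpu], 1);
        while ( atomic_load(&l->writer_active) )
            ;                       /* spin; cpu_relax() in real code */
    }
}

static void model_read_unlock(model_percpu_rwlock_t *l, unsigned int cpu)
{
    atomic_fetch_sub(&l->readers[cpu], 1);
}

static void model_write_lock(model_percpu_rwlock_t *l)
{
    bool expected = false;

    /* Serialise writers against each other. */
    while ( !atomic_compare_exchange_weak(&l->writer_active,
                                          &expected, true) )
        expected = false;
    /* Wait for the readers on every CPU to drain. */
    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
        while ( atomic_load(&l->readers[cpu]) )
            ;                       /* spin; cpu_relax() in real code */
}

static void model_write_unlock(model_percpu_rwlock_t *l)
{
    atomic_store(&l->writer_active, false);
}

With sequentially consistent atomics (the C11 default used here), a reader
either sees writer_active set and backs off, or the writer sees the
reader's increment and waits, so the two cannot miss each other. The cost
moves to the write side, which must scan and drain every CPU's reader
slot; that suits the grant table workload, where map/unmap/copy
operations take the lock for reading at high frequency while the write
lock is needed only for rarer operations such as version changes, table
setup, and the double-lock paths above.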