--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -31,6 +31,8 @@
 DECLARE_PER_CPU(int, mm_lock_level);
 #define __get_lock_level() (this_cpu(mm_lock_level))
 
+DECLARE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
+
 static inline void mm_lock_init(mm_lock_t *l)
 {
     spin_lock_init(&l->lock);
@@ -99,7 +101,7 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
 
 static inline void mm_rwlock_init(mm_rwlock_t *l)
 {
-    rwlock_init(&l->lock);
+    percpu_rwlock_resource_init(&l->lock, p2m_percpu_rwlock);
     l->locker = -1;
     l->locker_function = "nobody";
     l->unlock_level = 0;
@@ -115,7 +117,7 @@ static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level)
     if ( !mm_write_locked_by_me(l) )
     {
         __check_lock_level(level);
-        write_lock(&l->lock);
+        percpu_write_lock(p2m_percpu_rwlock, &l->lock);
         l->locker = get_processor_id();
         l->locker_function = func;
         l->unlock_level = __get_lock_level();
@@ -131,20 +133,20 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
     l->locker = -1;
     l->locker_function = "nobody";
     __set_lock_level(l->unlock_level);
-    write_unlock(&l->lock);
+    percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
 }
 
 static inline void _mm_read_lock(mm_rwlock_t *l, int level)
 {
     __check_lock_level(level);
-    read_lock(&l->lock);
+    percpu_read_lock(p2m_percpu_rwlock, &l->lock);
     /* There's nowhere to store the per-CPU unlock level so we can't
      * set the lock level. */
 }
 
 static inline void mm_read_unlock(mm_rwlock_t *l)
 {
-    read_unlock(&l->lock);
+    percpu_read_unlock(p2m_percpu_rwlock, &l->lock);
 }
 
 /* This wrapper uses the line number to express the locking order below */
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -54,6 +54,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
 #undef page_to_mfn
 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
+DEFINE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
 
 /* Init the datastructures for later use by the p2m code */
 static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -566,7 +566,7 @@ typedef struct mm_lock {
 } mm_lock_t;
 
 typedef struct mm_rwlock {
-    rwlock_t lock;
+    percpu_rwlock_t lock;
     int unlock_level;
     int recurse_count;
     int locker; /* CPU that holds the write lock */
The per-domain p2m read lock suffers from significant contention when
performing multi-queue block or network IO, due to the parallel grant
map/unmap/copy operations occurring on the DomU's p2m.

On multi-socket systems, this contention causes the locked
compare-and-swap operation behind the read lock to fail frequently,
resulting in a tight loop of retries. As the coherency fabric can only
sustain a limited rate of compare-and-swap operations on a particular
data location, taking the read lock itself becomes a bottleneck for
p2m operations.

With the same configuration, percpu rwlock p2m performance is
approximately 64 Gbit/s, versus 48 Gbit/s with grant table percpu
rwlocks only.

Oprofile was used to determine the initial overhead of the read-write
locks and to confirm that the percpu rwlocks reduced it dramatically.

Note: altp2m users will not see a gain if they take an altp2m read
lock simultaneously with the main p2m lock.

Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
--
Changes since v4:
- None

Changes since v3:
- None

Changes since v2:
- Updated local percpu rwlock initialisation
---
 xen/arch/x86/mm/mm-locks.h | 12 +++++++-----
 xen/arch/x86/mm/p2m.c      |  1 +
 xen/include/asm-x86/mm.h   |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)
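For readers unfamiliar with the pattern, a minimal sketch of the idea
follows, written with plain C11 atomics and POSIX sched_yield() rather
than Xen's primitives. All names here (percpu_rwlock_model, the
model_* functions, NR_CPUS = 64, the slot layout) are invented for
illustration, and the sketch omits the barrier, preemption, and
fairness handling of Xen's real percpu_rwlock. It only demonstrates
why the read fast path stops bouncing a shared cache line: each reader
increments a counter in a cache line owned by its own CPU, and only
writers have to touch every line.

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

#define NR_CPUS 64   /* hypothetical CPU count for the example */

/*
 * Simplified model of a percpu rwlock.  Readers publish their
 * presence in a CPU-local, cache-line-aligned counter, so the
 * read-lock fast path increments a line owned by this CPU instead
 * of doing a compare-and-swap on one word shared by all CPUs.
 * A writer raises a global flag and waits for the counters to drain.
 */
struct percpu_slot {
    _Alignas(64) atomic_int readers;  /* readers active on this CPU */
};

struct percpu_rwlock_model {
    struct percpu_slot slot[NR_CPUS];
    atomic_bool writer;               /* a writer wants/has the lock */
    atomic_flag wlock;                /* serialises writers */
};

static void model_read_lock(struct percpu_rwlock_model *l, int cpu)
{
    for ( ;; )
    {
        /* Fast path: bump the CPU-local counter, no shared CAS. */
        atomic_fetch_add(&l->slot[cpu].readers, 1);
        if ( !atomic_load(&l->writer) )
            return;                   /* no writer: read lock held */
        /* Slow path: a writer is active, back out and wait. */
        atomic_fetch_sub(&l->slot[cpu].readers, 1);
        while ( atomic_load(&l->writer) )
            sched_yield();
    }
}

static void model_read_unlock(struct percpu_rwlock_model *l, int cpu)
{
    atomic_fetch_sub(&l->slot[cpu].readers, 1);
}

static void model_write_lock(struct percpu_rwlock_model *l)
{
    while ( atomic_flag_test_and_set(&l->wlock) )
        sched_yield();                /* one writer at a time */
    atomic_store(&l->writer, true);   /* divert new readers */
    for ( int cpu = 0; cpu < NR_CPUS; cpu++ )
        while ( atomic_load(&l->slot[cpu].readers) )
            sched_yield();            /* wait for readers to drain */
}

static void model_write_unlock(struct percpu_rwlock_model *l)
{
    atomic_store(&l->writer, false);
    atomic_flag_clear(&l->wlock);
}

int main(void)
{
    static struct percpu_rwlock_model l = { .wlock = ATOMIC_FLAG_INIT };
    model_read_lock(&l, 0);           /* grant-map style reader, CPU 0 */
    model_read_unlock(&l, 0);
    model_write_lock(&l);             /* p2m-update style writer */
    model_write_unlock(&l);
    return 0;
}

The trade-off suits the workload described above: the grant
map/unmap/copy paths take the p2m lock for reading far more often
than anything takes it for writing, so making the (rare) writer scan
per-CPU counters in exchange for contention-free read acquisition is
a net win.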