[10/11] qspinlock: Paravirt support

Message ID 20140615130154.213923590@chello.nl (mailing list archive)
State New, archived

Commit Message

Peter Zijlstra June 15, 2014, 12:47 p.m. UTC
Add minimal paravirt support.

The code aims for minimal impact on the native case.

On the lock side we add one jump label (asm_goto) and 4 paravirt
callee-saved calls that default to NOPs. The only effects are the
extra NOPs and some pointless MOVs to accommodate the calling
convention. No register spills happen because of this (x86_64).

On the unlock side we have one paravirt callee-saved call, which
defaults to the actual unlock sequence: "movb $0, (%rdi)" and a NOP.

The actual paravirt code comes in 3 parts:

 - init_node; this initializes the extra data members required for PV
   state. PV state data is kept 1 cacheline ahead of the regular data.

 - link_and_wait_node/kick_node; these are paired with the regular MCS
   queueing and are placed respectively before/after the paired MCS ops.

 - wait_head/queue_unlock; the interesting part here is finding the
   head node to kick.

Tracking the head is done in two parts: first, pv_wait_head() will
store its cpu number in whichever node is pointed to by the tail part
of the lock word; second, pv_link_and_wait_node() will propagate the
existing head from the old to the new tail node.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/paravirt.h       |   39 +++++++
 arch/x86/include/asm/paravirt_types.h |   15 ++
 arch/x86/include/asm/qspinlock.h      |   25 ++++
 arch/x86/kernel/paravirt-spinlocks.c  |   22 ++++
 arch/x86/kernel/paravirt_patch_32.c   |    7 +
 arch/x86/kernel/paravirt_patch_64.c   |    7 +
 include/asm-generic/qspinlock.h       |   11 ++
 kernel/locking/qspinlock.c            |  179 +++++++++++++++++++++++++++++++++-
 8 files changed, 302 insertions(+), 3 deletions(-)




Comments

Waiman Long June 17, 2014, 12:53 a.m. UTC | #1
I am resending this because my original reply contained some HTML and
was therefore rejected by the mailing lists.


On 06/15/2014 08:47 AM, Peter Zijlstra wrote:
>
>
>
> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
> +
> +/*
> + * Write a comment about how all this works...
> + */
> +
> +#define _Q_LOCKED_SLOW	(2U<<  _Q_LOCKED_OFFSET)
> +
> +struct pv_node {
> +	struct mcs_spinlock	mcs;
> +	struct mcs_spinlock	__offset[3];
> +	int cpu, head;
> +};

I am wondering why you need the separate cpu and head variables. I 
thought one would be enough here. The wait code puts the cpu number in 
head, and the kick_cpu code kicks the one in cpu, which is just the cpu # 
of the tail.

> +
> +#define INVALID_HEAD	-1
> +#define NO_HEAD		nr_cpu_ids
> +

I think it is better to use a constant like -2 for NO_HEAD instead of an 
external variable.
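Something like this, say, as a sketch of that suggestion (a negative value
that can never collide with a cpu number):

#define INVALID_HEAD    -1
#define NO_HEAD         -2      /* nothing to propagate; distinct from INVALID_HEAD */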

> +void __pv_init_node(struct mcs_spinlock *node)
> +{
> +	struct pv_node *pn = (struct pv_node *)node;
> +
> +	BUILD_BUG_ON(sizeof(struct pv_node)>  5*sizeof(struct mcs_spinlock));
> +
> +	pn->cpu = smp_processor_id();
> +	pn->head = INVALID_HEAD;
> +}
> +
> +static inline struct pv_node *pv_decode_tail(u32 tail)
> +{
> +	return (struct pv_node *)decode_tail(tail);
> +}
> +
> +void __pv_link_and_wait_node(u32 old, struct mcs_spinlock *node)
> +{
> +	struct pv_node *ppn, *pn = (struct pv_node *)node;
> +	unsigned int count;
> +
> +	if (!(old&  _Q_TAIL_MASK)) {
> +		pn->head = NO_HEAD;
> +		return;
> +	}
> +
> +	ppn = pv_decode_tail(old);
> +	ACCESS_ONCE(ppn->mcs.next) = node;
> +
> +	while (ppn->head == INVALID_HEAD)
> +		cpu_relax();
> +
> +	pn->head = ppn->head;

A race can happen here, as pn->head can be changed to the head cpu by the 
head waiter at the same time this function is changing it. It is 
safer to use cmpxchg to make sure that there is no accidental 
overwriting of the head CPU number.
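One way to read that suggestion, as a sketch against __pv_link_and_wait_node()
(only the final store changes):

        ppn = pv_decode_tail(old);
        ACCESS_ONCE(ppn->mcs.next) = node;

        while (ppn->head == INVALID_HEAD)
                cpu_relax();

        /*
         * Only install the propagated value if pn->head is still
         * INVALID_HEAD, so a real head cpu that may already have been
         * published here is not overwritten.
         */
        (void)cmpxchg(&pn->head, INVALID_HEAD, ppn->head);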

> +
> +	for (;;) {
> +		count = SPIN_THRESHOLD;
> +
> +		do {
> +			if (smp_load_acquire(&node->locked))
> +				return;
> +
> +			cpu_relax();
> +		} while (--count);
> +
> +		pv_wait(&node->locked, 1);
> +	}
> +}
> +
> +void __pv_kick_node(struct mcs_spinlock *node)
> +{
> +	struct pv_node *pn = (struct pv_node *)node;
> +
> +	pv_kick(pn->cpu);
> +}
> +
> +void __pv_wait_head(struct qspinlock *lock)
> +{
> +	unsigned int count;
> +	struct pv_node *pn;
> +	int val, old, new;
> +
> +	for (;;) {
> +		count = SPIN_THRESHOLD;
> +
> +		do {
> +			val = smp_load_acquire(&lock->val.counter);
> +			if (!(val&  _Q_LOCKED_PENDING_MASK))
> +				return;
> +		} while (--count);
> +
> +		do {
> +			pn = pv_decode_tail(atomic_read(&lock->val));
> +
> +			while (pn->head == INVALID_HEAD)
> +				cpu_relax();
> +
> +			pn->head = smp_processor_id();
> +
> +		} while (pn != pv_decode_tail(atomic_read(&lock->val)));
> +
> +		/*
> +		 * Set _Q_LOCKED_SLOW; bail when the lock is free.
> +		 */
> +		val = atomic_read(&lock->val);
> +		for (;;) {
> +			if (!(val&  _Q_LOCKED_PENDING_MASK))
> +				return;
> +			new = val | _Q_LOCKED_SLOW;
> +			old = atomic_cmpxchg(&lock->val, val, new);
> +			if (old == val)
> +				break;
> +			val = old;
> +		}
> +
> +		/* XXX 16bit would be better */
> +		pv_wait(&lock->val.counter, new);
> +	}
> +}
> +
> +static void ___pv_kick_head(struct qspinlock *lock)
> +{
> +	struct pv_node *pn;
> +
> +	pn = pv_decode_tail(atomic_read(&lock->val));
> +
> +	while (pn->head == INVALID_HEAD)
> +		cpu_relax();
> +
> +	if (WARN_ON_ONCE(pn->head == NO_HEAD))
> +		return;
> +
> +	pv_kick(pn->head);
> +}
> +
> +void __pv_queue_unlock(struct qspinlock *lock)
> +{
> +	int val = atomic_read(&lock->val);
> +
> +	native_queue_unlock(lock);
> +
> +	if (val&  _Q_LOCKED_SLOW)
> +		___pv_kick_head(lock);
> +}
> +

Again a race can happen here between the reading and writing of the lock 
value. I can't think of a good way to avoid it without using cmpxchg.

> +#else
> +
> +static inline void pv_init_node(struct mcs_spinlock *node) { }
> +static inline void pv_link_and_wait_node(u32 old, struct mcs_spinlock *node) { }
> +static inline void pv_kick_node(struct mcs_spinlock *node) { }
> +
> +static inline void pv_wait_head(struct qspinlock *lock) { }
> +
> +#endif
> +
>   /**
>    * queue_spin_lock_slowpath - acquire the queue spinlock
>    * @lock: Pointer to queue spinlock structure
> @@ -247,6 +417,9 @@ void queue_spin_lock_slowpath(struct qsp
>
>   	BUILD_BUG_ON(CONFIG_NR_CPUS>= (1U<<  _Q_TAIL_CPU_BITS));
>
> +	if (pv_enabled())
> +		goto queue;
> +
>   	if (virt_queue_spin_lock(lock))
>   		return;
>
> @@ -323,6 +496,7 @@ void queue_spin_lock_slowpath(struct qsp
>   	node += idx;
>   	node->locked = 0;
>   	node->next = NULL;
> +	pv_init_node(node);
>
>   	/*
>   	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
> @@ -343,6 +517,7 @@ void queue_spin_lock_slowpath(struct qsp
>   	/*
>   	 * if there was a previous node; link it and wait.
>   	 */
> +	pv_link_and_wait_node(old, node);
>   	if (old&  _Q_TAIL_MASK) {
>   		prev = decode_tail(old);
>   		ACCESS_ONCE(prev->next) = node;
> @@ -358,6 +533,7 @@ void queue_spin_lock_slowpath(struct qsp
>   	 *
>   	 * *,x,y ->  *,0,0
>   	 */
> +	pv_wait_head(lock);
>   	while ((val = smp_load_acquire(&lock->val.counter))&
>   			_Q_LOCKED_PENDING_MASK)
>   		cpu_relax();
> @@ -391,6 +567,7 @@ void queue_spin_lock_slowpath(struct qsp
>   		cpu_relax();
>
>   	arch_mcs_spin_unlock_contended(&next->locked);
> +	pv_kick_node(next);
>   

pv_kick_node is an expensive operation and it can significantly slow 
down the locking operation if we have to do it for every subsequent task 
in the queue.

-Longman

Paolo Bonzini June 18, 2014, 12:03 p.m. UTC | #2
On 17/06/2014 00:08, Waiman Long wrote:
>> +void __pv_queue_unlock(struct qspinlock *lock)
>> +{
>> +	int val = atomic_read(&lock->val);
>> +
>> +	native_queue_unlock(lock);
>> +
>> +	if (val & _Q_LOCKED_SLOW)
>> +		___pv_kick_head(lock);
>> +}
>> +
>
> Again a race can happen here between the reading and writing of the lock
> value. I can't think of a good way to do that without using cmpxchg.

Could you just use xchg on the locked byte?

Paolo
Paolo Bonzini June 18, 2014, 12:04 p.m. UTC | #3
On 15/06/2014 14:47, Peter Zijlstra wrote:
>
>
>  #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
>
> -#define	queue_spin_unlock queue_spin_unlock
>  /**
>   * queue_spin_unlock - release a queue spinlock
>   * @lock : Pointer to queue spinlock structure
>   *
>   * An effective smp_store_release() on the least-significant byte.
>   */
> -static inline void queue_spin_unlock(struct qspinlock *lock)
> +static inline void native_queue_unlock(struct qspinlock *lock)
>  {
>  	barrier();
>  	ACCESS_ONCE(*(u8 *)lock) = 0;
>  }
>
> +#else
> +
> +static inline void native_queue_unlock(struct qspinlock *lock)
> +{
> +	atomic_dec(&lock->val);
> +}
> +
>  #endif /* !CONFIG_X86_OOSTORE && !CONFIG_X86_PPRO_FENCE */


Should this be (part of) an earlier patch?  Also, does it go wrong if 
(CONFIG_X86_OOSTORE || CONFIG_X86_PPRO_FENCE) && paravirt patches the 
unlock to a single movb?  Of course the paravirt spinlocks could simply 
depend on !CONFIG_X86_OOSTORE && !CONFIG_X86_PPRO_FENCE.
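A sketch of the guard variant (hypothetical; the Kconfig dependency would be
the simpler fix): only emit the movb patch site when the byte-store unlock is
actually valid, mirroring the condition used for native_queue_unlock(), and
similarly for the PATCH_SITE() entries and the 32-bit variant.

#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK) && \
    !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
#endif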

> +
> +#define INVALID_HEAD	-1
> +#define NO_HEAD		nr_cpu_ids
> +

-2, like Waiman said.

Paolo
Waiman Long June 18, 2014, 3:26 p.m. UTC | #4
On 06/18/2014 08:03 AM, Paolo Bonzini wrote:
> On 17/06/2014 00:08, Waiman Long wrote:
>>> +void __pv_queue_unlock(struct qspinlock *lock)
>>> +{
>>> +    int val = atomic_read(&lock->val);
>>> +
>>> +    native_queue_unlock(lock);
>>> +
>>> +    if (val & _Q_LOCKED_SLOW)
>>> +        ___pv_kick_head(lock);
>>> +}
>>> +
>>
>> Again a race can happen here between the reading and writing of the lock
>> value. I can't think of a good way to do that without using cmpxchg.
>
> Could you just use xchg on the locked byte?
>
> Paolo

The slowpath flag is just an indication that the queue head cpu might 
have been suspended; it may not actually be, e.g. due to a spurious 
wakeup. Releasing the lock unconditionally may cause the queue to be 
changed while it is being inspected. It really depends on how the cpu 
kicking is handled. My patch delays the unlocking until all the 
inspections have been done, to make sure that we don't waste time doing 
a cpu kick that is not needed.
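A sketch of that ordering (not the actual patch, just to illustrate the
idea): the inspection and the kick happen before the lock byte is cleared,
so the waiter being inspected cannot acquire the lock and go away in the
middle of the inspection.

void __pv_queue_unlock(struct qspinlock *lock)
{
        int val = atomic_read(&lock->val);

        if (val & _Q_LOCKED_SLOW)
                ___pv_kick_head(lock);  /* lock still held: the head cannot proceed */

        native_queue_unlock(lock);
}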

-Longman
Konrad Rzeszutek Wilk June 20, 2014, 1:46 p.m. UTC | #5
On Sun, Jun 15, 2014 at 02:47:07PM +0200, Peter Zijlstra wrote:
> Add minimal paravirt support.
> 
> The code aims for minimal impact on the native case.

Woot!
> 
> On the lock side we add one jump label (asm_goto) and 4 paravirt
> callee saved calls that default to NOPs. The only effects are the
> extra NOPs and some pointless MOVs to accomodate the calling
> convention. No register spills happen because of this (x86_64).
> 
> On the unlock side we have one paravirt callee saved call, which
> defaults to the actual unlock sequence: "movb $0, (%rdi)" and a NOP.
> 
> The actual paravirt code comes in 3 parts;
> 
>  - init_node; this initializes the extra data members required for PV
>    state. PV state data is kept 1 cacheline ahead of the regular data.
> 
>  - link_and_wait_node/kick_node; these are paired with the regular MCS
>    queueing and are placed resp. before/after the paired MCS ops.
> 
>  - wait_head/queue_unlock; the interesting part here is finding the
>    head node to kick.
> 
> Tracking the head is done in two parts, firstly the pv_wait_head will
> store its cpu number in whichever node is pointed to by the tail part
> of the lock word. Secondly, pv_link_and_wait_node() will propagate the
> existing head from the old to the new tail node.

I dug into the code and I have some comments about it, but before
I post them I was wondering if you have any plans to run any performance
tests against the PV ticketlock with normal and over-committed scenarios?

Looking at this with pen and paper I see that, compared to the
PV ticketlock, the CPUs that are contending on the queue (so they
go to pv_wait_head_and_link, then progress to pv_wait_head)
go to sleep twice and get woken up twice. In the PV ticketlock the
contending CPUs would only go to sleep once and be woken up once,
when it was their turn.

That of course is the worst case scenario - where the CPU
that has the lock is taking forever to do its job and the
host is quite overcommitted.

Thanks!
Peter Zijlstra July 7, 2014, 3:20 p.m. UTC | #6
On Mon, Jun 16, 2014 at 06:08:21PM -0400, Waiman Long wrote:
> On 06/15/2014 08:47 AM, Peter Zijlstra wrote:
> >+struct pv_node {
> >+	struct mcs_spinlock	mcs;
> >+	struct mcs_spinlock	__offset[3];
> >+	int cpu, head;
> >+};
> 
> I am wondering why you need the separate cpu and head variables. I thought
> one will be enough here. The wait code put the cpu number in head, the the
> kick_cpu code kick the one in cpu which is just the cpu # of the tail.

The @cpu is the current cpu, the @head is the encoded pointer to the
queue head; they aren't necessarily the same.

The @head thing is not unlike your next pointer, just backwards.
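
For reference, the two fields as the patch uses them:

struct pv_node {
        struct mcs_spinlock     mcs;
        struct mcs_spinlock     __offset[3];    /* pad: PV state is kept 1 cacheline
                                                   ahead of the regular MCS data */
        int                     cpu;            /* this waiter's own cpu; what
                                                   __pv_kick_node() kicks */
        int                     head;           /* cpu of the queue head, propagated from
                                                   tail to tail so ___pv_kick_head() can
                                                   find whom to kick */
};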

> >+#define INVALID_HEAD	-1
> >+#define NO_HEAD		nr_cpu_ids
> >+
> 
> I think it is better to use a constant like -2 for NO_HEAD instead of an
> external variable.

Sure..

> >+void __pv_init_node(struct mcs_spinlock *node)
> >+{
> >+	struct pv_node *pn = (struct pv_node *)node;
> >+
> >+	BUILD_BUG_ON(sizeof(struct pv_node)>  5*sizeof(struct mcs_spinlock));
> >+
> >+	pn->cpu = smp_processor_id();
> >+	pn->head = INVALID_HEAD;
> >+}
> >+
> >+static inline struct pv_node *pv_decode_tail(u32 tail)
> >+{
> >+	return (struct pv_node *)decode_tail(tail);
> >+}
> >+
> >+void __pv_link_and_wait_node(u32 old, struct mcs_spinlock *node)
> >+{
> >+	struct pv_node *ppn, *pn = (struct pv_node *)node;
> >+	unsigned int count;
> >+
> >+	if (!(old&  _Q_TAIL_MASK)) {
> >+		pn->head = NO_HEAD;
> >+		return;
> >+	}
> >+
> >+	ppn = pv_decode_tail(old);
> >+	ACCESS_ONCE(ppn->mcs.next) = node;
> >+
> >+	while (ppn->head == INVALID_HEAD)
> >+		cpu_relax();
> >+
> >+	pn->head = ppn->head;
> 
> A race can happen here as pn->head can be changed to the head cpu by the
> head waiter while being changed by this function at the same time. It is
> safer to use cmpxchg to make sure that there is no accidental overwriting of
> the head CPU number.

Ok, so I'm not entirely sure I see the race, although it's entirely
possible; this is far too fragile. But I couldn't get rid of the race
with cmpxchg/xchg either.

So the idea is 'simple'; have link_and_wait propagate the 'head'
'pointer' from the old to the new tail, and have wait_head set the
'head' pointer on the current tail every time the top waiter goes to
sleep.

There's the obvious race where both happen at the same time and you're
not sure which 'head' 'pointer' won. To solve that what I did was:

init:
  INVALID_HEAD

link_and_wait:
  INVALID_HEAD -> pprev->head , NO_HEAD

wait_head:
  !INVALID_HEAD -> new head

This way wait_head must wait for link_and_wait to finish before writing
the new head value. Furthermore, if we race such that we obtained the
'old' tail and link_and_wait propagated the 'old' head to the 'new'
tail, wait_head will detect this by verifying the tail pointer after
writing the new head.

We don't need atomics here afaict, but we have wait loops, which of
course suck arse for virt :/

I'm not too fond of this scheme, but I thought I'd try and get rid of
that O(n) loop you had for finding the head; we simply cannot assume a
'small' number of vcpus.

> >+void __pv_queue_unlock(struct qspinlock *lock)
> >+{
> >+	int val = atomic_read(&lock->val);
> >+
> >+	native_queue_unlock(lock);
> >+
> >+	if (val&  _Q_LOCKED_SLOW)
> >+		___pv_kick_head(lock);
> >+}
> >+
> 
> Again a race can happen here between the reading and writing of the lock
> value. I can't think of a good way to do that without using cmpxchg.

Indeed so, xchg it is I suppose :/

> >@@ -358,6 +533,7 @@ void queue_spin_lock_slowpath(struct qsp
> >  	 *
> >  	 * *,x,y ->  *,0,0
> >  	 */
> >+	pv_wait_head(lock);
> >  	while ((val = smp_load_acquire(&lock->val.counter))&
> >  			_Q_LOCKED_PENDING_MASK)
> >  		cpu_relax();
> >@@ -391,6 +567,7 @@ void queue_spin_lock_slowpath(struct qsp
> >  		cpu_relax();
> >
> >  	arch_mcs_spin_unlock_contended(&next->locked);
> >+	pv_kick_node(next);
> 
> pv_kick_node is an expensive operation and it can significantly slow down
> the locking operation if we have to do it for every subsequent task in the
> queue.

You might by now have noticed that I don't particularly care too much
about (para)virt performance :-) Also, I'm very much trying to get
'simple' things working before making them more complex.
Peter Zijlstra July 7, 2014, 3:20 p.m. UTC | #7
On Wed, Jun 18, 2014 at 02:03:12PM +0200, Paolo Bonzini wrote:
> On 17/06/2014 00:08, Waiman Long wrote:
> >>+void __pv_queue_unlock(struct qspinlock *lock)
> >>+{
> >>+	int val = atomic_read(&lock->val);
> >>+
> >>+	native_queue_unlock(lock);
> >>+
> >>+	if (val & _Q_LOCKED_SLOW)
> >>+		___pv_kick_head(lock);
> >>+}
> >>+
> >
> >Again a race can happen here between the reading and writing of the lock
> >value. I can't think of a good way to do that without using cmpxchg.
> 
> Could you just use xchg on the locked byte?

I'll have to, indeed. This is racy.
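
Something like this, as a minimal sketch (assuming _Q_LOCKED_SLOW stays
within the locked byte, which it does with the current encoding):

void __pv_queue_unlock(struct qspinlock *lock)
{
        /*
         * xchg releases the lock and returns the old byte in a single
         * atomic op, so the _Q_LOCKED_SLOW test can no longer race
         * with the store.
         */
        if (xchg((u8 *)lock, 0) & _Q_LOCKED_SLOW)
                ___pv_kick_head(lock);
}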
Peter Zijlstra July 7, 2014, 3:27 p.m. UTC | #8
On Fri, Jun 20, 2014 at 09:46:08AM -0400, Konrad Rzeszutek Wilk wrote:
> I dug in the code and I have some comments about it, but before
> I post them I was wondering if you have any plans to run any performance
> tests against the PV ticketlock with normal and over-committed scenarios?

I can barely boot a guest... I'm not sure I can make them do anything
much at all yet. All this virt crap is totally painful.
Konrad Rzeszutek Wilk July 15, 2014, 2:23 p.m. UTC | #9
On Mon, Jul 07, 2014 at 05:27:34PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 20, 2014 at 09:46:08AM -0400, Konrad Rzeszutek Wilk wrote:
> > I dug in the code and I have some comments about it, but before
> > I post them I was wondering if you have any plans to run any performance
> > tests against the PV ticketlock with normal and over-committed scenarios?
> 
> I can barely boot a guest.. I'm not sure I can make them do anything
> much at all yet. All this virt crap is totally painful.

HA!

The reason I asked about that is from a pen-and-paper view it looks
suboptimal in the worst case scenario compared to PV ticketlock.

The 'worst case scenario' is when we over-commit (more CPUs than there
are physical CPUs) or have to delay guests (the sum of all virtual
CPUs > physical CPUs and all of the guests are compiling kernels).

In those cases the PV ticketlock goes to sleep and gets woken up
once the ticket holder has finished. In the PV qspinlock we do
wake up the first in queue, but we also wake the next one in queue
so it can progress further. And so on.

Perhaps a better mechanism is to just ditch the queue part, utilize
the byte part, and under KVM and Xen just do bytelocking (since we
have 8 bits). For the PV halt/waking we can stash in the 'struct mcs'
the current lock that each CPU is waiting for. And the unlocker
can iterate over all of those and wake them all up. Perhaps make
the iteration random. Anyhow, that is how the old PV bytelock under
Xen worked (before 3.11) and it worked pretty well (it didn't
do it randomly though - it always started with 'for_each_online_cpu').
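
A very rough sketch of that bytelock fallback, just to make the shape
concrete (using a plain per-cpu stash instead of 'struct mcs'; hypothetical
helpers, not the pre-3.11 Xen code):

static DEFINE_PER_CPU(struct qspinlock *, pv_waiting_on);

static void pv_byte_lock(struct qspinlock *lock)
{
        int val;

        while (xchg((u8 *)lock, _Q_LOCKED_VAL)) {
                this_cpu_write(pv_waiting_on, lock);
                smp_mb();       /* stash visible before re-checking the lock */
                val = atomic_read(&lock->val);
                if (val & _Q_LOCKED_MASK)
                        pv_wait(&lock->val.counter, val);
                this_cpu_write(pv_waiting_on, NULL);
        }
}

static void pv_byte_unlock(struct qspinlock *lock)
{
        int cpu;

        (void)xchg((u8 *)lock, 0);      /* release; full barrier before the scan */

        for_each_online_cpu(cpu) {
                if (ACCESS_ONCE(per_cpu(pv_waiting_on, cpu)) == lock)
                        pv_kick(cpu);
        }
}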

Squashing the ticketlock concept into qspinlock for PV looks scary.

And as I said - this is all pen-and-paper - so it might be that this
'wake-up-go-sleep-on-the-queue' kick is actually not that bad?

Lastly - thank you for taking a stab at this.

Patch

Index: linux-2.6/arch/x86/include/asm/paravirt.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/paravirt.h
+++ linux-2.6/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,44 @@  static inline void __set_fixmap(unsigned
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+
+static __always_inline void pv_init_node(struct mcs_spinlock *node)
+{
+	PVOP_VCALLEE1(pv_lock_ops.init_node, node);
+}
+
+static __always_inline void pv_link_and_wait_node(u32 old, struct mcs_spinlock *node)
+{
+	PVOP_VCALLEE2(pv_lock_ops.link_and_wait_node, old, node);
+}
+
+static __always_inline void pv_kick_node(struct mcs_spinlock *node)
+{
+	PVOP_VCALLEE1(pv_lock_ops.kick_node, node);
+}
+
+static __always_inline void pv_wait_head(struct qspinlock *lock)
+{
+	PVOP_VCALLEE1(pv_lock_ops.wait_head, lock);
+}
+
+static __always_inline void pv_queue_unlock(struct qspinlock *lock)
+{
+	PVOP_VCALLEE1(pv_lock_ops.queue_unlock, lock);
+}
+
+static __always_inline void pv_wait(int *ptr, int val)
+{
+	PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+#else
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
 							__ticket_t ticket)
 {
@@ -723,6 +761,7 @@  static __always_inline void __ticket_unl
 {
 	PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
+#endif
 
 #endif
 
Index: linux-2.6/arch/x86/include/asm/paravirt_types.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/paravirt_types.h
+++ linux-2.6/arch/x86/include/asm/paravirt_types.h
@@ -326,6 +326,9 @@  struct pv_mmu_ops {
 			   phys_addr_t phys, pgprot_t flags);
 };
 
+struct mcs_spinlock;
+struct qspinlock;
+
 struct arch_spinlock;
 #ifdef CONFIG_SMP
 #include <asm/spinlock_types.h>
@@ -334,8 +337,20 @@  typedef u16 __ticket_t;
 #endif
 
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUE_SPINLOCK
+	struct paravirt_callee_save init_node;
+	struct paravirt_callee_save link_and_wait_node;
+	struct paravirt_callee_save kick_node;
+
+	struct paravirt_callee_save wait_head;
+	struct paravirt_callee_save queue_unlock;
+
+	void (*wait)(int *ptr, int val);
+	void (*kick)(int cpu);
+#else
 	struct paravirt_callee_save lock_spinning;
 	void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif
 };
 
 /* This contains all the paravirt structures: we get a convenient
Index: linux-2.6/arch/x86/include/asm/qspinlock.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/qspinlock.h
+++ linux-2.6/arch/x86/include/asm/qspinlock.h
@@ -3,24 +3,45 @@ 
 
 #include <asm/cpufeature.h>
 #include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
 
 #if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
 
-#define	queue_spin_unlock queue_spin_unlock
 /**
  * queue_spin_unlock - release a queue spinlock
  * @lock : Pointer to queue spinlock structure
  *
  * An effective smp_store_release() on the least-significant byte.
  */
-static inline void queue_spin_unlock(struct qspinlock *lock)
+static inline void native_queue_unlock(struct qspinlock *lock)
 {
 	barrier();
 	ACCESS_ONCE(*(u8 *)lock) = 0;
 }
 
+#else
+
+static inline void native_queue_unlock(struct qspinlock *lock)
+{
+	atomic_dec(&lock->val);
+}
+
 #endif /* !CONFIG_X86_OOSTORE && !CONFIG_X86_PPRO_FENCE */
 
+#define	queue_spin_unlock queue_spin_unlock
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+static inline void queue_spin_unlock(struct qspinlock *lock)
+{
+	pv_queue_unlock(lock);
+}
+#else
+static inline void queue_spin_unlock(struct qspinlock *lock)
+{
+	native_queue_unlock(lock);
+}
+#endif
+
 #define virt_queue_spin_lock virt_queue_spin_lock
 
 static inline bool virt_queue_spin_lock(struct qspinlock *lock)
Index: linux-2.6/arch/x86/kernel/paravirt-spinlocks.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/paravirt-spinlocks.c
+++ linux-2.6/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@ 
 
 #include <asm/paravirt.h>
 
+#ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUE_SPINLOCK
+void __native_queue_unlock(struct qspinlock *lock)
+{
+	native_queue_unlock(lock);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queue_unlock);
+#endif
+#endif
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUE_SPINLOCK
+	.init_node = __PV_IS_CALLEE_SAVE(paravirt_nop),
+	.link_and_wait_node = __PV_IS_CALLEE_SAVE(paravirt_nop),
+	.kick_node = __PV_IS_CALLEE_SAVE(paravirt_nop),
+
+	.wait_head = __PV_IS_CALLEE_SAVE(paravirt_nop),
+	.queue_unlock = PV_CALLEE_SAVE(__native_queue_unlock),
+
+	.wait = paravirt_nop,
+	.kick = paravirt_nop,
+#else
 	.lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
 	.unlock_kick = paravirt_nop,
 #endif
+#endif
 };
 EXPORT_SYMBOL(pv_lock_ops);
 
Index: linux-2.6/arch/x86/kernel/paravirt_patch_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_32.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@  DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %c
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%eax)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	/* arg in %eax, return in %eax */
@@ -47,6 +51,9 @@  unsigned native_patch(u8 type, u16 clobb
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_cpu_ops, read_tsc);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+		PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif
 
 	patch_site:
 		ret = paravirt_patch_insns(ibuf, len, start, end);
Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@  DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs")
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@  unsigned native_patch(u8 type, u16 clobb
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+		PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif
 
 	patch_site:
 		ret = paravirt_patch_insns(ibuf, len, start, end);
Index: linux-2.6/include/asm-generic/qspinlock.h
===================================================================
--- linux-2.6.orig/include/asm-generic/qspinlock.h
+++ linux-2.6/include/asm-generic/qspinlock.h
@@ -105,6 +105,17 @@  static __always_inline bool virt_queue_s
 }
 #endif
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+struct mcs_spinlock;
+
+extern void __pv_init_node(struct mcs_spinlock *node);
+extern void __pv_link_and_wait_node(u32 old, struct mcs_spinlock *node);
+extern void __pv_kick_node(struct mcs_spinlock *node);
+
+extern void __pv_wait_head(struct qspinlock *lock);
+extern void __pv_queue_unlock(struct qspinlock *lock);
+#endif
+
 /*
  * Initializier
  */
Index: linux-2.6/kernel/locking/qspinlock.c
===================================================================
--- linux-2.6.orig/kernel/locking/qspinlock.c
+++ linux-2.6/kernel/locking/qspinlock.c
@@ -56,13 +56,33 @@ 
 
 #include "mcs_spinlock.h"
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+#define MAX_NODES	8
+
+static inline bool pv_enabled(void)
+{
+	return static_key_false(&paravirt_spinlocks_enabled);
+}
+#else /* !PARAVIRT_SPINLOCKS */
+
+#define MAX_NODES	4
+
+static inline bool pv_enabled(void)
+{
+	return false;
+}
+#endif /* PARAVIRT_SPINLOCKS */
+
 /*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
  *
  * Exactly fits one cacheline.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[4]);
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -218,6 +238,156 @@  static __always_inline void set_locked(s
 	ACCESS_ONCE(l->locked) = _Q_LOCKED_VAL;
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/*
+ * Write a comment about how all this works...
+ */
+
+#define _Q_LOCKED_SLOW	(2U << _Q_LOCKED_OFFSET)
+
+struct pv_node {
+	struct mcs_spinlock	mcs;
+	struct mcs_spinlock	__offset[3];
+	int cpu, head;
+};
+
+#define INVALID_HEAD	-1
+#define NO_HEAD		nr_cpu_ids
+
+void __pv_init_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+	pn->cpu = smp_processor_id();
+	pn->head = INVALID_HEAD;
+}
+
+static inline struct pv_node *pv_decode_tail(u32 tail)
+{
+	return (struct pv_node *)decode_tail(tail);
+}
+
+void __pv_link_and_wait_node(u32 old, struct mcs_spinlock *node)
+{
+	struct pv_node *ppn, *pn = (struct pv_node *)node;
+	unsigned int count;
+
+	if (!(old & _Q_TAIL_MASK)) {
+		pn->head = NO_HEAD;
+		return;
+	}
+
+	ppn = pv_decode_tail(old);
+	ACCESS_ONCE(ppn->mcs.next) = node;
+
+	while (ppn->head == INVALID_HEAD)
+		cpu_relax();
+
+	pn->head = ppn->head;
+
+	for (;;) {
+		count = SPIN_THRESHOLD;
+
+		do {
+			if (smp_load_acquire(&node->locked))
+				return;
+
+			cpu_relax();
+		} while (--count);
+
+		pv_wait(&node->locked, 1);
+	}
+}
+
+void __pv_kick_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	pv_kick(pn->cpu);
+}
+
+void __pv_wait_head(struct qspinlock *lock)
+{
+	unsigned int count;
+	struct pv_node *pn;
+	int val, old, new;
+
+	for (;;) {
+		count = SPIN_THRESHOLD;
+
+		do {
+			val = smp_load_acquire(&lock->val.counter);
+			if (!(val & _Q_LOCKED_PENDING_MASK))
+				return;
+		} while (--count);
+
+		do {
+			pn = pv_decode_tail(atomic_read(&lock->val));
+
+			while (pn->head == INVALID_HEAD)
+				cpu_relax();
+
+			pn->head = smp_processor_id();
+
+		} while (pn != pv_decode_tail(atomic_read(&lock->val)));
+
+		/*
+		 * Set _Q_LOCKED_SLOW; bail when the lock is free.
+		 */
+		val = atomic_read(&lock->val);
+		for (;;) {
+			if (!(val & _Q_LOCKED_PENDING_MASK))
+				return;
+			new = val | _Q_LOCKED_SLOW;
+			old = atomic_cmpxchg(&lock->val, val, new);
+			if (old == val)
+				break;
+			val = old;
+		}
+
+		/* XXX 16bit would be better */
+		pv_wait(&lock->val.counter, new);
+	}
+}
+
+static void ___pv_kick_head(struct qspinlock *lock)
+{
+	struct pv_node *pn;
+
+	pn = pv_decode_tail(atomic_read(&lock->val));
+
+	while (pn->head == INVALID_HEAD)
+		cpu_relax();
+
+	if (WARN_ON_ONCE(pn->head == NO_HEAD))
+		return;
+
+	pv_kick(pn->head);
+}
+
+void __pv_queue_unlock(struct qspinlock *lock)
+{
+	int val = atomic_read(&lock->val);
+
+	native_queue_unlock(lock);
+
+	if (val & _Q_LOCKED_SLOW)
+		___pv_kick_head(lock);
+}
+
+#else
+
+static inline void pv_init_node(struct mcs_spinlock *node) { }
+static inline void pv_link_and_wait_node(u32 old, struct mcs_spinlock *node) { }
+static inline void pv_kick_node(struct mcs_spinlock *node) { }
+
+static inline void pv_wait_head(struct qspinlock *lock) { }
+
+#endif
+
 /**
  * queue_spin_lock_slowpath - acquire the queue spinlock
  * @lock: Pointer to queue spinlock structure
@@ -247,6 +417,9 @@  void queue_spin_lock_slowpath(struct qsp
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
+	if (pv_enabled())
+		goto queue;
+
 	if (virt_queue_spin_lock(lock))
 		return;
 
@@ -323,6 +496,7 @@  void queue_spin_lock_slowpath(struct qsp
 	node += idx;
 	node->locked = 0;
 	node->next = NULL;
+	pv_init_node(node);
 
 	/*
 	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
@@ -343,6 +517,7 @@  void queue_spin_lock_slowpath(struct qsp
 	/*
 	 * if there was a previous node; link it and wait.
 	 */
+	pv_link_and_wait_node(old, node);
 	if (old & _Q_TAIL_MASK) {
 		prev = decode_tail(old);
 		ACCESS_ONCE(prev->next) = node;
@@ -358,6 +533,7 @@  void queue_spin_lock_slowpath(struct qsp
 	 *
 	 * *,x,y -> *,0,0
 	 */
+	pv_wait_head(lock);
 	while ((val = smp_load_acquire(&lock->val.counter)) &
 			_Q_LOCKED_PENDING_MASK)
 		cpu_relax();
@@ -391,6 +567,7 @@  void queue_spin_lock_slowpath(struct qsp
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(next);
 
 release:
 	/*