diff mbox

[v2,3/4] powernv: cpuidle: Redesign idle states management

Message ID 1416914279-30384-4-git-send-email-shreyas@linux.vnet.ibm.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Shreyas B. Prabhu Nov. 25, 2014, 11:17 a.m. UTC
Deep idle states like sleep and winkle are per core idle states. A core
enters these states only when all the threads enter either the
particular idle state or a deeper one. There are tasks like fastsleep
hardware bug workaround and hypervisor core state save which have to be
done only by the last thread of the core entering deep idle state and
similarly tasks like timebase resync, hypervisor core register restore
that have to be done only by the first thread waking up from these
state.

The current idle state management does not have a way to distinguish the
first/last thread of the core waking/entering idle states. Tasks like
timebase resync are done for all the threads. This is not only is
suboptimal, but can cause functionality issues when subcores and kvm is
involved.

This patch adds the necessary infrastructure to track idle states of
threads in a per-core structure. It uses this info to perform tasks like
fastsleep workaround and timebase resync only once per core.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Originally-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: linux-pm@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/cpuidle.h             |  14 ++
 arch/powerpc/include/asm/opal.h                |   2 +
 arch/powerpc/include/asm/paca.h                |   4 +
 arch/powerpc/kernel/asm-offsets.c              |   4 +
 arch/powerpc/kernel/exceptions-64s.S           |  20 ++-
 arch/powerpc/kernel/idle_power7.S              | 186 +++++++++++++++++++------
 arch/powerpc/platforms/powernv/opal-wrappers.S |  37 +++++
 arch/powerpc/platforms/powernv/setup.c         |  42 +++++-
 arch/powerpc/platforms/powernv/smp.c           |   3 +-
 drivers/cpuidle/cpuidle-powernv.c              |   3 +-
 10 files changed, 258 insertions(+), 57 deletions(-)
 create mode 100644 arch/powerpc/include/asm/cpuidle.h

Comments

Benjamin Herrenschmidt Nov. 27, 2014, 12:37 a.m. UTC | #1
>  
> @@ -37,8 +38,7 @@
>  
>  /*
>   * Pass requested state in r3:
> - * 	0 - nap
> - * 	1 - sleep
> + *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE
>   *
>   * To check IRQ_HAPPENED in r4
>   * 	0 - don't check
> @@ -123,12 +123,62 @@ power7_enter_nap_mode:
>  	li	r4,KVM_HWTHREAD_IN_NAP
>  	stb	r4,HSTATE_HWTHREAD_STATE(r13)
>  #endif
> -	cmpwi	cr0,r3,1
> -	beq	2f
> +	stb	r3,PACA_THREAD_IDLE_STATE(r13)
> +	cmpwi	cr1,r3,PNV_THREAD_SLEEP
> +	bge	cr1,2f
>  	IDLE_STATE_ENTER_SEQ(PPC_NAP)
>  	/* No return */
> -2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
> -	/* No return */
> +2:
> +	/* Sleep or winkle */
> +	li	r7,1
> +	mfspr	r8,SPRN_PIR
> +	/*
> +	 * The last 3 bits of PIR represents the thread id of a cpu
> +	 * in power8. This will need adjusting for power7.
> +	 */
> +	andi.	r8,r8,0x07			/* Get thread id into r8 */
> +	rotld	r7,r7,r8
> +
> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)

I assume we have already saved all non-volatile registers ? Because you
are clobbering one here and more below.

> +lwarx_loop1:
> +	lwarx	r15,0,r14
> +	andc	r15,r15,r7			/* Clear thread bit */
> +
> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	beq	last_thread
> +
> +	/* Not the last thread to goto sleep */
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop1
> +	b	common_enter
> +
> +last_thread:
> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
> +	lbz	r3,0(r3)
> +	cmpwi	r3,1
> +	bne	common_enter

This looks wrong. If the workaround is 0, we don't do the stwcx. at
all... Did you try with pnv_need_fastsleep_workaround set to 0 ? It
should work most of the time as long as you don't hit the fairly
rare race window :)

Also it would be nice to make the above a dynamically patches feature
section, though that means pnv_need_fastsleep_workaround needs to turn
into a CPU feature bit and that needs to be done *very* early on.

Another option is to patch out manually from the pnv code the pair:

	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
 	beq	last_thread

To turn them into nops by hand rather than using the feature system.

> +	/*
> +	 * Last thread of the core entering sleep. Last thread needs to execute
> +	 * the hardware bug workaround code. Before that, set the lock bit to
> +	 * avoid the race of other threads waking up and undoing workaround
> +	 * before workaround is applied.
> +	 */
> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop1
> +
> +	/* Fast sleep workaround */
> +	li	r3,1
> +	li	r4,1
> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
> +	bl	opal_call_realmode
> +
> +	/* Clear Lock bit */

It's a lock, I would add a lwsync here to be safe, and I would add an
isync before the bne- above. Just to ensure that whatever is done
inside that locked section remains in there.

> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	stw	r15,0(r14)
> +
> +common_enter: /* common code for all the threads entering sleep */
> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>  
>  _GLOBAL(power7_idle)
>  	/* Now check if user or arch enabled NAP mode */
> @@ -141,49 +191,16 @@ _GLOBAL(power7_idle)
>  
>  _GLOBAL(power7_nap)
>  	mr	r4,r3
> -	li	r3,0
> +	li	r3,PNV_THREAD_NAP
>  	b	power7_powersave_common
>  	/* No return */
>  
>  _GLOBAL(power7_sleep)
> -	li	r3,1
> +	li	r3,PNV_THREAD_SLEEP
>  	li	r4,1
>  	b	power7_powersave_common
>  	/* No return */
>  
> -/*
> - * Make opal call in realmode. This is a generic function to be called
> - * from realmode from reset vector. It handles endianess.
> - *
> - * r13 - paca pointer
> - * r1  - stack pointer
> - * r3  - opal token
> - */
> -opal_call_realmode:
> -	mflr	r12
> -	std	r12,_LINK(r1)
> -	ld	r2,PACATOC(r13)
> -	/* Set opal return address */
> -	LOAD_REG_ADDR(r0,return_from_opal_call)
> -	mtlr	r0
> -	/* Handle endian-ness */
> -	li	r0,MSR_LE
> -	mfmsr	r12
> -	andc	r12,r12,r0
> -	mtspr	SPRN_HSRR1,r12
> -	mr	r0,r3			/* Move opal token to r0 */
> -	LOAD_REG_ADDR(r11,opal)
> -	ld	r12,8(r11)
> -	ld	r2,0(r11)
> -	mtspr	SPRN_HSRR0,r12
> -	hrfid
> -
> -return_from_opal_call:
> -	FIXUP_ENDIAN
> -	ld	r0,_LINK(r1)
> -	mtlr	r0
> -	blr
> -
>  #define CHECK_HMI_INTERRUPT						\
>  	mfspr	r0,SPRN_SRR1;						\
>  BEGIN_FTR_SECTION_NESTED(66);						\
> @@ -196,10 +213,8 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>  	/* Invoke opal call to handle hmi */				\
>  	ld	r2,PACATOC(r13);					\
>  	ld	r1,PACAR1(r13);						\
> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
> -	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
> +	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>  	bl	opal_call_realmode;					\
> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
>  20:	nop;
>  
> 
> @@ -210,12 +225,91 @@ _GLOBAL(power7_wakeup_tb_loss)
>  BEGIN_FTR_SECTION
>  	CHECK_HMI_INTERRUPT
>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> +
> +	li	r7,1
> +	mfspr	r8,SPRN_PIR
> +	/*
> +	 * The last 3 bits of PIR represents the thread id of a cpu
> +	 * in power8. This will need adjusting for power7.
> +	 */
> +	andi.	r8,r8,0x07		/* Get thread id into r8 */

I'd be more comfortable if we patched that instruction at boot with the
right mask.

> +	rotld	r7,r7,r8
> +	/* r7 now has 'thread_id'th bit set */
> +
> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> +lwarx_loop2:
> +	lwarx	r15,0,r14
> +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
> +	/*
> +	 * Lock bit is set in one of the 2 cases-
> +	 * a. In the sleep/winkle enter path, the last thread is executing
> +	 * fastsleep workaround code.
> +	 * b. In the wake up path, another thread is executing fastsleep
> +	 * workaround undo code or resyncing timebase or restoring context
> +	 * In either case loop until the lock bit is cleared.
> +	 */
> +	bne	lwarx_loop2

We should do some smt priority games here otherwise the spinning threads
are going to slow down the one with the lock. Basically, if we see the
lock held, go out of line, smt_low, spin on a normal load, and when
smt_medium and go back to lwarx

> +	cmpwi	cr2,r15,0
> +	or	r15,r15,r7		/* Set thread bit */
> +
> +	beq	cr2,first_thread
> +
> +	/* Not first thread in core to wake up */
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop2
> +	b	common_exit
> +
> +first_thread:
> +	/* First thread in core to wakeup */
> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop2
> +
> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
> +	lbz	r3,0(r3)
> +	cmpwi	r3,1

Same comment about dynamic patching.

> +	/*  skip fastsleep workaround if its not needed */
> +	bne	timebase_resync
> +
> +	/* Undo fast sleep workaround */
> +	mfcr	r16	/* Backup CR into a non-volatile register */

Why ? If you have your non-volatiles saved you can use an NV CR like CR2
no ? You need to restore those anyway...

Also same comments as on the way down vs barriers when doing
lock/unlock.

> +	li	r3,1
> +	li	r4,0
> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
> +	bl	opal_call_realmode
> +	mtcr	r16	/* Restore CR */
> +
> +	/* Do timebase resync if we are waking up from sleep. Use cr1 value
> +	 * set in exceptions-64s.S */
> +	ble	cr1,clear_lock
> +
> +timebase_resync:
>  	/* Time base re-sync */
> -	li	r3,OPAL_RESYNC_TIMEBASE
> +	li	r0,OPAL_RESYNC_TIMEBASE
>  	bl	opal_call_realmode;
> -
>  	/* TODO: Check r3 for failure */
>  
> +clear_lock:
> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	stw	r15,0(r14)
> +
> +common_exit:
> +	li	r5,PNV_THREAD_RUNNING
> +	stb     r5,PACA_THREAD_IDLE_STATE(r13)
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	li      r0,KVM_HWTHREAD_IN_KERNEL
> +	stb     r0,HSTATE_HWTHREAD_STATE(r13)
> +	/* Order setting hwthread_state vs. testing hwthread_req */
> +	sync
> +	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
> +	cmpwi   r0,0
> +	beq     6f
> +	b       kvm_start_guest
> +6:
> +#endif
> +
>  	REST_NVGPRS(r1)
>  	REST_GPR(2, r1)
>  	ld	r3,_CCR(r1)
> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
> index feb549a..b2aa93b 100644
> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
> @@ -158,6 +158,43 @@ opal_tracepoint_return:
>  	blr
>  #endif
>  
> +/*
> + * Make opal call in realmode. This is a generic function to be called
> + * from realmode. It handles endianness.
> + *
> + * r13 - paca pointer
> + * r1  - stack pointer
> + * r0  - opal token
> + */
> +_GLOBAL(opal_call_realmode)
> +	mflr	r12
> +	std	r12,_LINK(r1)
> +	ld	r2,PACATOC(r13)
> +	/* Set opal return address */
> +	LOAD_REG_ADDR(r12,return_from_opal_call)
> +	mtlr	r12
> +
> +	mfmsr	r12
> +#ifdef __LITTLE_ENDIAN__
> +	/* Handle endian-ness */
> +	li	r11,MSR_LE
> +	andc	r12,r12,r11
> +#endif
> +	mtspr	SPRN_HSRR1,r12
> +	LOAD_REG_ADDR(r11,opal)
> +	ld	r12,8(r11)
> +	ld	r2,0(r11)
> +	mtspr	SPRN_HSRR0,r12
> +	hrfid
> +
> +return_from_opal_call:
> +#ifdef __LITTLE_ENDIAN__
> +	FIXUP_ENDIAN
> +#endif
> +	ld	r12,_LINK(r1)
> +	mtlr	r12
> +	blr
> +
>  OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
>  OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
>  OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index 34c6665..17fb98c 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -36,6 +36,8 @@
>  #include <asm/opal.h>
>  #include <asm/kexec.h>
>  #include <asm/smp.h>
> +#include <asm/cputhreads.h>
> +#include <asm/cpuidle.h>
>  
>  #include "powernv.h"
>  
> @@ -292,11 +294,45 @@ static void __init pnv_setup_machdep_rtas(void)
>  
>  static u32 supported_cpuidle_states;
>  
> +static void pnv_alloc_idle_core_states(void)
> +{
> +	int i, j;
> +	int nr_cores = cpu_nr_cores();
> +	u32 *core_idle_state;
> +
> +	/*
> +	 * core_idle_state - First 8 bits track the idle state of each thread
> +	 * of the core. The 8th bit is the lock bit. Initially all thread bits
> +	 * are set. They are cleared when the thread enters deep idle state
> +	 * like sleep and winkle. Initially the lock bit is cleared.
> +	 * The lock bit has 2 purposes
> +	 * a. While the first thread is restoring core state, it prevents
> +	 * from other threads in the core from switching to prcoess context.
> +	 * b. While the last thread in the core is saving the core state, it
> +	 * prevent a different thread from waking up.
> +	 */
> +	for (i = 0; i < nr_cores; i++) {
> +		int first_cpu = i * threads_per_core;
> +		int node = cpu_to_node(first_cpu);
> +
> +		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
> +		for (j = 0; j < threads_per_core; j++) {
> +			int cpu = first_cpu + j;
> +
> +			paca[cpu].core_idle_state_ptr = core_idle_state;
> +			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
> +
> +		}
> +	}
> +}
> +
>  u32 pnv_get_supported_cpuidle_states(void)
>  {
>  	return supported_cpuidle_states;
>  }
> +EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
>  
> +u8 pnv_need_fastsleep_workaround;
>  static int __init pnv_init_idle_states(void)
>  {
>  	struct device_node *power_mgt;
> @@ -306,6 +342,7 @@ static int __init pnv_init_idle_states(void)
>  	int i;
>  
>  	supported_cpuidle_states = 0;
> +	pnv_need_fastsleep_workaround = 0;
>  
>  	if (cpuidle_disable != IDLE_NO_OVERRIDE)
>  		return 0;
> @@ -332,13 +369,14 @@ static int __init pnv_init_idle_states(void)
>  		flags = be32_to_cpu(idle_state_flags[i]);
>  		supported_cpuidle_states |= flags;
>  	}
> -
> +	if (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)
> +		pnv_need_fastsleep_workaround = 1;
> +	pnv_alloc_idle_core_states();
>  	return 0;
>  }
>  
>  subsys_initcall(pnv_init_idle_states);
>  
> -
>  static int __init pnv_probe(void)
>  {
>  	unsigned long root = of_get_flat_dt_root();
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index 3dc4cec..12b761a 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -167,7 +167,8 @@ static void pnv_smp_cpu_kill_self(void)
>  	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
>  	while (!generic_check_cpu_restart(cpu)) {
>  		ppc64_runlatch_off();
> -		if (idle_states & OPAL_PM_SLEEP_ENABLED)
> +		if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
> +				(idle_states & OPAL_PM_SLEEP_ENABLED_ER1))
>  			power7_sleep();
>  		else
>  			power7_nap(1);
> diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
> index 0a7d827..a489b56 100644
> --- a/drivers/cpuidle/cpuidle-powernv.c
> +++ b/drivers/cpuidle/cpuidle-powernv.c
> @@ -208,7 +208,8 @@ static int powernv_add_idle_states(void)
>  			nr_idle_states++;
>  		}
>  
> -		if (flags & OPAL_PM_SLEEP_ENABLED) {
> +		if (flags & OPAL_PM_SLEEP_ENABLED ||
> +			flags & OPAL_PM_SLEEP_ENABLED_ER1) {
>  			/* Add FASTSLEEP state */
>  			strcpy(powernv_states[nr_idle_states].name, "FastSleep");
>  			strcpy(powernv_states[nr_idle_states].desc, "FastSleep");


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shreyas B. Prabhu Nov. 27, 2014, 5:55 a.m. UTC | #2
Hi Ben,

On Thursday 27 November 2014 06:07 AM, Benjamin Herrenschmidt wrote:
> 
>>  
>> @@ -37,8 +38,7 @@
>>  
>>  /*
>>   * Pass requested state in r3:
>> - * 	0 - nap
>> - * 	1 - sleep
>> + *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE
>>   *
>>   * To check IRQ_HAPPENED in r4
>>   * 	0 - don't check
>> @@ -123,12 +123,62 @@ power7_enter_nap_mode:
>>  	li	r4,KVM_HWTHREAD_IN_NAP
>>  	stb	r4,HSTATE_HWTHREAD_STATE(r13)
>>  #endif
>> -	cmpwi	cr0,r3,1
>> -	beq	2f
>> +	stb	r3,PACA_THREAD_IDLE_STATE(r13)
>> +	cmpwi	cr1,r3,PNV_THREAD_SLEEP
>> +	bge	cr1,2f
>>  	IDLE_STATE_ENTER_SEQ(PPC_NAP)
>>  	/* No return */
>> -2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> -	/* No return */
>> +2:
>> +	/* Sleep or winkle */
>> +	li	r7,1
>> +	mfspr	r8,SPRN_PIR
>> +	/*
>> +	 * The last 3 bits of PIR represents the thread id of a cpu
>> +	 * in power8. This will need adjusting for power7.
>> +	 */
>> +	andi.	r8,r8,0x07			/* Get thread id into r8 */
>> +	rotld	r7,r7,r8
>> +
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> 
> I assume we have already saved all non-volatile registers ? Because you
> are clobbering one here and more below.

Yes. At this stage the all non-volatile registers are already saved in
stack.
> 
>> +lwarx_loop1:
>> +	lwarx	r15,0,r14
>> +	andc	r15,r15,r7			/* Clear thread bit */
>> +
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	beq	last_thread
>> +
>> +	/* Not the last thread to goto sleep */
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop1
>> +	b	common_enter
>> +
>> +last_thread:
>> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
>> +	lbz	r3,0(r3)
>> +	cmpwi	r3,1
>> +	bne	common_enter
> 
> This looks wrong. If the workaround is 0, we don't do the stwcx. at
> all... Did you try with pnv_need_fastsleep_workaround set to 0 ? It
> should work most of the time as long as you don't hit the fairly
> rare race window :)
> 

My bad. I missed the stwcx. in the pnv_need_fastsleep_workaround = 0 path.
> Also it would be nice to make the above a dynamically patches feature
> section, though that means pnv_need_fastsleep_workaround needs to turn
> into a CPU feature bit and that needs to be done *very* early on.
> 
> Another option is to patch out manually from the pnv code the pair:
> 
> 	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>  	beq	last_thread
> 
> To turn them into nops by hand rather than using the feature system.
> 

Okay. I'll see which works out best here.
>> +	/*
>> +	 * Last thread of the core entering sleep. Last thread needs to execute
>> +	 * the hardware bug workaround code. Before that, set the lock bit to
>> +	 * avoid the race of other threads waking up and undoing workaround
>> +	 * before workaround is applied.
>> +	 */
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop1
>> +
>> +	/* Fast sleep workaround */
>> +	li	r3,1
>> +	li	r4,1
>> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
>> +	bl	opal_call_realmode
>> +
>> +	/* Clear Lock bit */
> 
> It's a lock, I would add a lwsync here to be safe, and I would add an
> isync before the bne- above. Just to ensure that whatever is done
> inside that locked section remains in there.
>

Okay. Will add it.
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	stw	r15,0(r14)
>> +
>> +common_enter: /* common code for all the threads entering sleep */
>> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>>  
>>  _GLOBAL(power7_idle)
>>  	/* Now check if user or arch enabled NAP mode */
>> @@ -141,49 +191,16 @@ _GLOBAL(power7_idle)
>>  
>>  _GLOBAL(power7_nap)
>>  	mr	r4,r3
>> -	li	r3,0
>> +	li	r3,PNV_THREAD_NAP
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>>  _GLOBAL(power7_sleep)
>> -	li	r3,1
>> +	li	r3,PNV_THREAD_SLEEP
>>  	li	r4,1
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>> -/*
>> - * Make opal call in realmode. This is a generic function to be called
>> - * from realmode from reset vector. It handles endianess.
>> - *
>> - * r13 - paca pointer
>> - * r1  - stack pointer
>> - * r3  - opal token
>> - */
>> -opal_call_realmode:
>> -	mflr	r12
>> -	std	r12,_LINK(r1)
>> -	ld	r2,PACATOC(r13)
>> -	/* Set opal return address */
>> -	LOAD_REG_ADDR(r0,return_from_opal_call)
>> -	mtlr	r0
>> -	/* Handle endian-ness */
>> -	li	r0,MSR_LE
>> -	mfmsr	r12
>> -	andc	r12,r12,r0
>> -	mtspr	SPRN_HSRR1,r12
>> -	mr	r0,r3			/* Move opal token to r0 */
>> -	LOAD_REG_ADDR(r11,opal)
>> -	ld	r12,8(r11)
>> -	ld	r2,0(r11)
>> -	mtspr	SPRN_HSRR0,r12
>> -	hrfid
>> -
>> -return_from_opal_call:
>> -	FIXUP_ENDIAN
>> -	ld	r0,_LINK(r1)
>> -	mtlr	r0
>> -	blr
>> -
>>  #define CHECK_HMI_INTERRUPT						\
>>  	mfspr	r0,SPRN_SRR1;						\
>>  BEGIN_FTR_SECTION_NESTED(66);						\
>> @@ -196,10 +213,8 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>>  	/* Invoke opal call to handle hmi */				\
>>  	ld	r2,PACATOC(r13);					\
>>  	ld	r1,PACAR1(r13);						\
>> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
>> -	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>> +	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>>  	bl	opal_call_realmode;					\
>> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
>>  20:	nop;
>>  
>>
>> @@ -210,12 +225,91 @@ _GLOBAL(power7_wakeup_tb_loss)
>>  BEGIN_FTR_SECTION
>>  	CHECK_HMI_INTERRUPT
>>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>> +
>> +	li	r7,1
>> +	mfspr	r8,SPRN_PIR
>> +	/*
>> +	 * The last 3 bits of PIR represents the thread id of a cpu
>> +	 * in power8. This will need adjusting for power7.
>> +	 */
>> +	andi.	r8,r8,0x07		/* Get thread id into r8 */
> 
> I'd be more comfortable if we patched that instruction at boot with the
> right mask.
> 

Okay. I'll make the change.
>> +	rotld	r7,r7,r8
>> +	/* r7 now has 'thread_id'th bit set */
>> +
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
>> +lwarx_loop2:
>> +	lwarx	r15,0,r14
>> +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	/*
>> +	 * Lock bit is set in one of the 2 cases-
>> +	 * a. In the sleep/winkle enter path, the last thread is executing
>> +	 * fastsleep workaround code.
>> +	 * b. In the wake up path, another thread is executing fastsleep
>> +	 * workaround undo code or resyncing timebase or restoring context
>> +	 * In either case loop until the lock bit is cleared.
>> +	 */
>> +	bne	lwarx_loop2
> 
> We should do some smt priority games here otherwise the spinning threads
> are going to slow down the one with the lock. Basically, if we see the
> lock held, go out of line, smt_low, spin on a normal load, and when
> smt_medium and go back to lwarx
> 

Okay.
>> +	cmpwi	cr2,r15,0
>> +	or	r15,r15,r7		/* Set thread bit */
>> +
>> +	beq	cr2,first_thread
>> +
>> +	/* Not first thread in core to wake up */
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop2
>> +	b	common_exit
>> +
>> +first_thread:
>> +	/* First thread in core to wakeup */
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop2
>> +
>> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
>> +	lbz	r3,0(r3)
>> +	cmpwi	r3,1
> 
> Same comment about dynamic patching.

Okay.
> 
>> +	/*  skip fastsleep workaround if its not needed */
>> +	bne	timebase_resync
>> +
>> +	/* Undo fast sleep workaround */
>> +	mfcr	r16	/* Backup CR into a non-volatile register */
> 
> Why ? If you have your non-volatiles saved you can use an NV CR like CR2
> no ? You need to restore those anyway...
> 

I wasn't sure CRs were preserved during OPAL call. I'll make the change
to use CR[234].
> Also same comments as on the way down vs barriers when doing
> lock/unlock.
> 

Okay.
>> +	li	r3,1
>> +	li	r4,0
>> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
>> +	bl	opal_call_realmode
>> +	mtcr	r16	/* Restore CR */
>> +
>> +	/* Do timebase resync if we are waking up from sleep. Use cr1 value
>> +	 * set in exceptions-64s.S */
>> +	ble	cr1,clear_lock
>> +
>> +timebase_resync:
>>  	/* Time base re-sync */
>> -	li	r3,OPAL_RESYNC_TIMEBASE
>> +	li	r0,OPAL_RESYNC_TIMEBASE
>>  	bl	opal_call_realmode;
>> -
>>  	/* TODO: Check r3 for failure */
>>  
>> +clear_lock:
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	stw	r15,0(r14)
>> +
>> +common_exit:
>> +	li	r5,PNV_THREAD_RUNNING
>> +	stb     r5,PACA_THREAD_IDLE_STATE(r13)
>> +
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +	li      r0,KVM_HWTHREAD_IN_KERNEL
>> +	stb     r0,HSTATE_HWTHREAD_STATE(r13)
>> +	/* Order setting hwthread_state vs. testing hwthread_req */
>> +	sync
>> +	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
>> +	cmpwi   r0,0
>> +	beq     6f
>> +	b       kvm_start_guest
>> +6:
>> +#endif
>> +
>>  	REST_NVGPRS(r1)
>>  	REST_GPR(2, r1)
>>  	ld	r3,_CCR(r1)
>> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
>> index feb549a..b2aa93b 100644
>> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
>> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
>> @@ -158,6 +158,43 @@ opal_tracepoint_return:
>>  	blr
>>  #endif
>>  
>> +/*
>> + * Make opal call in realmode. This is a generic function to be called
>> + * from realmode. It handles endianness.
>> + *
>> + * r13 - paca pointer
>> + * r1  - stack pointer
>> + * r0  - opal token
>> + */
>> +_GLOBAL(opal_call_realmode)
>> +	mflr	r12
>> +	std	r12,_LINK(r1)
>> +	ld	r2,PACATOC(r13)
>> +	/* Set opal return address */
>> +	LOAD_REG_ADDR(r12,return_from_opal_call)
>> +	mtlr	r12
>> +
>> +	mfmsr	r12
>> +#ifdef __LITTLE_ENDIAN__
>> +	/* Handle endian-ness */
>> +	li	r11,MSR_LE
>> +	andc	r12,r12,r11
>> +#endif
>> +	mtspr	SPRN_HSRR1,r12
>> +	LOAD_REG_ADDR(r11,opal)
>> +	ld	r12,8(r11)
>> +	ld	r2,0(r11)
>> +	mtspr	SPRN_HSRR0,r12
>> +	hrfid
>> +
>> +return_from_opal_call:
>> +#ifdef __LITTLE_ENDIAN__
>> +	FIXUP_ENDIAN
>> +#endif
>> +	ld	r12,_LINK(r1)
>> +	mtlr	r12
>> +	blr
>> +
>>  OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
>>  OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
>>  OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
>> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
>> index 34c6665..17fb98c 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -36,6 +36,8 @@
>>  #include <asm/opal.h>
>>  #include <asm/kexec.h>
>>  #include <asm/smp.h>
>> +#include <asm/cputhreads.h>
>> +#include <asm/cpuidle.h>
>>  
>>  #include "powernv.h"
>>  
>> @@ -292,11 +294,45 @@ static void __init pnv_setup_machdep_rtas(void)
>>  
>>  static u32 supported_cpuidle_states;
>>  
>> +static void pnv_alloc_idle_core_states(void)
>> +{
>> +	int i, j;
>> +	int nr_cores = cpu_nr_cores();
>> +	u32 *core_idle_state;
>> +
>> +	/*
>> +	 * core_idle_state - First 8 bits track the idle state of each thread
>> +	 * of the core. The 8th bit is the lock bit. Initially all thread bits
>> +	 * are set. They are cleared when the thread enters deep idle state
>> +	 * like sleep and winkle. Initially the lock bit is cleared.
>> +	 * The lock bit has 2 purposes
>> +	 * a. While the first thread is restoring core state, it prevents
>> +	 * from other threads in the core from switching to prcoess context.
>> +	 * b. While the last thread in the core is saving the core state, it
>> +	 * prevent a different thread from waking up.
>> +	 */
>> +	for (i = 0; i < nr_cores; i++) {
>> +		int first_cpu = i * threads_per_core;
>> +		int node = cpu_to_node(first_cpu);
>> +
>> +		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
>> +		for (j = 0; j < threads_per_core; j++) {
>> +			int cpu = first_cpu + j;
>> +
>> +			paca[cpu].core_idle_state_ptr = core_idle_state;
>> +			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
>> +
>> +		}
>> +	}
>> +}
>> +
>>  u32 pnv_get_supported_cpuidle_states(void)
>>  {
>>  	return supported_cpuidle_states;
>>  }
>> +EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
>>  
>> +u8 pnv_need_fastsleep_workaround;
>>  static int __init pnv_init_idle_states(void)
>>  {
>>  	struct device_node *power_mgt;
>> @@ -306,6 +342,7 @@ static int __init pnv_init_idle_states(void)
>>  	int i;
>>  
>>  	supported_cpuidle_states = 0;
>> +	pnv_need_fastsleep_workaround = 0;
>>  
>>  	if (cpuidle_disable != IDLE_NO_OVERRIDE)
>>  		return 0;
>> @@ -332,13 +369,14 @@ static int __init pnv_init_idle_states(void)
>>  		flags = be32_to_cpu(idle_state_flags[i]);
>>  		supported_cpuidle_states |= flags;
>>  	}
>> -
>> +	if (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)
>> +		pnv_need_fastsleep_workaround = 1;
>> +	pnv_alloc_idle_core_states();
>>  	return 0;
>>  }
>>  
>>  subsys_initcall(pnv_init_idle_states);
>>  
>> -
>>  static int __init pnv_probe(void)
>>  {
>>  	unsigned long root = of_get_flat_dt_root();
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index 3dc4cec..12b761a 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -167,7 +167,8 @@ static void pnv_smp_cpu_kill_self(void)
>>  	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
>>  	while (!generic_check_cpu_restart(cpu)) {
>>  		ppc64_runlatch_off();
>> -		if (idle_states & OPAL_PM_SLEEP_ENABLED)
>> +		if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
>> +				(idle_states & OPAL_PM_SLEEP_ENABLED_ER1))
>>  			power7_sleep();
>>  		else
>>  			power7_nap(1);
>> diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
>> index 0a7d827..a489b56 100644
>> --- a/drivers/cpuidle/cpuidle-powernv.c
>> +++ b/drivers/cpuidle/cpuidle-powernv.c
>> @@ -208,7 +208,8 @@ static int powernv_add_idle_states(void)
>>  			nr_idle_states++;
>>  		}
>>  
>> -		if (flags & OPAL_PM_SLEEP_ENABLED) {
>> +		if (flags & OPAL_PM_SLEEP_ENABLED ||
>> +			flags & OPAL_PM_SLEEP_ENABLED_ER1) {
>>  			/* Add FASTSLEEP state */
>>  			strcpy(powernv_states[nr_idle_states].name, "FastSleep");
>>  			strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
> 
> 
Thanks,
Shreyas

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Nov. 27, 2014, 11:50 p.m. UTC | #3
On Tue, Nov 25, 2014 at 04:47:58PM +0530, Shreyas B. Prabhu wrote:
[snip]
> +2:
> +	/* Sleep or winkle */
> +	li	r7,1
> +	mfspr	r8,SPRN_PIR
> +	/*
> +	 * The last 3 bits of PIR represents the thread id of a cpu
> +	 * in power8. This will need adjusting for power7.
> +	 */
> +	andi.	r8,r8,0x07			/* Get thread id into r8 */
> +	rotld	r7,r7,r8

I would suggest adding another u8 field to the paca to store our
thread bit, and initialize it to 1 << (cpu_id % threads_per_core)
early on.  That will handle the POWER7 case correctly and reduce these
four instructions to one.

> +
> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> +lwarx_loop1:
> +	lwarx	r15,0,r14
> +	andc	r15,r15,r7			/* Clear thread bit */
> +
> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	beq	last_thread
> +
> +	/* Not the last thread to goto sleep */
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop1
> +	b	common_enter
> +
> +last_thread:
> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
> +	lbz	r3,0(r3)
> +	cmpwi	r3,1
> +	bne	common_enter
> +	/*
> +	 * Last thread of the core entering sleep. Last thread needs to execute
> +	 * the hardware bug workaround code. Before that, set the lock bit to
> +	 * avoid the race of other threads waking up and undoing workaround
> +	 * before workaround is applied.
> +	 */
> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop1
> +
> +	/* Fast sleep workaround */
> +	li	r3,1
> +	li	r4,1
> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
> +	bl	opal_call_realmode
> +
> +	/* Clear Lock bit */
> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	stw	r15,0(r14)

In this case we know the result of the andi. will be 0, so this could
be just li r0,0; stw r0,0(r14).

> +
> +common_enter: /* common code for all the threads entering sleep */
> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>  
>  _GLOBAL(power7_idle)
>  	/* Now check if user or arch enabled NAP mode */
> @@ -141,49 +191,16 @@ _GLOBAL(power7_idle)
>  
>  _GLOBAL(power7_nap)
>  	mr	r4,r3
> -	li	r3,0
> +	li	r3,PNV_THREAD_NAP
>  	b	power7_powersave_common
>  	/* No return */
>  
>  _GLOBAL(power7_sleep)
> -	li	r3,1
> +	li	r3,PNV_THREAD_SLEEP
>  	li	r4,1
>  	b	power7_powersave_common
>  	/* No return */
>  
> -/*
> - * Make opal call in realmode. This is a generic function to be called
> - * from realmode from reset vector. It handles endianess.
> - *
> - * r13 - paca pointer
> - * r1  - stack pointer
> - * r3  - opal token
> - */
> -opal_call_realmode:
> -	mflr	r12
> -	std	r12,_LINK(r1)
> -	ld	r2,PACATOC(r13)
> -	/* Set opal return address */
> -	LOAD_REG_ADDR(r0,return_from_opal_call)
> -	mtlr	r0
> -	/* Handle endian-ness */
> -	li	r0,MSR_LE
> -	mfmsr	r12
> -	andc	r12,r12,r0
> -	mtspr	SPRN_HSRR1,r12
> -	mr	r0,r3			/* Move opal token to r0 */
> -	LOAD_REG_ADDR(r11,opal)
> -	ld	r12,8(r11)
> -	ld	r2,0(r11)
> -	mtspr	SPRN_HSRR0,r12
> -	hrfid
> -
> -return_from_opal_call:
> -	FIXUP_ENDIAN
> -	ld	r0,_LINK(r1)
> -	mtlr	r0
> -	blr
> -
>  #define CHECK_HMI_INTERRUPT						\
>  	mfspr	r0,SPRN_SRR1;						\
>  BEGIN_FTR_SECTION_NESTED(66);						\
> @@ -196,10 +213,8 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>  	/* Invoke opal call to handle hmi */				\
>  	ld	r2,PACATOC(r13);					\
>  	ld	r1,PACAR1(r13);						\
> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
> -	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
> +	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>  	bl	opal_call_realmode;					\
> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
>  20:	nop;
>  
>  
> @@ -210,12 +225,91 @@ _GLOBAL(power7_wakeup_tb_loss)
>  BEGIN_FTR_SECTION
>  	CHECK_HMI_INTERRUPT
>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> +
> +	li	r7,1
> +	mfspr	r8,SPRN_PIR
> +	/*
> +	 * The last 3 bits of PIR represents the thread id of a cpu
> +	 * in power8. This will need adjusting for power7.
> +	 */
> +	andi.	r8,r8,0x07		/* Get thread id into r8 */
> +	rotld	r7,r7,r8
> +	/* r7 now has 'thread_id'th bit set */
> +
> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> +lwarx_loop2:
> +	lwarx	r15,0,r14
> +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
> +	/*
> +	 * Lock bit is set in one of the 2 cases-
> +	 * a. In the sleep/winkle enter path, the last thread is executing
> +	 * fastsleep workaround code.
> +	 * b. In the wake up path, another thread is executing fastsleep
> +	 * workaround undo code or resyncing timebase or restoring context
> +	 * In either case loop until the lock bit is cleared.
> +	 */
> +	bne	lwarx_loop2
> +
> +	cmpwi	cr2,r15,0
> +	or	r15,r15,r7		/* Set thread bit */
> +
> +	beq	cr2,first_thread
> +
> +	/* Not first thread in core to wake up */
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop2
> +	b	common_exit
> +
> +first_thread:
> +	/* First thread in core to wakeup */
> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
> +	stwcx.	r15,0,r14
> +	bne-	lwarx_loop2
> +
> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
> +	lbz	r3,0(r3)
> +	cmpwi	r3,1
> +	/*  skip fastsleep workaround if its not needed */
> +	bne	timebase_resync
> +
> +	/* Undo fast sleep workaround */
> +	mfcr	r16	/* Backup CR into a non-volatile register */
> +	li	r3,1
> +	li	r4,0
> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
> +	bl	opal_call_realmode
> +	mtcr	r16	/* Restore CR */
> +
> +	/* Do timebase resync if we are waking up from sleep. Use cr1 value
> +	 * set in exceptions-64s.S */
> +	ble	cr1,clear_lock
> +
> +timebase_resync:
>  	/* Time base re-sync */
> -	li	r3,OPAL_RESYNC_TIMEBASE
> +	li	r0,OPAL_RESYNC_TIMEBASE
>  	bl	opal_call_realmode;

So if pnv_need_fastsleep_workaround is zero, we always do the timebase
resync, but if pnv_need_fastsleep_workaround is one, we only do the
timebase resync if we had a loss of state.  Is that really what you
meant?

> -
>  	/* TODO: Check r3 for failure */
>  
> +clear_lock:
> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
> +	stw	r15,0(r14)
> +
> +common_exit:
> +	li	r5,PNV_THREAD_RUNNING
> +	stb     r5,PACA_THREAD_IDLE_STATE(r13)
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	li      r0,KVM_HWTHREAD_IN_KERNEL
> +	stb     r0,HSTATE_HWTHREAD_STATE(r13)
> +	/* Order setting hwthread_state vs. testing hwthread_req */
> +	sync
> +	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
> +	cmpwi   r0,0
> +	beq     6f
> +	b       kvm_start_guest
> +6:
> +#endif

I'd prefer not to duplicate this code.  Could you instead branch back
to the code in exceptions-64s.S?  Or call this code via a bl and get
back to exceptions-64s.S via a blr.

> +
>  	REST_NVGPRS(r1)
>  	REST_GPR(2, r1)
>  	ld	r3,_CCR(r1)
> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
> index feb549a..b2aa93b 100644
> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
> @@ -158,6 +158,43 @@ opal_tracepoint_return:
>  	blr
>  #endif
>  
> +/*
> + * Make opal call in realmode. This is a generic function to be called
> + * from realmode. It handles endianness.
> + *
> + * r13 - paca pointer
> + * r1  - stack pointer
> + * r0  - opal token
> + */
> +_GLOBAL(opal_call_realmode)
> +	mflr	r12
> +	std	r12,_LINK(r1)

This is a bug waiting to happen.  Using _LINK(r1) was OK in this
code's previous location, since there we know there is a
INT_FRAME_SIZE-sized stack frame and the _LINK field is basically
unused.  Now that you're making this available to call from anywhere,
you can't trash the caller's stack frame like this.  You need to use
PPC_LR_STKOFF(r1) instead.

> +	ld	r2,PACATOC(r13)
> +	/* Set opal return address */
> +	LOAD_REG_ADDR(r12,return_from_opal_call)
> +	mtlr	r12
> +
> +	mfmsr	r12
> +#ifdef __LITTLE_ENDIAN__
> +	/* Handle endian-ness */
> +	li	r11,MSR_LE
> +	andc	r12,r12,r11
> +#endif
> +	mtspr	SPRN_HSRR1,r12
> +	LOAD_REG_ADDR(r11,opal)
> +	ld	r12,8(r11)
> +	ld	r2,0(r11)
> +	mtspr	SPRN_HSRR0,r12
> +	hrfid
> +
> +return_from_opal_call:
> +#ifdef __LITTLE_ENDIAN__
> +	FIXUP_ENDIAN
> +#endif
> +	ld	r12,_LINK(r1)
> +	mtlr	r12
> +	blr

Paul.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shreyas B. Prabhu Dec. 1, 2014, 6:27 a.m. UTC | #4
On Friday 28 November 2014 05:20 AM, Paul Mackerras wrote:
> On Tue, Nov 25, 2014 at 04:47:58PM +0530, Shreyas B. Prabhu wrote:
> [snip]
>> +2:
>> +	/* Sleep or winkle */
>> +	li	r7,1
>> +	mfspr	r8,SPRN_PIR
>> +	/*
>> +	 * The last 3 bits of PIR represents the thread id of a cpu
>> +	 * in power8. This will need adjusting for power7.
>> +	 */
>> +	andi.	r8,r8,0x07			/* Get thread id into r8 */
>> +	rotld	r7,r7,r8
> 
> I would suggest adding another u8 field to the paca to store our
> thread bit, and initialize it to 1 << (cpu_id % threads_per_core)
> early on.  That will handle the POWER7 case correctly and reduce these
> four instructions to one.
> 
Okay. I'll make the change. 
>> +
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
>> +lwarx_loop1:
>> +	lwarx	r15,0,r14
>> +	andc	r15,r15,r7			/* Clear thread bit */
>> +
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	beq	last_thread
>> +
>> +	/* Not the last thread to goto sleep */
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop1
>> +	b	common_enter
>> +
>> +last_thread:
>> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
>> +	lbz	r3,0(r3)
>> +	cmpwi	r3,1
>> +	bne	common_enter
>> +	/*
>> +	 * Last thread of the core entering sleep. Last thread needs to execute
>> +	 * the hardware bug workaround code. Before that, set the lock bit to
>> +	 * avoid the race of other threads waking up and undoing workaround
>> +	 * before workaround is applied.
>> +	 */
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop1
>> +
>> +	/* Fast sleep workaround */
>> +	li	r3,1
>> +	li	r4,1
>> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
>> +	bl	opal_call_realmode
>> +
>> +	/* Clear Lock bit */
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	stw	r15,0(r14)
> 
> In this case we know the result of the andi. will be 0, so this could
> be just li r0,0; stw r0,0(r14).
> 

Yes. I'll make the change.
>> +
>> +common_enter: /* common code for all the threads entering sleep */
>> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>>  
>>  _GLOBAL(power7_idle)
>>  	/* Now check if user or arch enabled NAP mode */
>> @@ -141,49 +191,16 @@ _GLOBAL(power7_idle)
>>  
>>  _GLOBAL(power7_nap)
>>  	mr	r4,r3
>> -	li	r3,0
>> +	li	r3,PNV_THREAD_NAP
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>>  _GLOBAL(power7_sleep)
>> -	li	r3,1
>> +	li	r3,PNV_THREAD_SLEEP
>>  	li	r4,1
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>> -/*
>> - * Make opal call in realmode. This is a generic function to be called
>> - * from realmode from reset vector. It handles endianess.
>> - *
>> - * r13 - paca pointer
>> - * r1  - stack pointer
>> - * r3  - opal token
>> - */
>> -opal_call_realmode:
>> -	mflr	r12
>> -	std	r12,_LINK(r1)
>> -	ld	r2,PACATOC(r13)
>> -	/* Set opal return address */
>> -	LOAD_REG_ADDR(r0,return_from_opal_call)
>> -	mtlr	r0
>> -	/* Handle endian-ness */
>> -	li	r0,MSR_LE
>> -	mfmsr	r12
>> -	andc	r12,r12,r0
>> -	mtspr	SPRN_HSRR1,r12
>> -	mr	r0,r3			/* Move opal token to r0 */
>> -	LOAD_REG_ADDR(r11,opal)
>> -	ld	r12,8(r11)
>> -	ld	r2,0(r11)
>> -	mtspr	SPRN_HSRR0,r12
>> -	hrfid
>> -
>> -return_from_opal_call:
>> -	FIXUP_ENDIAN
>> -	ld	r0,_LINK(r1)
>> -	mtlr	r0
>> -	blr
>> -
>>  #define CHECK_HMI_INTERRUPT						\
>>  	mfspr	r0,SPRN_SRR1;						\
>>  BEGIN_FTR_SECTION_NESTED(66);						\
>> @@ -196,10 +213,8 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>>  	/* Invoke opal call to handle hmi */				\
>>  	ld	r2,PACATOC(r13);					\
>>  	ld	r1,PACAR1(r13);						\
>> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
>> -	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>> +	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>>  	bl	opal_call_realmode;					\
>> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
>>  20:	nop;
>>  
>>  
>> @@ -210,12 +225,91 @@ _GLOBAL(power7_wakeup_tb_loss)
>>  BEGIN_FTR_SECTION
>>  	CHECK_HMI_INTERRUPT
>>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>> +
>> +	li	r7,1
>> +	mfspr	r8,SPRN_PIR
>> +	/*
>> +	 * The last 3 bits of PIR represents the thread id of a cpu
>> +	 * in power8. This will need adjusting for power7.
>> +	 */
>> +	andi.	r8,r8,0x07		/* Get thread id into r8 */
>> +	rotld	r7,r7,r8
>> +	/* r7 now has 'thread_id'th bit set */
>> +
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
>> +lwarx_loop2:
>> +	lwarx	r15,0,r14
>> +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	/*
>> +	 * Lock bit is set in one of the 2 cases-
>> +	 * a. In the sleep/winkle enter path, the last thread is executing
>> +	 * fastsleep workaround code.
>> +	 * b. In the wake up path, another thread is executing fastsleep
>> +	 * workaround undo code or resyncing timebase or restoring context
>> +	 * In either case loop until the lock bit is cleared.
>> +	 */
>> +	bne	lwarx_loop2
>> +
>> +	cmpwi	cr2,r15,0
>> +	or	r15,r15,r7		/* Set thread bit */
>> +
>> +	beq	cr2,first_thread
>> +
>> +	/* Not first thread in core to wake up */
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop2
>> +	b	common_exit
>> +
>> +first_thread:
>> +	/* First thread in core to wakeup */
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	bne-	lwarx_loop2
>> +
>> +	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
>> +	lbz	r3,0(r3)
>> +	cmpwi	r3,1
>> +	/*  skip fastsleep workaround if its not needed */
>> +	bne	timebase_resync
>> +
>> +	/* Undo fast sleep workaround */
>> +	mfcr	r16	/* Backup CR into a non-volatile register */
>> +	li	r3,1
>> +	li	r4,0
>> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
>> +	bl	opal_call_realmode
>> +	mtcr	r16	/* Restore CR */
>> +
>> +	/* Do timebase resync if we are waking up from sleep. Use cr1 value
>> +	 * set in exceptions-64s.S */
>> +	ble	cr1,clear_lock
>> +
>> +timebase_resync:
>>  	/* Time base re-sync */
>> -	li	r3,OPAL_RESYNC_TIMEBASE
>> +	li	r0,OPAL_RESYNC_TIMEBASE
>>  	bl	opal_call_realmode;
> 
> So if pnv_need_fastsleep_workaround is zero, we always do the timebase
> resync, but if pnv_need_fastsleep_workaround is one, we only do the
> timebase resync if we had a loss of state.  Is that really what you
> meant?
> 
Timebase resync is needed only if we have a loss of state, irrespective of
pnv_need_fastsleep_workaround. 
But I see here in pnv_need_fastsleep_workaround = 0 path, I am doing timebase
resync unconditionally. I'll fix that.

The correct flow is, if its the first thread in the core waking up from sleep
- call OPAL_CONFIG_CPU_IDLE_STATE if pnv_need_fastsleep_workaround = 1
- If we had a state loss, call OPAL_RESYNC_TIMEBASE to resync timebase.
>> -
>>  	/* TODO: Check r3 for failure */
>>  
>> +clear_lock:
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	stw	r15,0(r14)
>> +
>> +common_exit:
>> +	li	r5,PNV_THREAD_RUNNING
>> +	stb     r5,PACA_THREAD_IDLE_STATE(r13)
>> +
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +	li      r0,KVM_HWTHREAD_IN_KERNEL
>> +	stb     r0,HSTATE_HWTHREAD_STATE(r13)
>> +	/* Order setting hwthread_state vs. testing hwthread_req */
>> +	sync
>> +	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
>> +	cmpwi   r0,0
>> +	beq     6f
>> +	b       kvm_start_guest
>> +6:
>> +#endif
> 
> I'd prefer not to duplicate this code.  Could you instead branch back
> to the code in exceptions-64s.S?  Or call this code via a bl and get
> back to exceptions-64s.S via a blr.
> 

Okay.
>> +
>>  	REST_NVGPRS(r1)
>>  	REST_GPR(2, r1)
>>  	ld	r3,_CCR(r1)
>> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
>> index feb549a..b2aa93b 100644
>> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
>> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
>> @@ -158,6 +158,43 @@ opal_tracepoint_return:
>>  	blr
>>  #endif
>>  
>> +/*
>> + * Make opal call in realmode. This is a generic function to be called
>> + * from realmode. It handles endianness.
>> + *
>> + * r13 - paca pointer
>> + * r1  - stack pointer
>> + * r0  - opal token
>> + */
>> +_GLOBAL(opal_call_realmode)
>> +	mflr	r12
>> +	std	r12,_LINK(r1)
> 
> This is a bug waiting to happen.  Using _LINK(r1) was OK in this
> code's previous location, since there we know there is a
> INT_FRAME_SIZE-sized stack frame and the _LINK field is basically
> unused.  Now that you're making this available to call from anywhere,
> you can't trash the caller's stack frame like this.  You need to use
> PPC_LR_STKOFF(r1) instead.
>
 
Okay.
>> +	ld	r2,PACATOC(r13)
>> +	/* Set opal return address */
>> +	LOAD_REG_ADDR(r12,return_from_opal_call)
>> +	mtlr	r12
>> +
>> +	mfmsr	r12
>> +#ifdef __LITTLE_ENDIAN__
>> +	/* Handle endian-ness */
>> +	li	r11,MSR_LE
>> +	andc	r12,r12,r11
>> +#endif
>> +	mtspr	SPRN_HSRR1,r12
>> +	LOAD_REG_ADDR(r11,opal)
>> +	ld	r12,8(r11)
>> +	ld	r2,0(r11)
>> +	mtspr	SPRN_HSRR0,r12
>> +	hrfid
>> +
>> +return_from_opal_call:
>> +#ifdef __LITTLE_ENDIAN__
>> +	FIXUP_ENDIAN
>> +#endif
>> +	ld	r12,_LINK(r1)
>> +	mtlr	r12
>> +	blr
> 
> Paul.
> 

Thanks,
Shreyas

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
new file mode 100644
index 0000000..8c82850
--- /dev/null
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -0,0 +1,14 @@ 
+#ifndef _ASM_POWERPC_CPUIDLE_H
+#define _ASM_POWERPC_CPUIDLE_H
+
+#ifdef CONFIG_PPC_POWERNV
+/* Used in powernv idle state management */
+#define PNV_THREAD_RUNNING              0
+#define PNV_THREAD_NAP                  1
+#define PNV_THREAD_SLEEP                2
+#define PNV_THREAD_WINKLE               3
+#define PNV_CORE_IDLE_LOCK_BIT          0x100
+#define PNV_CORE_IDLE_THREAD_BITS       0x0FF
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index f8b95c0..bef7fbc 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -152,6 +152,7 @@  struct opal_sg_list {
 #define OPAL_PCI_ERR_INJECT			96
 #define OPAL_PCI_EEH_FREEZE_SET			97
 #define OPAL_HANDLE_HMI				98
+#define OPAL_CONFIG_CPU_IDLE_STATE		99
 #define OPAL_REGISTER_DUMP_REGION		101
 #define OPAL_UNREGISTER_DUMP_REGION		102
 
@@ -162,6 +163,7 @@  struct opal_sg_list {
  */
 #define OPAL_PM_NAP_ENABLED	0x00010000
 #define OPAL_PM_SLEEP_ENABLED	0x00020000
+#define OPAL_PM_SLEEP_ENABLED_ER1	0x00080000
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index a5139ea..85aeedb 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -158,6 +158,10 @@  struct paca_struct {
 	 * early exception handler for use by high level C handler
 	 */
 	struct opal_machine_check_event *opal_mc_evt;
+
+	/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
+	u32 *core_idle_state_ptr;
+	u8 thread_idle_state;		/* ~Idle[0]/Nap[1]/Sleep[2]/Winkle[3] */
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	/* Exclusive emergency stack pointer for machine check exception. */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9d7dede..50f299e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -731,6 +731,10 @@  int main(void)
 	DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0));
 	DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1));
 	DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt));
+	DEFINE(PACA_CORE_IDLE_STATE_PTR,
+			offsetof(struct paca_struct, core_idle_state_ptr));
+	DEFINE(PACA_THREAD_IDLE_STATE,
+			offsetof(struct paca_struct, thread_idle_state));
 #endif
 
 	return 0;
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 72e783e..3311c8d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -15,6 +15,7 @@ 
 #include <asm/hw_irq.h>
 #include <asm/exception-64s.h>
 #include <asm/ptrace.h>
+#include <asm/cpuidle.h>
 
 /*
  * We layout physical memory as follows:
@@ -109,15 +110,19 @@  BEGIN_FTR_SECTION
 	rlwinm.	r13,r13,47-31,30,31
 	beq	9f
 
-	/* waking up from powersave (nap) state */
 	cmpwi	cr1,r13,2
-	/* Total loss of HV state is fatal, we could try to use the
-	 * PIR to locate a PACA, then use an emergency stack etc...
-	 * OPAL v3 based powernv platforms have new idle states
-	 * which fall in this catagory.
-	 */
-	bgt	cr1,8f
+
 	GET_PACA(r13)
+	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi   cr2,r0,PNV_THREAD_NAP
+	bgt     cr2,8f				/* Either sleep or Winkle */
+
+	/* Waking up from nap should not cause hypervisor state loss */
+	bgt	cr1,.
+
+	/* Waking up from nap */
+	li	r0,PNV_THREAD_RUNNING
+	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	li	r0,KVM_HWTHREAD_IN_KERNEL
@@ -1386,6 +1391,7 @@  machine_check_handle_early:
 	MACHINE_CHECK_HANDLER_WINDUP
 	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
+	li	r3,PNV_THREAD_NAP
 	b	power7_enter_nap_mode
 4:
 #endif
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 283c603..c1d590f 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -18,6 +18,7 @@ 
 #include <asm/hw_irq.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/opal.h>
+#include <asm/cpuidle.h>
 
 #undef DEBUG
 
@@ -37,8 +38,7 @@ 
 
 /*
  * Pass requested state in r3:
- * 	0 - nap
- * 	1 - sleep
+ *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE
  *
  * To check IRQ_HAPPENED in r4
  * 	0 - don't check
@@ -123,12 +123,62 @@  power7_enter_nap_mode:
 	li	r4,KVM_HWTHREAD_IN_NAP
 	stb	r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
-	cmpwi	cr0,r3,1
-	beq	2f
+	stb	r3,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr1,r3,PNV_THREAD_SLEEP
+	bge	cr1,2f
 	IDLE_STATE_ENTER_SEQ(PPC_NAP)
 	/* No return */
-2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
-	/* No return */
+2:
+	/* Sleep or winkle */
+	li	r7,1
+	mfspr	r8,SPRN_PIR
+	/*
+	 * The last 3 bits of PIR represents the thread id of a cpu
+	 * in power8. This will need adjusting for power7.
+	 */
+	andi.	r8,r8,0x07			/* Get thread id into r8 */
+	rotld	r7,r7,r8
+
+	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+lwarx_loop1:
+	lwarx	r15,0,r14
+	andc	r15,r15,r7			/* Clear thread bit */
+
+	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	beq	last_thread
+
+	/* Not the last thread to goto sleep */
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop1
+	b	common_enter
+
+last_thread:
+	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
+	lbz	r3,0(r3)
+	cmpwi	r3,1
+	bne	common_enter
+	/*
+	 * Last thread of the core entering sleep. Last thread needs to execute
+	 * the hardware bug workaround code. Before that, set the lock bit to
+	 * avoid the race of other threads waking up and undoing workaround
+	 * before workaround is applied.
+	 */
+	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop1
+
+	/* Fast sleep workaround */
+	li	r3,1
+	li	r4,1
+	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
+	bl	opal_call_realmode
+
+	/* Clear Lock bit */
+	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	stw	r15,0(r14)
+
+common_enter: /* common code for all the threads entering sleep */
+	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
 
 _GLOBAL(power7_idle)
 	/* Now check if user or arch enabled NAP mode */
@@ -141,49 +191,16 @@  _GLOBAL(power7_idle)
 
 _GLOBAL(power7_nap)
 	mr	r4,r3
-	li	r3,0
+	li	r3,PNV_THREAD_NAP
 	b	power7_powersave_common
 	/* No return */
 
 _GLOBAL(power7_sleep)
-	li	r3,1
+	li	r3,PNV_THREAD_SLEEP
 	li	r4,1
 	b	power7_powersave_common
 	/* No return */
 
-/*
- * Make opal call in realmode. This is a generic function to be called
- * from realmode from reset vector. It handles endianess.
- *
- * r13 - paca pointer
- * r1  - stack pointer
- * r3  - opal token
- */
-opal_call_realmode:
-	mflr	r12
-	std	r12,_LINK(r1)
-	ld	r2,PACATOC(r13)
-	/* Set opal return address */
-	LOAD_REG_ADDR(r0,return_from_opal_call)
-	mtlr	r0
-	/* Handle endian-ness */
-	li	r0,MSR_LE
-	mfmsr	r12
-	andc	r12,r12,r0
-	mtspr	SPRN_HSRR1,r12
-	mr	r0,r3			/* Move opal token to r0 */
-	LOAD_REG_ADDR(r11,opal)
-	ld	r12,8(r11)
-	ld	r2,0(r11)
-	mtspr	SPRN_HSRR0,r12
-	hrfid
-
-return_from_opal_call:
-	FIXUP_ENDIAN
-	ld	r0,_LINK(r1)
-	mtlr	r0
-	blr
-
 #define CHECK_HMI_INTERRUPT						\
 	mfspr	r0,SPRN_SRR1;						\
 BEGIN_FTR_SECTION_NESTED(66);						\
@@ -196,10 +213,8 @@  ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
 	/* Invoke opal call to handle hmi */				\
 	ld	r2,PACATOC(r13);					\
 	ld	r1,PACAR1(r13);						\
-	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
-	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
+	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
 	bl	opal_call_realmode;					\
-	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
 20:	nop;
 
 
@@ -210,12 +225,91 @@  _GLOBAL(power7_wakeup_tb_loss)
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+
+	li	r7,1
+	mfspr	r8,SPRN_PIR
+	/*
+	 * The last 3 bits of PIR represents the thread id of a cpu
+	 * in power8. This will need adjusting for power7.
+	 */
+	andi.	r8,r8,0x07		/* Get thread id into r8 */
+	rotld	r7,r7,r8
+	/* r7 now has 'thread_id'th bit set */
+
+	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+lwarx_loop2:
+	lwarx	r15,0,r14
+	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
+	/*
+	 * Lock bit is set in one of the 2 cases-
+	 * a. In the sleep/winkle enter path, the last thread is executing
+	 * fastsleep workaround code.
+	 * b. In the wake up path, another thread is executing fastsleep
+	 * workaround undo code or resyncing timebase or restoring context
+	 * In either case loop until the lock bit is cleared.
+	 */
+	bne	lwarx_loop2
+
+	cmpwi	cr2,r15,0
+	or	r15,r15,r7		/* Set thread bit */
+
+	beq	cr2,first_thread
+
+	/* Not first thread in core to wake up */
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop2
+	b	common_exit
+
+first_thread:
+	/* First thread in core to wakeup */
+	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop2
+
+	LOAD_REG_ADDR(r3, pnv_need_fastsleep_workaround)
+	lbz	r3,0(r3)
+	cmpwi	r3,1
+	/*  skip fastsleep workaround if its not needed */
+	bne	timebase_resync
+
+	/* Undo fast sleep workaround */
+	mfcr	r16	/* Backup CR into a non-volatile register */
+	li	r3,1
+	li	r4,0
+	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
+	bl	opal_call_realmode
+	mtcr	r16	/* Restore CR */
+
+	/* Do timebase resync if we are waking up from sleep. Use cr1 value
+	 * set in exceptions-64s.S */
+	ble	cr1,clear_lock
+
+timebase_resync:
 	/* Time base re-sync */
-	li	r3,OPAL_RESYNC_TIMEBASE
+	li	r0,OPAL_RESYNC_TIMEBASE
 	bl	opal_call_realmode;
-
 	/* TODO: Check r3 for failure */
 
+clear_lock:
+	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	stw	r15,0(r14)
+
+common_exit:
+	li	r5,PNV_THREAD_RUNNING
+	stb     r5,PACA_THREAD_IDLE_STATE(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li      r0,KVM_HWTHREAD_IN_KERNEL
+	stb     r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi   r0,0
+	beq     6f
+	b       kvm_start_guest
+6:
+#endif
+
 	REST_NVGPRS(r1)
 	REST_GPR(2, r1)
 	ld	r3,_CCR(r1)
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index feb549a..b2aa93b 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -158,6 +158,43 @@  opal_tracepoint_return:
 	blr
 #endif
 
+/*
+ * Make opal call in realmode. This is a generic function to be called
+ * from realmode. It handles endianness.
+ *
+ * r13 - paca pointer
+ * r1  - stack pointer
+ * r0  - opal token
+ */
+_GLOBAL(opal_call_realmode)
+	mflr	r12
+	std	r12,_LINK(r1)
+	ld	r2,PACATOC(r13)
+	/* Set opal return address */
+	LOAD_REG_ADDR(r12,return_from_opal_call)
+	mtlr	r12
+
+	mfmsr	r12
+#ifdef __LITTLE_ENDIAN__
+	/* Handle endian-ness */
+	li	r11,MSR_LE
+	andc	r12,r12,r11
+#endif
+	mtspr	SPRN_HSRR1,r12
+	LOAD_REG_ADDR(r11,opal)
+	ld	r12,8(r11)
+	ld	r2,0(r11)
+	mtspr	SPRN_HSRR0,r12
+	hrfid
+
+return_from_opal_call:
+#ifdef __LITTLE_ENDIAN__
+	FIXUP_ENDIAN
+#endif
+	ld	r12,_LINK(r1)
+	mtlr	r12
+	blr
+
 OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
 OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
 OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 34c6665..17fb98c 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -36,6 +36,8 @@ 
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
 
 #include "powernv.h"
 
@@ -292,11 +294,45 @@  static void __init pnv_setup_machdep_rtas(void)
 
 static u32 supported_cpuidle_states;
 
+static void pnv_alloc_idle_core_states(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	u32 *core_idle_state;
+
+	/*
+	 * core_idle_state - First 8 bits track the idle state of each thread
+	 * of the core. The 8th bit is the lock bit. Initially all thread bits
+	 * are set. They are cleared when the thread enters deep idle state
+	 * like sleep and winkle. Initially the lock bit is cleared.
+	 * The lock bit has 2 purposes
+	 * a. While the first thread is restoring core state, it prevents
+	 * from other threads in the core from switching to prcoess context.
+	 * b. While the last thread in the core is saving the core state, it
+	 * prevent a different thread from waking up.
+	 */
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca[cpu].core_idle_state_ptr = core_idle_state;
+			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
+
+		}
+	}
+}
+
 u32 pnv_get_supported_cpuidle_states(void)
 {
 	return supported_cpuidle_states;
 }
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
 
+u8 pnv_need_fastsleep_workaround;
 static int __init pnv_init_idle_states(void)
 {
 	struct device_node *power_mgt;
@@ -306,6 +342,7 @@  static int __init pnv_init_idle_states(void)
 	int i;
 
 	supported_cpuidle_states = 0;
+	pnv_need_fastsleep_workaround = 0;
 
 	if (cpuidle_disable != IDLE_NO_OVERRIDE)
 		return 0;
@@ -332,13 +369,14 @@  static int __init pnv_init_idle_states(void)
 		flags = be32_to_cpu(idle_state_flags[i]);
 		supported_cpuidle_states |= flags;
 	}
-
+	if (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)
+		pnv_need_fastsleep_workaround = 1;
+	pnv_alloc_idle_core_states();
 	return 0;
 }
 
 subsys_initcall(pnv_init_idle_states);
 
-
 static int __init pnv_probe(void)
 {
 	unsigned long root = of_get_flat_dt_root();
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 3dc4cec..12b761a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,8 @@  static void pnv_smp_cpu_kill_self(void)
 	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
 	while (!generic_check_cpu_restart(cpu)) {
 		ppc64_runlatch_off();
-		if (idle_states & OPAL_PM_SLEEP_ENABLED)
+		if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+				(idle_states & OPAL_PM_SLEEP_ENABLED_ER1))
 			power7_sleep();
 		else
 			power7_nap(1);
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 0a7d827..a489b56 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -208,7 +208,8 @@  static int powernv_add_idle_states(void)
 			nr_idle_states++;
 		}
 
-		if (flags & OPAL_PM_SLEEP_ENABLED) {
+		if (flags & OPAL_PM_SLEEP_ENABLED ||
+			flags & OPAL_PM_SLEEP_ENABLED_ER1) {
 			/* Add FASTSLEEP state */
 			strcpy(powernv_states[nr_idle_states].name, "FastSleep");
 			strcpy(powernv_states[nr_idle_states].desc, "FastSleep");