diff mbox series

[kvm-unit-tests,v9,16/31] powerpc: add SMP and IPI support

Message ID 20240504122841.1177683-17-npiggin@gmail.com (mailing list archive)
State New, archived
Headers show
Series powerpc improvements | expand

Commit Message

Nicholas Piggin May 4, 2024, 12:28 p.m. UTC
powerpc SMP support is very primitive and does not set up a first-class
runtime environment for secondary CPUs.

This reworks SMP support, and provides a complete C and harness
environment for the secondaries, including interrupt handling, as well
as IPI support.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 lib/powerpc/asm/processor.h |  23 +++
 lib/powerpc/asm/reg.h       |   1 +
 lib/powerpc/asm/setup.h     |   2 -
 lib/powerpc/asm/smp.h       |  44 +++--
 lib/powerpc/io.c            |  15 +-
 lib/powerpc/processor.c     |   7 +-
 lib/powerpc/setup.c         |  90 +++++++---
 lib/powerpc/smp.c           | 337 ++++++++++++++++++++++++++++++----
 lib/ppc64/asm-offsets.c     |   9 +
 lib/ppc64/asm/atomic.h      |   6 +
 lib/ppc64/asm/barrier.h     |   2 +
 lib/ppc64/asm/opal.h        |   7 +
 powerpc/Makefile.common     |   1 +
 powerpc/cstart64.S          |  50 +++++-
 powerpc/selftest.c          |   4 +-
 powerpc/smp.c               | 348 ++++++++++++++++++++++++++++++++++++
 powerpc/tm.c                |   5 +-
 powerpc/unittests.cfg       |   8 +
 18 files changed, 875 insertions(+), 84 deletions(-)
 create mode 100644 lib/ppc64/asm/atomic.h
 create mode 100644 powerpc/smp.c

Comments

Thomas Huth June 4, 2024, 5:14 a.m. UTC | #1
On 04/05/2024 14.28, Nicholas Piggin wrote:
> powerpc SMP support is very primitive and does not set up a first-class
> runtime environment for secondary CPUs.
> 
> This reworks SMP support, and provides a complete C and harness
> environment for the secondaries, including interrupt handling, as well
> as IPI support.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

I now skimmed through the patch and it looks fine so far:

Acked-by: Thomas Huth <thuth@redhat.com>
diff mbox series

Patch

diff --git a/lib/powerpc/asm/processor.h b/lib/powerpc/asm/processor.h
index eed37d1f4..a3859b5d4 100644
--- a/lib/powerpc/asm/processor.h
+++ b/lib/powerpc/asm/processor.h
@@ -16,6 +16,7 @@  extern bool cpu_has_siar;
 extern bool cpu_has_heai;
 extern bool cpu_has_prefix;
 extern bool cpu_has_sc_lev;
+extern bool cpu_has_pause_short;
 
 static inline uint64_t mfspr(int nr)
 {
@@ -45,6 +46,28 @@  static inline void mtmsr(uint64_t msr)
 	asm volatile ("mtmsrd %[msr]" :: [msr] "r" (msr) : "memory");
 }
 
+static inline void local_irq_enable(void)
+{
+	unsigned long msr;
+
+	asm volatile(
+"		mfmsr	%0		\n \
+		ori	%0,%0,%1	\n \
+		mtmsrd	%0,1		"
+		: "=r"(msr) : "i"(MSR_EE): "memory");
+}
+
+static inline void local_irq_disable(void)
+{
+	unsigned long msr;
+
+	asm volatile(
+"		mfmsr	%0		\n \
+		andc	%0,%0,%1	\n \
+		mtmsrd	%0,1		"
+		: "=r"(msr) : "r"(MSR_EE): "memory");
+}
+
 /*
  * This returns true on PowerNV / OPAL machines which run in hypervisor
  * mode. False on pseries / PAPR machines that run in guest mode.
diff --git a/lib/powerpc/asm/reg.h b/lib/powerpc/asm/reg.h
index d6097f48f..d2ca964c4 100644
--- a/lib/powerpc/asm/reg.h
+++ b/lib/powerpc/asm/reg.h
@@ -19,6 +19,7 @@ 
 #define SPR_SPRG1	0x111
 #define SPR_SPRG2	0x112
 #define SPR_SPRG3	0x113
+#define SPR_TBU40	0x11e
 #define SPR_PVR		0x11f
 #define   PVR_VERSION_MASK	UL(0xffff0000)
 #define   PVR_VER_970		UL(0x00390000)
diff --git a/lib/powerpc/asm/setup.h b/lib/powerpc/asm/setup.h
index cc7cf5e25..9ca318ce6 100644
--- a/lib/powerpc/asm/setup.h
+++ b/lib/powerpc/asm/setup.h
@@ -8,8 +8,6 @@ 
 #include <libcflat.h>
 
 #define NR_CPUS			8	/* arbitrarily set for now */
-extern u32 cpus[NR_CPUS];
-extern int nr_cpus;
 
 extern uint64_t tb_hz;
 
diff --git a/lib/powerpc/asm/smp.h b/lib/powerpc/asm/smp.h
index 21940b4bc..c45988bfa 100644
--- a/lib/powerpc/asm/smp.h
+++ b/lib/powerpc/asm/smp.h
@@ -2,21 +2,45 @@ 
 #define _ASMPOWERPC_SMP_H_
 
 #include <libcflat.h>
+#include <asm/processor.h>
 
-extern int nr_threads;
+typedef void (*secondary_entry_fn)(int cpu_id);
 
-struct start_threads {
-	int nr_threads;
-	int nr_started;
+struct cpu {
+	unsigned long server_no;
+	unsigned long stack;
+	unsigned long exception_stack;
+	secondary_entry_fn entry;
 };
 
-typedef void (*secondary_entry_fn)(void);
+extern int nr_cpus_present;
+extern int nr_cpus_online;
+extern struct cpu cpus[];
 
-extern void halt(void);
+register struct cpu *__current_cpu asm("r13");
+static inline struct cpu *current_cpu(void)
+{
+	return __current_cpu;
+}
 
-extern int start_thread(int cpu_id, secondary_entry_fn entry, uint32_t r3);
-extern struct start_threads start_cpu(int cpu_node, secondary_entry_fn entry,
-				      uint32_t r3);
-extern bool start_all_cpus(secondary_entry_fn entry, uint32_t r3);
+static inline int smp_processor_id(void)
+{
+	return current_cpu()->server_no;
+}
+
+void cpu_init(struct cpu *cpu, int cpu_id);
+
+extern void halt(int cpu_id);
+
+extern bool start_all_cpus(secondary_entry_fn entry);
+extern void stop_all_cpus(void);
+
+struct pt_regs;
+void register_ipi(void (*fn)(struct pt_regs *, void *), void *data);
+void unregister_ipi(void);
+void cpu_init_ipis(void);
+void local_ipi_enable(void);
+void local_ipi_disable(void);
+void send_ipi(int cpu_id);
 
 #endif /* _ASMPOWERPC_SMP_H_ */
diff --git a/lib/powerpc/io.c b/lib/powerpc/io.c
index ab7bb843c..cb7f2f050 100644
--- a/lib/powerpc/io.c
+++ b/lib/powerpc/io.c
@@ -10,6 +10,7 @@ 
 #include <asm/rtas.h>
 #include <asm/setup.h>
 #include <asm/processor.h>
+#include <asm/atomic.h>
 #include "io.h"
 
 static struct spinlock print_lock;
@@ -55,13 +56,17 @@  extern void halt(int code);
 
 void exit(int code)
 {
+	static int exited = 0;
+
 // FIXME: change this print-exit/rtas-poweroff to chr_testdev_exit(),
 //        maybe by plugging chr-testdev into a spapr-vty.
-	printf("\nEXIT: STATUS=%d\n", ((code) << 1) | 1);
-	if (machine_is_powernv())
-		opal_power_off();
-	else
-		rtas_power_off();
+	if (atomic_fetch_inc(&exited) == 0) {
+		printf("\nEXIT: STATUS=%d\n", ((code) << 1) | 1);
+		if (machine_is_powernv())
+			opal_power_off();
+		else
+			rtas_power_off();
+	}
 	halt(code);
 	__builtin_unreachable();
 }
diff --git a/lib/powerpc/processor.c b/lib/powerpc/processor.c
index 1b4bb0d61..a6ce3c905 100644
--- a/lib/powerpc/processor.c
+++ b/lib/powerpc/processor.c
@@ -13,6 +13,7 @@ 
 #include <asm/barrier.h>
 #include <asm/hcall.h>
 #include <asm/handlers.h>
+#include <asm/smp.h>
 
 static struct {
 	void (*func)(struct pt_regs *, void *data);
@@ -44,6 +45,8 @@  void do_handle_exception(struct pt_regs *regs)
 {
 	unsigned char v;
 
+	__current_cpu = (struct cpu *)mfspr(SPR_SPRG0);
+
 	v = regs->trap >> 5;
 
 	if (v < 128 && handlers[v].func) {
@@ -51,8 +54,8 @@  void do_handle_exception(struct pt_regs *regs)
 		return;
 	}
 
-	printf("Unhandled cpu exception %#lx at NIA:0x%016lx MSR:0x%016lx\n",
-			regs->trap, regs->nip, regs->msr);
+	printf("Unhandled CPU%d exception %#lx at NIA:0x%016lx MSR:0x%016lx\n",
+		smp_processor_id(), regs->trap, regs->nip, regs->msr);
 	dump_frame_stack((void *)regs->nip, (void *)regs->gpr[1]);
 	abort();
 }
diff --git a/lib/powerpc/setup.c b/lib/powerpc/setup.c
index 73ca2f931..622b99e5d 100644
--- a/lib/powerpc/setup.c
+++ b/lib/powerpc/setup.c
@@ -2,7 +2,7 @@ 
  * Initialize machine setup information and I/O.
  *
  * After running setup() unit tests may query how many cpus they have
- * (nr_cpus), how much memory they have (PHYSICAL_END - PHYSICAL_START),
+ * (nr_cpus_present), how much memory they have (PHYSICAL_END - PHYSICAL_START),
  * may use dynamic memory allocation (malloc, etc.), printf, and exit.
  * Finally, argc and argv are also ready to be passed to main().
  *
@@ -18,6 +18,7 @@ 
 #include <alloc_page.h>
 #include <argv.h>
 #include <asm/setup.h>
+#include <asm/smp.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
@@ -29,8 +30,8 @@  extern unsigned long stacktop;
 char *initrd;
 u32 initrd_size;
 
-u32 cpus[NR_CPUS] = { [0 ... NR_CPUS-1] = (~0U) };
-int nr_cpus;
+u32 cpu_to_hwid[NR_CPUS] = { [0 ... NR_CPUS-1] = (~0U) };
+int nr_cpus_present;
 uint64_t tb_hz;
 
 struct mem_region mem_regions[NR_MEM_REGIONS];
@@ -45,13 +46,32 @@  struct cpu_set_params {
 
 static void cpu_set(int fdtnode, u64 regval, void *info)
 {
+	const struct fdt_property *prop;
+	u32 *threads;
 	static bool read_common_info = false;
 	struct cpu_set_params *params = info;
-	int cpu = nr_cpus++;
-
-	assert_msg(cpu < NR_CPUS, "Number cpus exceeds maximum supported (%d).", NR_CPUS);
-
-	cpus[cpu] = regval;
+	int nr_threads;
+	int len, i;
+
+	/* Get the id array of threads on this node */
+	prop = fdt_get_property(dt_fdt(), fdtnode,
+				"ibm,ppc-interrupt-server#s", &len);
+	assert(prop);
+
+	nr_threads = len >> 2; /* Divide by 4 since 4 bytes per thread */
+	threads = (u32 *)prop->data; /* Array of valid ids */
+
+	for (i = 0; i < nr_threads; i++) {
+		if (nr_cpus_present >= NR_CPUS) {
+			static bool warned = false;
+			if (!warned) {
+				printf("Warning: Number of present CPUs exceeds maximum supported (%d).\n", NR_CPUS);
+				warned = true;
+			}
+			break;
+		}
+		cpu_to_hwid[nr_cpus_present++] = fdt32_to_cpu(threads[i]);
+	}
 
 	if (!read_common_info) {
 		const struct fdt_property *prop;
@@ -85,32 +105,25 @@  bool cpu_has_siar;
 bool cpu_has_heai;
 bool cpu_has_prefix;
 bool cpu_has_sc_lev; /* sc interrupt has LEV field in SRR1 */
+bool cpu_has_pause_short;
 
-static void cpu_init(void)
+static void cpu_init_params(void)
 {
 	struct cpu_set_params params;
 	int ret;
 
-	nr_cpus = 0;
+	nr_cpus_present = 0;
 	ret = dt_for_each_cpu_node(cpu_set, &params);
 	assert(ret == 0);
 	__icache_bytes = params.icache_bytes;
 	__dcache_bytes = params.dcache_bytes;
 	tb_hz = params.tb_hz;
 
-	/* Interrupt Endianness */
-	if (machine_is_pseries()) {
-#if  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		hcall(H_SET_MODE, 1, 4, 0, 0);
-#else
-		hcall(H_SET_MODE, 0, 4, 0, 0);
-#endif
-	}
-
 	switch (mfspr(SPR_PVR) & PVR_VERSION_MASK) {
 	case PVR_VER_POWER10:
 		cpu_has_prefix = true;
 		cpu_has_sc_lev = true;
+		cpu_has_pause_short = true;
 	case PVR_VER_POWER9:
 	case PVR_VER_POWER8E:
 	case PVR_VER_POWER8NVL:
@@ -183,19 +196,37 @@  static void mem_init(phys_addr_t freemem_start)
 #define EXCEPTION_STACK_SIZE	SZ_64K
 
 static char boot_exception_stack[EXCEPTION_STACK_SIZE];
+struct cpu cpus[NR_CPUS];
+
+void cpu_init(struct cpu *cpu, int cpu_id)
+{
+	cpu->server_no = cpu_id;
+
+	cpu->stack = (unsigned long)memalign(SZ_4K, SZ_64K);
+	cpu->stack += SZ_64K - 64;
+	cpu->exception_stack = (unsigned long)memalign(SZ_4K, SZ_64K);
+	cpu->exception_stack += SZ_64K - 64;
+}
 
 void setup(const void *fdt)
 {
 	void *freemem = &stacktop;
 	const char *bootargs, *tmp;
+	struct cpu *cpu;
 	u32 fdt_size;
 	int ret;
 
 	cpu_has_hv = !!(mfmsr() & (1ULL << MSR_HV_BIT));
 
-	/* set exception stack address for this CPU (in SPGR0) */
-	asm volatile ("mtsprg0 %[addr]" ::
-		      [addr] "r" (boot_exception_stack + EXCEPTION_STACK_SIZE - 64));
+	memset(cpus, 0xff, sizeof(cpus));
+
+	cpu = &cpus[0];
+	cpu->server_no = fdt_boot_cpuid_phys(fdt);
+	cpu->exception_stack = (unsigned long)boot_exception_stack;
+	cpu->exception_stack += EXCEPTION_STACK_SIZE - 64;
+
+	mtspr(SPR_SPRG0, (unsigned long)cpu);
+	__current_cpu = cpu;
 
 	enable_mcheck();
 
@@ -238,8 +269,19 @@  void setup(const void *fdt)
 
 	assert(STACK_INT_FRAME_SIZE % 16 == 0);
 
-	/* call init functions */
-	cpu_init();
+	/* set parameters from dt */
+	cpu_init_params();
+
+	/* Interrupt Endianness */
+	if (machine_is_pseries()) {
+#if  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		hcall(H_SET_MODE, 1, 4, 0, 0);
+#else
+		hcall(H_SET_MODE, 0, 4, 0, 0);
+#endif
+	}
+
+	cpu_init_ipis();
 
 	/* cpu_init must be called before mem_init */
 	mem_init(PAGE_ALIGN((unsigned long)freemem));
diff --git a/lib/powerpc/smp.c b/lib/powerpc/smp.c
index 3e211eba8..27b169841 100644
--- a/lib/powerpc/smp.c
+++ b/lib/powerpc/smp.c
@@ -6,58 +6,253 @@ 
  * This work is licensed under the terms of the GNU LGPL, version 2.
  */
 
+#include <alloc.h>
 #include <devicetree.h>
+#include <asm/atomic.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 #include <asm/time.h>
 #include <asm/setup.h>
+#include <asm/opal.h>
+#include <asm/hcall.h>
 #include <asm/rtas.h>
 #include <asm/smp.h>
 
-int nr_threads;
-
 struct secondary_entry_data {
 	secondary_entry_fn entry;
-	uint64_t r3;
-	int nr_started;
 };
 
+int nr_cpus_online = 1;
+
+static void stop_self(int cpu_id)
+{
+	if (machine_is_powernv()) {
+		if (opal_call(OPAL_RETURN_CPU, 0, 0, 0) != OPAL_SUCCESS) {
+			printf("OPAL_RETURN_CPU failed\n");
+		}
+	} else {
+		rtas_stop_self();
+	}
+
+	printf("failed to stop cpu %d\n", cpu_id);
+	assert(0);
+}
+
+void main_secondary(struct cpu *cpu);
+void main_secondary(struct cpu *cpu)
+{
+	mtspr(SPR_SPRG0, (unsigned long)cpu);
+	__current_cpu = cpu;
+
+	enable_mcheck();
+
+	cpu_init_ipis();
+
+	atomic_fetch_inc(&nr_cpus_online);
+
+	cpu->entry(cpu->server_no);
+
+	mb();
+	atomic_fetch_dec(&nr_cpus_online);
+
+	stop_self(cpu->server_no);
+}
+
+enum OpalThreadStatus {
+        OPAL_THREAD_INACTIVE = 0x0,
+        OPAL_THREAD_STARTED = 0x1,
+        OPAL_THREAD_UNAVAILABLE = 0x2 /* opal-v3 */
+};
+
+#define H_EOI		0x64
+#define H_CPPR		0x68
+#define H_IPI		0x6c
+#define H_XIRR		0x74
+
+static void (*ipi_fn)(struct pt_regs *regs, void *data);
+
+static void dbell_handler(struct pt_regs *regs, void *data)
+{
+	/* sync */
+	ipi_fn(regs, data);
+}
+
+/*
+ * External interrupt (0x500) handler, registered by register_ipi() when
+ * not running on powernv.
+ *
+ * Fetches the pending interrupt with H_XIRR, issues H_IPI with value 0xff
+ * for this CPU when the source number identifies an IPI, signals
+ * end-of-interrupt with H_EOI, then calls the registered ipi_fn.
+ *
+ * @regs: interrupted register state, passed through to ipi_fn
+ * @data: opaque pointer given to register_ipi(), passed through to ipi_fn
+ */
+static void extint_handler(struct pt_regs *regs, void *data)
+{
+	int32_t xirr;
+	int32_t xisr;
+	int64_t rc;
+
+	/*
+	 * The XIRR value is read back from r4 after the hypercall, so the
+	 * call is issued by hand. The template overwrites r3 and reads r4,
+	 * and the hypercall may modify the volatile registers, so all of
+	 * them are declared as clobbers; otherwise the compiler is free to
+	 * keep live values (or the operands themselves) in those registers.
+	 */
+	asm volatile("mr r3,%1 ; sc 1 ; mr %0,r4"
+		     : "=r"(xirr)
+		     : "r"(H_XIRR)
+		     : "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+		       "r11", "r12", "ctr", "xer", "cr0", "cr1", "cr5", "cr6",
+		       "cr7", "memory");
+	/* The low 24 bits of XIRR hold the interrupt source number */
+	xisr = xirr & 0xffffff;
+
+	if (xisr == 2) { /* IPI */
+		/* lower IPI */
+		rc = hcall(H_IPI, smp_processor_id(), 0xff);
+		assert(rc == H_SUCCESS);
+	}
+
+	/* Top byte set to 5, the same value local_ipi_enable() passes to H_CPPR */
+	xirr |= (5 << 24);
+	rc = hcall(H_EOI, xirr);
+	assert(rc == H_SUCCESS);
+
+	ipi_fn(regs, data);
+}
+
+void cpu_init_ipis(void)
+{
+	if (machine_is_powernv()) {
+		/* skiboot can leave some messages set */
+		unsigned long rb = (5 << (63-36));
+		asm volatile("msgclr	%0" :: "r"(rb) : "memory");
+	}
+}
+
+void local_ipi_enable(void)
+{
+	if (machine_is_pseries()) {
+		hcall(H_CPPR, 5);
+	}
+}
+
+void local_ipi_disable(void)
+{
+	if (machine_is_pseries()) {
+		hcall(H_CPPR, 0);
+	}
+}
+
+void register_ipi(void (*fn)(struct pt_regs *, void *), void *data)
+{
+	ipi_fn = fn;
+	if (machine_is_powernv()) {
+		handle_exception(0xe80, &dbell_handler, data);
+	} else {
+		handle_exception(0x500, &extint_handler, data);
+	}
+}
+
+void unregister_ipi(void)
+{
+	if (machine_is_powernv()) {
+		handle_exception(0xe80, NULL, NULL);
+	} else {
+		handle_exception(0x500, NULL, NULL);
+	}
+}
+
+void send_ipi(int cpu_id)
+{
+	if (machine_is_powernv()) {
+		unsigned long rb = (5 << (63-36)) | cpu_id;
+		asm volatile("lwsync" ::: "memory");
+		asm volatile("msgsnd	%0" :: "r"(rb) : "memory");
+	} else {
+		hcall(H_IPI, cpu_id, 4);
+	}
+}
+
+static int nr_started = 1;
+
+extern void start_secondary(uint64_t server_no); /* asm entry point */
+
+/*
+ * Ask firmware whether thread cpu_id is currently running.
+ *
+ * On powernv this uses OPAL_QUERY_CPU_STATUS and treats any status other
+ * than OPAL_THREAD_INACTIVE as running. Otherwise it uses the RTAS
+ * query-cpu-stopped-state call and treats a non-zero output as running.
+ *
+ * Returns:	true  if the cpu is not stopped/inactive, or if the RTAS
+ *		      query call itself fails
+ *		false if the cpu is stopped/inactive, or if the OPAL query
+ *		      or the RTAS token lookup fails
+ */
+static bool cpu_is_running(int cpu_id)
+{
+	if (machine_is_powernv()) {
+		int64_t ret;
+		uint8_t status;
+
+		ret = opal_call(OPAL_QUERY_CPU_STATUS, cpu_id, (unsigned long)&status, 0);
+		if (ret != OPAL_SUCCESS) {
+			printf("OPAL_QUERY_CPU_STATUS failed for cpu %d\n", cpu_id);
+			return false;
+		}
+		return (status != OPAL_THREAD_INACTIVE);
+	} else {
+		uint32_t query_token;
+		int outputs[1], ret;
+
+		ret = rtas_token("query-cpu-stopped-state", &query_token);
+		if (ret != 0) {
+			printf("rtas token query-cpu-stopped-state failed\n");
+			return false;
+		}
+
+		ret = rtas_call(query_token, 1, 2, outputs, cpu_id);
+		if (ret) {
+			printf("query-cpu-stopped-state failed for cpu %d\n", cpu_id);
+			/*
+			 * The state is unknown. Report the cpu as running
+			 * (explicitly, rather than by converting the RTAS
+			 * status to bool) so that callers keep polling until
+			 * their own timeout expires.
+			 */
+			return true;
+		}
+
+		/* Non-zero output: cpu not in stopped state */
+		return outputs[0] != 0;
+	}
+}
+
 /*
  * Start stopped thread cpu_id at entry
  * Returns:	<0 on failure to start stopped cpu
  *		0  on success
  *		>0 on cpu not in stopped state
  */
-int start_thread(int cpu_id, secondary_entry_fn entry, uint32_t r3)
+static int start_thread(int cpu_id, secondary_entry_fn entry)
 {
-	uint32_t query_token, start_token;
-	int outputs[1], ret;
+	struct cpu *cpu;
+	uint64_t tb;
 
-	ret = rtas_token("query-cpu-stopped-state", &query_token);
-	assert(ret == 0);
-	ret = rtas_token("start-cpu", &start_token);
-	assert(ret == 0);
+	if (nr_started >= NR_CPUS) {
+		/* Reached limit */
+		return -1;
+	}
+
+	if (cpu_id == smp_processor_id()) {
+		/* Boot CPU already started */
+		return -1;
+	}
+
+	tb = get_tb();
+	while (cpu_is_running(cpu_id)) {
+		if (get_tb() - tb > 3*tb_hz) {
+			printf("Unable to start running CPU:%d\n", cpu_id);
+			return 1;
+		}
+	}
+
+	cpu = &cpus[nr_started];
+	nr_started++;
+
+	cpu_init(cpu, cpu_id);
+	cpu->entry = entry;
+
+	if (machine_is_powernv()) {
+		if (opal_call(OPAL_START_CPU, cpu_id, (unsigned long)start_secondary, 0) != OPAL_SUCCESS) {
+			printf("failed to start cpu %d\n", cpu_id);
+			return -1;
+		}
+	} else {
+		uint32_t start_token;
+		int ret;
 
-	ret = rtas_call(query_token, 1, 2, outputs, cpu_id);
-	if (ret) {
-		printf("query-cpu-stopped-state failed for cpu %d\n", cpu_id);
-	} else if (!outputs[0]) { /* cpu in stopped state */
-		ret = rtas_call(start_token, 3, 1, NULL, cpu_id, entry, r3);
-		if (ret)
+		ret = rtas_token("start-cpu", &start_token);
+		assert(ret == 0);
+
+		ret = rtas_call(start_token, 3, 1, NULL, cpu_id, start_secondary, cpu_id);
+		if (ret) {
 			printf("failed to start cpu %d\n", cpu_id);
-	} else { /* cpu not in stopped state */
-		ret = outputs[0];
+			return ret;
+		}
 	}
 
-	return ret;
+	return 0;
 }
 
 /*
  * Start all stopped threads (vcpus) on cpu_node
  * Returns: Number of stopped cpus which were successfully started
  */
-struct start_threads start_cpu(int cpu_node, secondary_entry_fn entry,
-			       uint32_t r3)
+static void start_core(int cpu_node, secondary_entry_fn entry)
 {
-	int len, i, nr_threads, nr_started = 0;
+	int len, i, nr_threads;
 	const struct fdt_property *prop;
 	u32 *threads;
 
@@ -67,23 +262,18 @@  struct start_threads start_cpu(int cpu_node, secondary_entry_fn entry,
 	assert(prop);
 
 	nr_threads = len >> 2; /* Divide by 4 since 4 bytes per thread */
-	threads = (u32 *)prop->data; /* Array of valid ids */
 
-	for (i = 0; i < nr_threads; i++) {
-		if (!start_thread(fdt32_to_cpu(threads[i]), entry, r3))
-			nr_started++;
-	}
+	threads = (u32 *)prop->data; /* Array of valid ids */
 
-	return (struct start_threads) { nr_threads, nr_started };
+	for (i = 0; i < nr_threads; i++)
+		start_thread(fdt32_to_cpu(threads[i]), entry);
 }
 
 static void start_each_secondary(int fdtnode, u64 regval __unused, void *info)
 {
 	struct secondary_entry_data *datap = info;
-	struct start_threads ret = start_cpu(fdtnode, datap->entry, datap->r3);
 
-	nr_threads += ret.nr_threads;
-	datap->nr_started += ret.nr_started;
+	start_core(fdtnode, datap->entry);
 }
 
 /*
@@ -92,14 +282,89 @@  static void start_each_secondary(int fdtnode, u64 regval __unused, void *info)
  * Returns:	TRUE on success
  *		FALSE on failure
  */
-bool start_all_cpus(secondary_entry_fn entry, uint32_t r3)
+bool start_all_cpus(secondary_entry_fn entry)
 {
-	struct secondary_entry_data data = { entry, r3,	0 };
+	struct secondary_entry_data data = { entry };
+	uint64_t tb;
 	int ret;
 
+	assert(nr_cpus_online == 1);
+	assert(nr_started == 1);
 	ret = dt_for_each_cpu_node(start_each_secondary, &data);
 	assert(ret == 0);
+	assert(nr_started == nr_cpus_present);
+
+	tb = get_tb();
+	while (nr_cpus_online < nr_cpus_present) {
+		if (get_tb() - tb > 3*tb_hz) {
+			printf("failed to start all secondaries\n");
+			assert(0);
+		}
+		cpu_relax();
+	}
+
+	return 1;
+}
+
+/*
+ * Wait for thread cpu_id to reach the stopped state
+ * Returns:	0 once the cpu is no longer running, or immediately if
+ *		  cpu_id is the calling cpu (which is skipped)
+ *		>0 on timeout (3*tb_hz timebase ticks) waiting for it to stop
+ */
+static int wait_thread(int cpu_id)
+{
+	uint64_t tb;
+
+	/* Skip the caller */
+	if (cpu_id == smp_processor_id()) {
+		return 0;
+	}
+
+	tb = get_tb();
+	while (cpu_is_running(cpu_id)) {
+		if (get_tb() - tb > 3*tb_hz) {
+			printf("Timeout waiting to stop CPU:%d\n", cpu_id);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Wait for running threads (vcpus) on cpu_node to stop
+ */
+static void wait_core(int cpu_node)
+{
+	int len, i, nr_threads;
+	const struct fdt_property *prop;
+	u32 *threads;
+
+	/* Get the id array of threads on this cpu_node */
+	prop = fdt_get_property(dt_fdt(), cpu_node,
+				"ibm,ppc-interrupt-server#s", &len);
+	assert(prop);
+
+	nr_threads = len >> 2; /* Divide by 4 since 4 bytes per thread */
+
+	threads = (u32 *)prop->data; /* Array of valid ids */
+
+	for (i = 0; i < nr_threads; i++)
+		wait_thread(fdt32_to_cpu(threads[i]));
+}
+
+static void wait_each_secondary(int fdtnode, u64 regval __unused, void *info)
+{
+	wait_core(fdtnode);
+}
+
+void stop_all_cpus(void)
+{
+	while (nr_cpus_online > 1)
+		cpu_relax();
 
-	/* We expect that we come in with one thread already started */
-	return data.nr_started == nr_threads - 1;
+	dt_for_each_cpu_node(wait_each_secondary, NULL);
+	mb();
+	nr_started = 1;
 }
diff --git a/lib/ppc64/asm-offsets.c b/lib/ppc64/asm-offsets.c
index 7843a20b4..ba1a045df 100644
--- a/lib/ppc64/asm-offsets.c
+++ b/lib/ppc64/asm-offsets.c
@@ -6,6 +6,8 @@ 
 #include <libcflat.h>
 #include <kbuild.h>
 #include <asm/ptrace.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
 
 int main(void)
 {
@@ -50,5 +52,12 @@  int main(void)
 	DEFINE(_XER, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, xer));
 	DEFINE(_CCR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, ccr));
 	DEFINE(_TRAP, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, trap));
+
+	DEFINE(SIZEOF_STRUCT_CPU, sizeof(struct cpu));
+	DEFINE(CPU_SERVER_NO, offsetof(struct cpu, server_no));
+	DEFINE(CPU_STACK, offsetof(struct cpu, stack));
+	DEFINE(CPU_EXCEPTION_STACK, offsetof(struct cpu, exception_stack));
+	DEFINE(MAX_CPUS, NR_CPUS);
+
 	return 0;
 }
diff --git a/lib/ppc64/asm/atomic.h b/lib/ppc64/asm/atomic.h
new file mode 100644
index 000000000..0f461b9ce
--- /dev/null
+++ b/lib/ppc64/asm/atomic.h
@@ -0,0 +1,6 @@ 
+#ifndef _POWERPC_ATOMIC_H_
+#define _POWERPC_ATOMIC_H_
+
+#include "asm-generic/atomic.h"
+
+#endif /* _POWERPC_ATOMIC_H_ */
diff --git a/lib/ppc64/asm/barrier.h b/lib/ppc64/asm/barrier.h
index 76f6efa78..d2df29520 100644
--- a/lib/ppc64/asm/barrier.h
+++ b/lib/ppc64/asm/barrier.h
@@ -1,6 +1,8 @@ 
 #ifndef _ASMPPC64_BARRIER_H_
 #define _ASMPPC64_BARRIER_H_
 
+#define pause_short() asm volatile(".long 0x7c40003c" ::: "memory")
+
 #define mb() asm volatile("sync":::"memory")
 #define rmb() asm volatile("sync":::"memory")
 #define wmb() asm volatile("sync":::"memory")
diff --git a/lib/ppc64/asm/opal.h b/lib/ppc64/asm/opal.h
index de64e2c8d..6c3e9ffe2 100644
--- a/lib/ppc64/asm/opal.h
+++ b/lib/ppc64/asm/opal.h
@@ -2,14 +2,21 @@ 
 #ifndef _ASMPPC64_OPAL_H_
 #define _ASMPPC64_OPAL_H_
 
+#include <stdint.h>
+
 #define OPAL_SUCCESS				0
 
 #define OPAL_CONSOLE_WRITE			1
 #define OPAL_CONSOLE_READ			2
 #define OPAL_CEC_POWER_DOWN			5
 #define OPAL_POLL_EVENTS			10
+#define OPAL_START_CPU				41
+#define OPAL_QUERY_CPU_STATUS			42
+#define OPAL_RETURN_CPU				69
 #define OPAL_REINIT_CPUS			70
 # define OPAL_REINIT_CPUS_HILE_BE		(1 << 0)
 # define OPAL_REINIT_CPUS_HILE_LE		(1 << 1)
 
+int64_t opal_call(int64_t token, int64_t arg1, int64_t arg2, int64_t arg3);
+
 #endif
diff --git a/powerpc/Makefile.common b/powerpc/Makefile.common
index 6374418d2..b98f71c2f 100644
--- a/powerpc/Makefile.common
+++ b/powerpc/Makefile.common
@@ -12,6 +12,7 @@  tests-common = \
 	$(TEST_DIR)/rtas.elf \
 	$(TEST_DIR)/emulator.elf \
 	$(TEST_DIR)/tm.elf \
+	$(TEST_DIR)/smp.elf \
 	$(TEST_DIR)/sprs.elf \
 	$(TEST_DIR)/interrupts.elf
 
diff --git a/powerpc/cstart64.S b/powerpc/cstart64.S
index 405cb0561..b4eb70696 100644
--- a/powerpc/cstart64.S
+++ b/powerpc/cstart64.S
@@ -125,6 +125,50 @@  start:
 	bl	exit
 	b	halt
 
+/*
+ * start_secondary is the secondary entry point. r3 contains the cpu id
+ */
+.globl start_secondary
+start_secondary:
+	FIXUP_ENDIAN
+	/* Switch to 64-bit mode */
+	mfmsr	r1
+	li	r2,1
+	sldi	r2,r2,MSR_SF_BIT
+	or	r1,r1,r2
+	mtmsrd	r1
+
+	bl	0f
+0:	mflr	r31
+	subi	r31, r31, 0b - start	/* QEMU's kernel load address */
+
+	ld	r2, (p_toc - start)(r31)
+
+	LOAD_REG_ADDR(r9, cpus)
+	li	r8,0
+	li	r7,0
+1:	add	r6,r9,r7
+	ld	r6,CPU_SERVER_NO(r6)
+	cmpd	r6,r3
+	beq	2f
+	addi	r7,r7,SIZEOF_STRUCT_CPU
+	addi	r8,r8,1
+	cmpdi	r8,MAX_CPUS
+	bne	1b
+	b	.
+
+2:	add	r3,r9,r7
+	ld	r1,CPU_STACK(r3)
+
+	/* Zero backpointers in initial stack frame so backtrace() stops */
+	li	r0,0
+	std	r0,0(r1)
+	std	r0,16(r1)
+
+	bl	main_secondary
+	bl	exit
+	b	halt
+
 .align 3
 p_stack:	.llong  stackptr
 p_toc:		.llong  tocptr
@@ -257,7 +301,8 @@  do_handle_exception_return:
 	. = \vec
 
 	mtsprg1	r1	/* save r1 */
-	mfsprg0	r1	/* get exception stack address */
+	mfsprg0	r1	/* get struct cpu address */
+	ld	r1,CPU_EXCEPTION_STACK(r1) /* get exception stack address */
 	subi	r1,r1, INT_FRAME_SIZE
 
 	/* save r0 and ctr to call generic handler */
@@ -273,7 +318,8 @@  do_handle_exception_return:
 	. = \vec
 
 	mtsprg1	r1	/* save r1 */
-	mfsprg0	r1	/* get exception stack address */
+	mfsprg0	r1	/* get struct cpu address */
+	ld	r1,CPU_EXCEPTION_STACK(r1) /* get exception stack address */
 	subi	r1,r1, INT_FRAME_SIZE
 
 	/* save r0 and ctr to call generic handler */
diff --git a/powerpc/selftest.c b/powerpc/selftest.c
index 7acff7104..8d1a2c767 100644
--- a/powerpc/selftest.c
+++ b/powerpc/selftest.c
@@ -8,6 +8,7 @@ 
 #include <libcflat.h>
 #include <util.h>
 #include <asm/setup.h>
+#include <asm/smp.h>
 
 static void check_setup(int argc, char **argv)
 {
@@ -34,7 +35,8 @@  static void check_setup(int argc, char **argv)
 
 		} else if (strcmp(argv[i], "smp") == 0) {
 
-			report(nr_cpus == (int)val, "nr_cpus = %d", nr_cpus);
+			report(nr_cpus_present == (int)val,
+				"nr_cpus_present = %d", nr_cpus_present);
 			++nr_tests;
 		}
 
diff --git a/powerpc/smp.c b/powerpc/smp.c
new file mode 100644
index 000000000..644cf1c75
--- /dev/null
+++ b/powerpc/smp.c
@@ -0,0 +1,348 @@ 
+/* SPDX-License-Identifier: LGPL-2.0-only */
+/*
+ * SMP and IPI Tests
+ *
+ * Copyright 2024 Nicholas Piggin, IBM Corp.
+ */
+#include <libcflat.h>
+#include <asm/atomic.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+#include <asm/setup.h>
+#include <asm/ppc_asm.h>
+#include <devicetree.h>
+
+static volatile bool start_test_running = true;
+static volatile int nr_cpus_started;
+
+static void start_fn(int cpu_id)
+{
+	atomic_fetch_inc(&nr_cpus_started);
+	while (start_test_running)
+		cpu_relax();
+	atomic_fetch_dec(&nr_cpus_started);
+}
+
+static void test_start_cpus(int argc, char **argv)
+{
+	uint64_t tb;
+
+	if (argc > 2)
+		report_abort("Unsupported argument: '%s'", argv[2]);
+
+	nr_cpus_started = 1;
+	if (!start_all_cpus(start_fn))
+		report_abort("Failed to start secondary cpus");
+
+	tb = get_tb();
+	while (nr_cpus_started < nr_cpus_present) {
+		cpu_relax();
+		if (get_tb() - tb > tb_hz * 5)
+			report_abort("Failed to start all secondaries");
+	}
+
+	if (nr_cpus_started != nr_cpus_online)
+		report_abort("Started CPUs does not match online");
+
+	barrier();
+	start_test_running = false;
+	barrier();
+
+	tb = get_tb();
+	while (nr_cpus_started > 1) {
+		cpu_relax();
+		if (get_tb() - tb > tb_hz * 5)
+			report_abort("Failed to stop all secondaries");
+	}
+
+	stop_all_cpus();
+
+	report(true, "start cpus");
+}
+
+static volatile int nr_cpus_ipi = 0;
+
+static void ipi_handler(struct pt_regs *regs, void *data)
+{
+	atomic_fetch_inc(&nr_cpus_ipi);
+}
+
+static volatile bool ipi_test_running = true;
+
+static void ipi_fn(int cpu_id)
+{
+	local_ipi_enable();
+
+	mtspr(SPR_DEC, 0x7fffffff);
+	local_irq_enable();
+	while (ipi_test_running)
+		cpu_relax();
+	local_irq_disable();
+
+	local_ipi_disable();
+}
+
+static void test_ipi_cpus(int argc, char **argv)
+{
+	uint64_t tb;
+	int i;
+
+	if (argc > 2)
+		report_abort("Unsupported argument: '%s'", argv[2]);
+
+	if (nr_cpus_present < 2) {
+		report_skip("Requires SMP (2 or more CPUs)");
+		return;
+	}
+
+	register_ipi(ipi_handler, NULL);
+
+	if (!start_all_cpus(ipi_fn))
+		report_abort("Failed to start secondary cpus");
+
+	for (i = 1; i < nr_cpus_online; i++)
+		send_ipi(cpus[i].server_no);
+
+	tb = get_tb();
+	while (nr_cpus_ipi < nr_cpus_online - 1) {
+		cpu_relax();
+		if (get_tb() - tb > tb_hz * 5)
+			report_abort("Secondaries failed to respond to IPIs");
+	}
+
+	send_ipi(cpus[1].server_no);
+
+	tb = get_tb();
+	while (nr_cpus_ipi < nr_cpus_online) {
+		cpu_relax();
+		if (get_tb() - tb > tb_hz * 5)
+			report_abort("Secondaries failed to respond to IPIs");
+	}
+
+	ipi_test_running = false;
+
+	stop_all_cpus();
+
+	assert(nr_cpus_ipi == nr_cpus_present);
+
+	unregister_ipi();
+
+	report(true, "IPI cpus");
+}
+
+static uint64_t time;
+static bool time_went_backward;
+
+static void check_and_record_time(void)
+{
+	uint64_t tb;
+	uint64_t t;
+	uint64_t old;
+
+	t = time;
+again:
+	barrier();
+	tb = get_tb();
+	asm volatile("1: ldarx %0,0,%1 ; cmpd %0,%2 ; bne 2f ; stdcx. %3,0,%1 ; bne- 1b; 2:" : "=&r"(old) : "r"(&time), "r"(t), "r"(tb) : "memory", "cr0");
+	assert(tb >= t);
+	if (old != t) {
+		t = old;
+		goto again;
+	}
+	if (old > tb)
+		time_went_backward = true;
+}
+
+static void update_time(int64_t tb_offset)
+{
+	uint64_t new_tb;
+
+	new_tb = get_tb() + tb_offset;
+	mtspr(SPR_TBU40, new_tb);
+	if ((get_tb() & 0xFFFFFF) < (new_tb & 0xFFFFFF)) {
+		new_tb += 0x1000000;
+		mtspr(SPR_TBU40, new_tb);
+	}
+}
+
+static void time_sync_fn(int cpu_id)
+{
+	uint64_t start = get_tb();
+
+	while (!time_went_backward && get_tb() - start < tb_hz*2) {
+		check_and_record_time();
+		cpu_relax();
+	}
+
+	while (!time_went_backward && get_tb() - start < tb_hz*2) {
+		check_and_record_time();
+		udelay(1);
+	}
+
+	if (machine_is_powernv()) {
+		while (!time_went_backward && get_tb() - start < tb_hz*2) {
+			check_and_record_time();
+			update_time(0x1234000000);
+			cpu_relax();
+			update_time(-0x1234000000);
+		}
+	}
+}
+
+static void test_time_sync(int argc, char **argv)
+{
+	if (argc > 2)
+		report_abort("Unsupported argument: '%s'", argv[2]);
+
+	if (nr_cpus_present < 2) {
+		report_skip("Requires SMP (2 or more CPUs)");
+		return;
+	}
+
+	time_went_backward = false;
+
+	if (!start_all_cpus(time_sync_fn))
+		report_abort("Failed to start secondary cpus");
+
+	time_sync_fn(-1);
+
+	stop_all_cpus();
+
+	report(!time_went_backward, "time sync");
+}
+
+static volatile bool relax_test_running = true;
+
+static int relax_loop_count[NR_CPUS];
+
+/*
+ * Secondary entry point for the cpu_relax test: spin calling cpu_relax()
+ * until relax_test_running is cleared, then record how many iterations
+ * this CPU completed in relax_loop_count[].
+ *
+ * @cpu_id: server number of this CPU as passed by main_secondary (unused)
+ */
+static void relax_fn(int cpu_id)
+{
+	volatile int i = 0;
+
+	while (relax_test_running) {
+		cpu_relax();
+		i++;
+	}
+
+	/*
+	 * cpu_id is the hardware server number and nothing bounds it to
+	 * NR_CPUS, so index by this CPU's slot in cpus[] instead. The
+	 * reader only sums the array, so the slot order does not matter.
+	 */
+	relax_loop_count[current_cpu() - cpus] = i;
+}
+
+#define ITERS 1000000
+
+static void test_relax(int argc, char **argv)
+{
+	volatile int i;
+	int count;
+
+	if (argc > 2)
+		report_abort("Unsupported argument: '%s'", argv[2]);
+
+	if (nr_cpus_present < 2) {
+		report_skip("Requires SMP (2 or more CPUs)");
+		return;
+	}
+
+	if (!start_all_cpus(relax_fn))
+		report_abort("Failed to start secondary cpus");
+
+	for (i = 0; i < ITERS; i++)
+		;
+
+	relax_test_running = false;
+
+	stop_all_cpus();
+
+	count = 0;
+	for (i = 0; i < NR_CPUS; i++)
+		count += relax_loop_count[i];
+	if (count == 0)
+		count = 1;
+
+	report(true, "busy-loops on CPU:%d vs cpu_relax-loops on others %ld%%", smp_processor_id(), (long)ITERS * 100 / count);
+}
+
+static volatile bool pause_test_running = true;
+
+static int pause_loop_count[NR_CPUS];
+
+/*
+ * Secondary entry point for the pause test: spin calling pause_short()
+ * until pause_test_running is cleared, then record how many iterations
+ * this CPU completed in pause_loop_count[].
+ *
+ * @cpu_id: server number of this CPU as passed by main_secondary (unused)
+ */
+static void pause_fn(int cpu_id)
+{
+	volatile int i = 0;
+
+	while (pause_test_running) {
+		pause_short();
+		i++;
+	}
+
+	/*
+	 * cpu_id is the hardware server number and nothing bounds it to
+	 * NR_CPUS, so index by this CPU's slot in cpus[] instead. The
+	 * reader only sums the array, so the slot order does not matter.
+	 */
+	pause_loop_count[current_cpu() - cpus] = i;
+}
+
+#define ITERS 1000000
+
+/*
+ * Compare a plain busy loop on the calling CPU against pause_short()
+ * loops on the secondaries.
+ *
+ * Starts pause_fn on every secondary, spins ITERS times on the calling
+ * CPU, stops the secondaries and reports ITERS as a percentage of the
+ * summed pause_loop_count[] values. The test is skipped when the CPU
+ * lacks pause_short or when fewer than two CPUs are present.
+ *
+ * @argc/@argv: test arguments; anything beyond the test name aborts
+ */
+static void test_pause(int argc, char **argv)
+{
+	volatile int i;
+	int count;
+
+	if (argc > 2)
+		report_abort("Unsupported argument: '%s'", argv[2]);
+
+	if (!cpu_has_pause_short) {
+		/* Leave a record in the log rather than returning silently */
+		report_skip("Requires pause_short support");
+		return;
+	}
+
+	if (nr_cpus_present < 2) {
+		report_skip("Requires SMP (2 or more CPUs)");
+		return;
+	}
+
+	if (!start_all_cpus(pause_fn))
+		report_abort("Failed to start secondary cpus");
+
+	for (i = 0; i < ITERS; i++)
+		;
+
+	pause_test_running = false;
+
+	stop_all_cpus();
+
+	count = 0;
+	for (i = 0; i < NR_CPUS; i++)
+		count += pause_loop_count[i];
+	/* Avoid dividing by zero if no secondary completed an iteration */
+	if (count == 0)
+		count = 1;
+
+	report(true, "busy-loops on CPU:%d vs pause_short-loops on others %ld%%", smp_processor_id(), (long)ITERS * 100 / count);
+}
+
+struct {
+	const char *name;
+	void (*func)(int argc, char **argv);
+} hctests[] = {
+	{ "start_cpus", test_start_cpus },
+	{ "ipi_cpus", test_ipi_cpus },
+	{ "time_sync", test_time_sync },
+	{ "cpu_relax", test_relax },
+	{ "pause", test_pause },
+	{ NULL, NULL }
+};
+
+int main(int argc, char **argv)
+{
+	bool all;
+	int i;
+
+	all = argc == 1 || !strcmp(argv[1], "all");
+
+	report_prefix_push("smp");
+
+	for (i = 0; hctests[i].name != NULL; i++) {
+		if (all || strcmp(argv[1], hctests[i].name) == 0) {
+			report_prefix_push(hctests[i].name);
+			hctests[i].func(argc, argv);
+			report_prefix_pop();
+		}
+	}
+
+	report_prefix_pop();
+	return report_summary();
+}
diff --git a/powerpc/tm.c b/powerpc/tm.c
index d9e7f455d..507eaf492 100644
--- a/powerpc/tm.c
+++ b/powerpc/tm.c
@@ -89,7 +89,7 @@  static void test_h_cede_tm(int argc, char **argv)
 	if (argc > 2)
 		report_abort("Unsupported argument: '%s'", argv[2]);
 
-	if (!start_all_cpus(halt, 0))
+	if (!start_all_cpus(halt))
 		report_abort("Failed to start secondary cpus");
 
 	if (!enable_tm())
@@ -133,8 +133,9 @@  int main(int argc, char **argv)
 		report_skip("TM is not available");
 		goto done;
 	}
+	/* kvm-unit-tests can limit number of CPUs present */
 	/* KVM does not report TM in secondary threads in POWER9 */
-	report_kfail(true, cpus_with_tm == nr_cpus,
+	report_kfail(true, cpus_with_tm >= nr_cpus_present,
 	       "TM available in all 'ibm,pa-features' properties");
 
 	all = argc == 1 || !strcmp(argv[1], "all");
diff --git a/powerpc/unittests.cfg b/powerpc/unittests.cfg
index 03d7b60e5..aa707e0f3 100644
--- a/powerpc/unittests.cfg
+++ b/powerpc/unittests.cfg
@@ -69,6 +69,14 @@  file = emulator.elf
 [interrupts]
 file = interrupts.elf
 
+[smp]
+file = smp.elf
+smp = 2
+
+[smp-smt]
+file = smp.elf
+smp = 8,threads=4
+
 [h_cede_tm]
 file = tm.elf
 machine = pseries