@@ -90,8 +90,6 @@
#define PHANDLE_XICP 0x00001111
-#define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift))
-
static XICSState *try_create_xics(const char *type, int nr_servers,
int nr_irqs, Error **errp)
{
@@ -1,4 +1,5 @@
#include "sysemu/sysemu.h"
+#include "qemu/error-report.h"
#include "cpu.h"
#include "helper_regs.h"
#include "hw/ppc/spapr.h"
@@ -316,16 +317,278 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
return H_SUCCESS;
}
+struct sPAPRPendingHPT {
+ /* These fields are read-only after initialization */
+ int shift;
+ QemuThread thread;
+
+ /* These fields are protected by the BQL */
+ bool complete;
+
+ /* These fields are private to the preparation thread if
+ * !complete, otherwise protected by the BQL */
+ int ret;
+ void *hpt;
+};
+
+static void free_pending_hpt(sPAPRPendingHPT *pending)
+{
+ if (pending->hpt) {
+ qemu_vfree(pending->hpt);
+ }
+
+ g_free(pending);
+}
+
+static void *hpt_prepare_thread(void *opaque)
+{
+ sPAPRPendingHPT *pending = opaque;
+ size_t size = 1ULL << pending->shift;
+
+ pending->hpt = qemu_memalign(size, size);
+ if (pending->hpt) {
+ memset(pending->hpt, 0, size);
+ pending->ret = H_SUCCESS;
+ } else {
+ pending->ret = H_NO_MEM;
+ }
+
+ qemu_mutex_lock_iothread();
+
+ if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt != pending) {
+ /* We've been cancelled, clean ourselves up */
+ free_pending_hpt(pending);
+ goto out;
+ }
+
+ pending->complete = true;
+
+out:
+ qemu_mutex_unlock_iothread();
+ return NULL;
+}
+
+/* Must be called with BQL held */
+static void cancel_hpt_prepare(sPAPRMachineState *spapr)
+{
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
+
+ /* Let the thread know it's cancelled */
+ spapr->pending_hpt = NULL;
+
+ if (!pending) {
+ /* Nothing to do */
+ return;
+ }
+
+ if (!pending->complete) {
+ /* thread will clean itself up */
+ return;
+ }
+
+ free_pending_hpt(pending);
+}
+
static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu,
sPAPRMachineState *spapr,
target_ulong opcode,
target_ulong *args)
{
target_ulong flags = args[0];
- target_ulong shift = args[1];
+ int shift = args[1];
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
trace_spapr_h_resize_hpt_prepare(flags, shift);
- return H_HARDWARE;
+
+ if (flags != 0) {
+ return H_PARAMETER;
+ }
+
+ if (shift && ((shift < 18) || (shift > 46))) {
+ return H_PARAMETER;
+ }
+
+ if (pending) {
+ /* something already in progress */
+ if (pending->shift == shift) {
+ /* and it's suitable */
+ if (pending->complete) {
+ return pending->ret;
+ } else {
+ return H_LONG_BUSY_ORDER_100_MSEC;
+ }
+ }
+
+ /* not suitable, cancel and replace */
+ cancel_hpt_prepare(spapr);
+ }
+
+ if (!shift) {
+ /* nothing to do */
+ return H_SUCCESS;
+ }
+
+ /* start new prepare */
+ pending = g_malloc0(sizeof(*pending));
+ pending->shift = shift;
+ pending->ret = H_HARDWARE;
+
+ qemu_thread_create(&pending->thread, "sPAPR HPT prepare",
+ hpt_prepare_thread, pending, QEMU_THREAD_DETACHED);
+
+ spapr->pending_hpt = pending;
+
+ /* In theory we could estimate the time more accurately based on
+ * the new size, but there's not much point */
+ return H_LONG_BUSY_ORDER_100_MSEC;
+}
+
+static uint64 new_hpte_load0(void *htab, uint64_t pteg, int slot)
+{
+ uint8_t *addr = htab;
+
+ addr += pteg * HASH_PTEG_SIZE_64;
+ addr += slot * HASH_PTE_SIZE_64;
+ return ldq_p(addr);
+}
+
+static void new_hpte_store(void *htab, uint64_t pteg, int slot,
+ uint64_t pte0, uint64_t pte1)
+{
+ uint8_t *addr = htab;
+
+ addr += pteg * HASH_PTEG_SIZE_64;
+ addr += slot * HASH_PTE_SIZE_64;
+
+ stq_p(addr, pte0);
+ stq_p(addr + HASH_PTE_SIZE_64/2, pte1);
+}
+
+static int rehash_hpte(PowerPCCPU *cpu, uint64_t token,
+ void *old, uint64_t oldsize,
+ void *new, uint64_t newsize,
+ uint64_t pteg, int slot)
+{
+ uint64_t old_hash_mask = (oldsize >> 7) - 1;
+ uint64_t new_hash_mask = (newsize >> 7) - 1;
+ target_ulong pte0 = ppc_hash64_load_hpte0(cpu, token, slot);
+ target_ulong pte1;
+ uint64_t avpn;
+ unsigned shift, spshift;
+ uint64_t hash, new_pteg, replace_pte0;
+
+ if (!(pte0 & HPTE64_V_VALID) || !(pte0 & HPTE64_V_BOLTED)) {
+ return H_SUCCESS;
+ }
+
+ pte1 = ppc_hash64_load_hpte1(cpu, token, slot);
+
+ shift = ppc_hash64_hpte_page_shift_noslb(cpu, pte0, pte1, &spshift);
+ assert(shift); /* H_ENTER should never have allowed a bad encoding */
+ avpn = HPTE64_V_AVPN_VAL(pte0) & ~(((1ULL << shift) - 1) >> 23);
+
+ if (pte0 & HPTE64_V_SECONDARY) {
+ pteg = ~pteg;
+ }
+
+ if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_256M) {
+ uint64_t offset, vsid;
+
+ /* We only have 28 - 23 bits of offset in avpn */
+ offset = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (shift < 23) {
+ offset |= ((vsid ^ pteg) & old_hash_mask) << shift;
+ }
+
+ hash = vsid ^ (offset >> shift);
+ } else if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_1T) {
+ uint64_t offset, vsid;
+
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ offset = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (shift < 23) {
+ offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << shift;
+ }
+
+ hash = vsid ^ (vsid << 25) ^ (offset >> shift);
+ } else {
+ error_report("rehash_pte: Bad segment size in HPTE");
+ return H_HARDWARE;
+ }
+
+ new_pteg = hash & new_hash_mask;
+ if (pte0 & HPTE64_V_SECONDARY) {
+ assert(~pteg == (hash & old_hash_mask));
+ new_pteg = ~new_pteg;
+ } else {
+ assert(pteg == (hash & old_hash_mask));
+ }
+ assert((oldsize != newsize) || (pteg == new_pteg));
+ replace_pte0 = new_hpte_load0(new, new_pteg, slot);
+ if (replace_pte0 & HPTE64_V_VALID) {
+ assert(newsize < oldsize);
+ if (replace_pte0 & HPTE64_V_BOLTED) {
+ if (pte0 & HPTE64_V_BOLTED) {
+ /* Bolted collision, nothing we can do */
+ return H_PTEG_FULL;
+ } else {
+ /* Discard this hpte */
+ return H_SUCCESS;
+ }
+ }
+ }
+
+ new_hpte_store(new, new_pteg, slot, pte0, pte1);
+ return H_SUCCESS;
+}
+
+static int rehash_hpt(PowerPCCPU *cpu,
+ void *old, uint64_t oldsize,
+ void *new, uint64_t newsize)
+{
+ CPUPPCState *env = &cpu->env;
+ uint64_t n_ptegs = oldsize >> 7;
+ uint64_t pteg;
+ int slot;
+ int rc;
+
+ assert(env->external_htab == old);
+
+ for (pteg = 0; pteg < n_ptegs; pteg++) {
+ uint64_t token = ppc_hash64_start_access(cpu, pteg * HPTES_PER_GROUP);
+
+ if (!token) {
+ return H_HARDWARE;
+ }
+
+ for (slot = 0; slot < HPTES_PER_GROUP; slot++) {
+ rc = rehash_hpte(cpu, token, old, oldsize, new, newsize,
+ pteg, slot);
+ if (rc != H_SUCCESS) {
+ ppc_hash64_stop_access(token);
+ return rc;
+ }
+ }
+ ppc_hash64_stop_access(token);
+ }
+
+ return H_SUCCESS;
+}
+
+static void pivot_hpt(void *arg)
+{
+ sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+ CPUState *cs = arg;
+ CPUPPCState *env = &POWERPC_CPU(cs)->env;
+
+ cpu_synchronize_state(cs);
+ env->external_htab = spapr->htab;
+ env->htab_mask = (1ULL << (spapr->htab_shift - 7)) - 1;
+ env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
+ (spapr->htab_shift - 18);
}
static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
@@ -335,9 +598,48 @@ static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
{
target_ulong flags = args[0];
target_ulong shift = args[1];
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
+ int rc;
+ size_t newsize;
trace_spapr_h_resize_hpt_commit(flags, shift);
- return H_HARDWARE;
+
+ if (flags != 0) {
+ return H_PARAMETER;
+ }
+
+ if (!pending || (pending->shift != shift)) {
+ /* no matching prepare */
+ return H_CLOSED;
+ }
+
+ if (!pending->complete) {
+ /* prepare has not completed */
+ return H_BUSY;
+ }
+
+ newsize = 1ULL << pending->shift;
+ rc = rehash_hpt(cpu, spapr->htab, HTAB_SIZE(spapr),
+ pending->hpt, newsize);
+ if (rc == H_SUCCESS) {
+ CPUState *cs;
+
+ qemu_vfree(spapr->htab);
+ spapr->htab = pending->hpt;
+ spapr->htab_shift = pending->shift;
+
+ CPU_FOREACH(cs) {
+ run_on_cpu(cs, pivot_hpt, cs);
+ }
+
+ pending->hpt = NULL; /* so it's not free()d */
+ }
+
+ /* Clean up */
+ spapr->pending_hpt = NULL;
+ free_pending_hpt(pending);
+
+ return rc;
}
static target_ulong h_set_dabr(PowerPCCPU *cpu, sPAPRMachineState *spapr,
@@ -12,6 +12,7 @@ struct sPAPRPHBState;
struct sPAPRNVRAM;
typedef struct sPAPRConfigureConnectorState sPAPRConfigureConnectorState;
typedef struct sPAPREventLogEntry sPAPREventLogEntry;
+typedef struct sPAPRPendingHPT sPAPRPendingHPT;
#define HPTE64_V_HPTE_DIRTY 0x0000000000000040ULL
#define SPAPR_ENTRY_POINT 0x100
@@ -54,6 +55,8 @@ struct sPAPRMachineState {
void *htab;
uint32_t htab_shift;
+ sPAPRPendingHPT *pending_hpt; /* in-progress resize */
+
hwaddr rma_size;
int vrma_adjust;
hwaddr fdt_addr, rtas_addr;
@@ -649,4 +652,6 @@ int spapr_rng_populate_dt(void *fdt);
*/
#define SPAPR_LMB_FLAGS_ASSIGNED 0x00000008
+#define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift))
+
#endif /* !defined (__HW_SPAPR_H__) */
@@ -58,11 +58,15 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
#define HASH_PTE_SIZE_64 16
#define HASH_PTEG_SIZE_64 (HASH_PTE_SIZE_64 * HPTES_PER_GROUP)
+#define HPTE64_V_SSIZE SLB_VSID_B
+#define HPTE64_V_SSIZE_256M SLB_VSID_B_256M
+#define HPTE64_V_SSIZE_1T SLB_VSID_B_1T
#define HPTE64_V_SSIZE_SHIFT 62
#define HPTE64_V_AVPN_SHIFT 7
#define HPTE64_V_AVPN 0x3fffffffffffff80ULL
#define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT)
#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL))
+#define HPTE64_V_BOLTED 0x0000000000000010ULL
#define HPTE64_V_LARGE 0x0000000000000004ULL
#define HPTE64_V_SECONDARY 0x0000000000000002ULL
#define HPTE64_V_VALID 0x0000000000000001ULL
This patch implements hypercalls allowing a PAPR guest to resize its own hash page table. This will eventually allow for more flexible memory hotplug. The implementation is partially asynchronous, handled in a special thread running the hpt_prepare_thread() function. The state of a pending resize is stored in SPAPR_MACHINE->pending_hpt. The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or, if one is already in progress, monitor it for completion. If there is an existing HPT resize in progress that doesn't match the size specified in the call, it will cancel it, replacing it with a new one matching the given size. The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only be called successfully once H_RESIZE_HPT_PREPARE has successfully completed initialization of a new HPT. The guest must ensure that there are no concurrent accesses to the existing HPT while this is called (this effectively means stop_machine() for Linux guests). For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each HPTE into the new HPT. This can have quite high latency, but it seems to be of the order of typical migration downtime latencies for HPTs of size up to ~2GiB (which would be used in a 256GiB guest). In future we probably want to move more of the rehashing to the "prepare" phase, by having H_ENTER and other hcalls update both current and pending HPTs. That's a project for another day, but should be possible without any changes to the guest interface. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> --- hw/ppc/spapr.c | 2 - hw/ppc/spapr_hcall.c | 308 +++++++++++++++++++++++++++++++++++++++++++++++- include/hw/ppc/spapr.h | 5 + target-ppc/mmu-hash64.h | 4 + 4 files changed, 314 insertions(+), 5 deletions(-)