diff mbox series

[RFC,3/3] spapr: implement nested-hv support for the TCG virtual hypervisor

Message ID 20220210065340.1276367-4-npiggin@gmail.com (mailing list archive)
State New, archived
Headers show
Series spapr: nested-hv support for TCG | expand

Commit Message

Nicholas Piggin Feb. 10, 2022, 6:53 a.m. UTC
This implements the nested-hv hcall API for spapr under TCG.
It's still a bit rough around the edges, concept seems to work.

Some HV exceptions can be raised now in the TCG spapr machine when
running a nested guest. The main ones are the lev==1 syscall, the
hdecr, hdsi and hisi, and h_virt external interrupts. These are
dealt with in the interrupt delivery code by noticing MSR[HV] raised
and instead of switching the machine to HV mode, it exits the
H_ENTER_NESTED hcall with the interrupt vector as return value as
required by the hcall API.

Address translation is provided by the 2-level page table walker
that is implemented for the pnv machine. The partition scope page
table is pointed to the L1's partition scope, and a few tests have
to take into account that nested-hv translations are 2-level. This
could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
but it's surprisingly little code.

There is no TLB tagging between L1 and L2 translations at the moment
so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).

XXX: stop doing atomic RC on page table walks (not for nested but in general)

not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 hw/ppc/ppc.c               |  20 +++
 hw/ppc/spapr.c             |  16 ++
 hw/ppc/spapr_caps.c        |   5 +-
 hw/ppc/spapr_hcall.c       | 316 +++++++++++++++++++++++++++++++++++++
 include/hw/ppc/ppc.h       |   3 +
 include/hw/ppc/spapr.h     |  75 ++++++++-
 target/ppc/cpu.h           |   6 +
 target/ppc/excp_helper.c   |  60 ++++---
 target/ppc/helper_regs.c   |   1 +
 target/ppc/mmu-book3s-v3.c |  20 ++-
 target/ppc/mmu-radix64.c   |  15 +-
 11 files changed, 499 insertions(+), 38 deletions(-)

Comments

Fabiano Rosas Feb. 14, 2022, 2:32 p.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:

> This implements the nested-hv hcall API for spapr under TCG.
> It's still a bit rough around the edges, concept seems to work.
>
> Some HV exceptions can be raised now in the TCG spapr machine when
> running a nested guest. The main ones are the lev==1 syscall, the
> hdecr, hdsi and hisi, and h_virt external interrupts. These are
> dealt with in the interrupt delivery code by noticing MSR[HV] raised
> and instead of switching the machine to HV mode, it exits the
> H_ENTER_NESTED hcall with the interrupt vector as return value as
> required by the hcall API.
>
> Address translation is provided by the 2-level page table walker
> that is implemented for the pnv machine. The partition scope page
> table is pointed to the L1's partition scope, and a few tests have
> to take into account that nested-hv translations are 2-level. This
> could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
> but it's surprisingly little code.
>
> There is no TLB tagging between L1 and L2 translations at the moment
> so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).
>
> XXX: stop doing atomic RC on page table walks (not for nested but in general)
>
> not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Hi Nick,

Sorry it took me so long to get to this. It think overall it looks
good. I have some comments about code structure that you probably
already know about.

I don't have answers to your XXX questions but I think it's ok to handle
them later individually as they come up, after this is merged.

> ---
>  hw/ppc/ppc.c               |  20 +++
>  hw/ppc/spapr.c             |  16 ++
>  hw/ppc/spapr_caps.c        |   5 +-
>  hw/ppc/spapr_hcall.c       | 316 +++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/ppc.h       |   3 +
>  include/hw/ppc/spapr.h     |  75 ++++++++-
>  target/ppc/cpu.h           |   6 +
>  target/ppc/excp_helper.c   |  60 ++++---
>  target/ppc/helper_regs.c   |   1 +
>  target/ppc/mmu-book3s-v3.c |  20 ++-
>  target/ppc/mmu-radix64.c   |  15 +-
>  11 files changed, 499 insertions(+), 38 deletions(-)
>
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index a7c262db93..135900a6f4 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>      return &cpu_ppc_set_tb_clk;
>  }
>  
> +void cpu_ppc_hdecr_init (CPUPPCState *env)
> +{
> +    PowerPCCPU *cpu = env_archcpu(env);
> +
> +    assert(env->tb_env->hdecr_timer == NULL);
> +
> +    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb,
> +                                                 cpu);
> +}
> +
> +void cpu_ppc_hdecr_exit (CPUPPCState *env)
> +{
> +    PowerPCCPU *cpu = env_archcpu(env);
> +
> +    timer_free(env->tb_env->hdecr_timer);
> +    env->tb_env->hdecr_timer = NULL;
> +
> +    cpu_ppc_hdecr_lower(cpu);
> +}
> +
>  /* Specific helpers for POWER & PowerPC 601 RTC */
>  void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
>  {
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3d6ec309dd..f0c3f726f2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1273,6 +1273,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
>      if (msr_pr) {
>          hcall_dprintf("Hypercall made with MSR[PR]=1\n");
>          env->gpr[3] = H_PRIVILEGE;
> +    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
> +        spapr_enter_nested(cpu);

This looks like it could be plumbed along with spapr_hypercall below.

>      } else {
>          env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>      }
> @@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
>      return NULL;
>  }
>  
> +static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
> +{
> +    return cpu->in_spapr_nested;
> +}
> +
> +static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid)
> +{
> +        SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +        return spapr->nested_ptcr;
> +}
> +
>  static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
>  {
>      SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> @@ -4573,6 +4586,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      fwc->get_dev_path = spapr_get_fw_dev_path;
>      nc->nmi_monitor_handler = spapr_nmi;
>      smc->phb_placement = spapr_phb_placement;
> +    vhc->cpu_in_nested = spapr_cpu_in_nested;
> +    vhc->get_nested_ptcr = spapr_get_nested_ptcr;
> +    vhc->exit_nested = spapr_exit_nested;
>      vhc->hypercall = emulate_spapr_hypercall;
>      vhc->hpt_mask = spapr_hpt_mask;
>      vhc->map_hptes = spapr_map_hptes;
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index ed7c077a0d..a665245f6f 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -450,10 +450,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState *spapr,
>          return;
>      }
>  
> -    if (tcg_enabled()) {
> -        error_setg(errp, "No Nested KVM-HV support in TCG");
> -        error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n");
> -    } else if (kvm_enabled()) {
> +    if (!tcg_enabled()) {
>          if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
>                                spapr->max_compat_pvr)) {
>              error_setg(errp, "Nested KVM-HV only supported on POWER9");
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 222c1b6bbd..8ffb13ada0 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -9,6 +9,7 @@
>  #include "qemu/error-report.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
> +#include "hw/ppc/ppc.h"
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "mmu-hash64.h"
> @@ -1497,6 +1498,317 @@ static void hypercall_register_softmmu(void)
>  }
>  #endif
>  
> +/* TCG only */
> +#define PRTS_MASK      0x1f
> +
> +static target_ulong h_set_ptbl(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    target_ulong ptcr = args[0];
> +
> +    if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if ((ptcr & PRTS_MASK) + 12 - 4 > 12) {
> +        return H_PARAMETER;
> +    }
> +
> +    spapr->nested_ptcr = ptcr; /* Save new partition table */
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_tlb_invalidate(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    CPUState *cs = CPU(cpu);
> +
> +    /*
> +     * The spapr virtual hypervisor nested HV implementation retains no
> +     * translation state except for TLB. This might be optimised to
> +     * invalidate fewer entries, but at the moment it's not important
> +     * because L1<->L2 transitions always flush the entire TLB for now.
> +     */
> +    tlb_flush(cs);
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    /*
> +     * This HCALL is not required, L1 KVM will take a slow path and walk the
> +     * page tables manually to do the data copy.
> +     */

Kind of unrelated to the patch itself but I thought the point of this
hcall was less about performance and more about Ln not having access to
the KVM memslots for the Ln+2.

KVM accesses a guest address space by using the GFN to find the memory
slot and from there the userspace address that is part of the
memslot. Accesses from Ln -> Ln+1 can use that mechanism because any Ln
will have slots registered for its guests.

For L0 -> L2, we can go via radix quadrants because the L0 is the real
HV (that's how L0 handles this hcall after all).

For L1 -> L3, we need the hcall since L1 cannot access quadrants 1 and 2
and it does not have the memslot so it cannot use the QEMU address.

Does that make sense?

> +    return H_FUNCTION;
> +}
> +
> +void spapr_enter_nested(PowerPCCPU *cpu)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    CPUState *cs = CPU(cpu);
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong hv_ptr = env->gpr[4];
> +    target_ulong regs_ptr = env->gpr[5];
> +    target_ulong hdec, now = cpu_ppc_load_tbl(env);
> +    struct kvmppc_hv_guest_state *hvstate;
> +    struct kvmppc_hv_guest_state hv_state;
> +    struct kvmppc_pt_regs *regs;
> +    hwaddr len;
> +    uint32_t cr;
> +    int i;
> +
> +    if (cpu->in_spapr_nested) {
> +        env->gpr[3] = H_FUNCTION;
> +        return;
> +    }
> +    if (spapr->nested_ptcr == 0) {
> +        env->gpr[3] = H_NOT_AVAILABLE;
> +        return;
> +    }
> +
> +    len = sizeof(*hvstate);
> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
> +    if (!hvstate || len != sizeof(*hvstate)) {
> +        env->gpr[3] = H_PARAMETER;
> +        return;
> +    }
> +
> +    memcpy(&hv_state, hvstate, len);
> +
> +    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
> +
> +    if (hv_state.version != HV_GUEST_STATE_VERSION) {

KVM uses > here. Should we do the same? Daniel and I have been
discussing if we need some sort of version negotiation for nested KVM
since recently someone tried to use an L0 version 1 with an L1 version
2:

https://gitlab.com/qemu-project/qemu/-/issues/860

> +        env->gpr[3] = H_PARAMETER;
> +        return;
> +    }
> +

<snip>

> +/* 64-bit powerpc pt_regs struct used by nested HV */
> +struct kvmppc_pt_regs {
> +    uint64_t gpr[32];
> +    uint64_t nip;
> +    uint64_t msr;
> +    uint64_t orig_gpr3;    /* Used for restarting system calls */
> +    uint64_t ctr;
> +    uint64_t link;
> +    uint64_t xer;
> +    uint64_t ccr;
> +    uint64_t softe;        /* Soft enabled/disabled */
> +    uint64_t trap;         /* Reason for being here */
> +    uint64_t dar;          /* Fault registers */
> +    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */

There's no 4xx/booke support for spapr nested.

> +    uint64_t result;       /* Result of a system call */
> +};
>  
>  typedef struct SpaprDeviceTreeUpdateHeader {
>      uint32_t version_id;
> @@ -604,6 +673,10 @@ typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, SpaprMachineState *sm,
>  void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
>  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>                               target_ulong *args);
> +
> +void spapr_enter_nested(PowerPCCPU *cpu);
> +void spapr_exit_nested(PowerPCCPU *cpu, int excp);
> +
>  target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                           target_ulong shift);
>  target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState *spapr,
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index dcd83b503c..1806a8e776 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1306,6 +1306,9 @@ struct PowerPCCPU {
>      bool pre_2_10_migration;
>      bool pre_3_0_migration;
>      int32_t mig_slb_nr;
> +
> +    bool in_spapr_nested;
> +    CPUPPCState *nested_host_state;
>  };
>  
>  
> @@ -1316,6 +1319,9 @@ PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc);
>  #ifndef CONFIG_USER_ONLY
>  struct PPCVirtualHypervisorClass {
>      InterfaceClass parent;
> +    bool (*cpu_in_nested)(PowerPCCPU *cpu);
> +    target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid);
> +    void (*exit_nested)(PowerPCCPU *cpu, int excp);
>      void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu);
>      hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp);
>      const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp,
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index c107953dec..239c253dbc 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -361,8 +361,8 @@ static void ppc_excp_apply_ail(PowerPCCPU *cpu, int excp_model, int excp,
>  #endif
>  }
>  
> -static void powerpc_set_excp_state(PowerPCCPU *cpu,
> -                                          target_ulong vector, target_ulong msr)
> +static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp,
> +                                   target_ulong vector, target_ulong msr)
>  {
>      CPUState *cs = CPU(cpu);
>      CPUPPCState *env = &cpu->env;
> @@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu,
>       * will prevent setting of the HV bit which some exceptions might need
>       * to do.
>       */
> -    env->msr = msr & env->msr_mask;
> -    hreg_compute_hflags(env);
> -    env->nip = vector;
> +    if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
> +        PPCVirtualHypervisorClass *vhc =
> +            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
> +        // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
> +        vhc->exit_nested(cpu, excp);
> +    } else {
> +        env->nip = vector;
> +        env->msr = msr & env->msr_mask;
> +        hreg_compute_hflags(env);
> +    }
> +

All of this should fit better at the end of powerpc_excp_books. Even if
we need to duplicate it for fwnmi_machine_check.

>      /* Reset exception state */
>      cs->exception_index = POWERPC_EXCP_NONE;
>      env->error_code = 0;
> @@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
>      /* Save MSR */
>      env->spr[srr1] = msr;
>  
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>  }
>  
>  static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
> @@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
>      /* Save MSR */
>      env->spr[SPR_SRR1] = msr;
>  
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>  }
>  
>  #ifdef TARGET_PPC64
> @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>          env->nip += 4;
>  
>          /* "PAPR mode" built-in hypercall emulation */
> -        if ((lev == 1) && cpu->vhyp) {
> +        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {
>              PPCVirtualHypervisorClass *vhc =
>                  PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
>              vhc->hypercall(cpu->vhyp, cpu);
> @@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>          break;
>      }
>  
> -    /* Sanity check */
> -    if (!(env->msr_mask & MSR_HVB)) {
> -        if (new_msr & MSR_HVB) {
> -            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> -                      "no HV support\n", excp);
> -        }
> -        if (srr0 == SPR_HSRR0) {
> -            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> -                      "no HV support\n", excp);
> -        }
> -    }
> -
>      /*
>       * Sort out endianness of interrupt, this differs depending on the
>       * CPU, the HV mode, etc...
> @@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>      /* This can update new_msr and vector if AIL applies */
>      ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>  
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
> +
> +    /* Sanity check */
> +    if (!(env->msr_mask & MSR_HVB)) {
> +        if (env->msr & MSR_HVB) {
> +            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> +                      "no HV support\n", excp);
> +        }

FYI, this doesn't exist anymore.

> +        if (0 && srr0 == SPR_HSRR0) {
> +            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> +                      "no HV support\n", excp);
> +        }

msr_mask should have MSR_HVB independently of MSR_HV being actually set,
so this shouldn't be causing any problems. Is it?

> +    }
>  }
>  #else
>  static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp)
> @@ -1517,7 +1525,7 @@ static inline void powerpc_excp_legacy(PowerPCCPU *cpu, int excp)
>      /* This can update new_msr and vector if AIL applies */
>      ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>  
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>  }
>  
>  static void powerpc_excp(PowerPCCPU *cpu, int excp)
> @@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env)
>          /* HEIC blocks delivery to the hypervisor */
>          if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
>              (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
> -            powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> +            if (cpu->in_spapr_nested) {
> +                powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
> +            } else {
> +                powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> +            }

This function affects other CPUs, so it's better to leave this as it
were and change the exception inside powerpc_excp_books, like we do for
HV_EMUL.

>              return;
>          }
>      }
> @@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector)
>          msr |= (1ULL << MSR_LE);
>      }
>  
> -    powerpc_set_excp_state(cpu, vector, msr);
> +    powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
>  }
>  
>  bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
> index 5b12cb03c9..7f92606522 100644
> --- a/target/ppc/helper_regs.c
> +++ b/target/ppc/helper_regs.c
> @@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
>          immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
>          dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
>      } else {
> +        /* Could have nested IDX instead of HV to avoid tlb flush on nested enter/exit? */

Sounds like a good idea. However I don't know enough about softmmu stuff
to point a direction here.

>          dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0;
>          immu_idx = dmmu_idx;
>          immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;
Cédric Le Goater Feb. 14, 2022, 6:31 p.m. UTC | #2
On 2/10/22 07:53, Nicholas Piggin wrote:
> This implements the nested-hv hcall API for spapr under TCG.
> It's still a bit rough around the edges, concept seems to work.
> 
> Some HV exceptions can be raised now in the TCG spapr machine when
> running a nested guest. The main ones are the lev==1 syscall, the
> hdecr, hdsi and hisi, and h_virt external interrupts. These are
> dealt with in the interrupt delivery code by noticing MSR[HV] raised
> and instead of switching the machine to HV mode, it exits the
> H_ENTER_NESTED hcall with the interrupt vector as return value as
> required by the hcall API.
> 
> Address translation is provided by the 2-level page table walker
> that is implemented for the pnv machine. The partition scope page
> table is pointed to the L1's partition scope, and a few tests have
> to take into account that nested-hv translations are 2-level. This
> could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
> but it's surprisingly little code.
> 
> There is no TLB tagging between L1 and L2 translations at the moment
> so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).
> 
> XXX: stop doing atomic RC on page table walks (not for nested but in general)
> 
> not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   hw/ppc/ppc.c               |  20 +++
>   hw/ppc/spapr.c             |  16 ++
>   hw/ppc/spapr_caps.c        |   5 +-
>   hw/ppc/spapr_hcall.c       | 316 +++++++++++++++++++++++++++++++++++++
>   include/hw/ppc/ppc.h       |   3 +
>   include/hw/ppc/spapr.h     |  75 ++++++++-
>   target/ppc/cpu.h           |   6 +
>   target/ppc/excp_helper.c   |  60 ++++---
>   target/ppc/helper_regs.c   |   1 +
>   target/ppc/mmu-book3s-v3.c |  20 ++-
>   target/ppc/mmu-radix64.c   |  15 +-
>   11 files changed, 499 insertions(+), 38 deletions(-)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index a7c262db93..135900a6f4 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>       return &cpu_ppc_set_tb_clk;
>   }
>   
> +void cpu_ppc_hdecr_init (CPUPPCState *env)
> +{
> +    PowerPCCPU *cpu = env_archcpu(env);
> +
> +    assert(env->tb_env->hdecr_timer == NULL);
> +
> +    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb,
> +                                                 cpu);
> +}
> +
> +void cpu_ppc_hdecr_exit (CPUPPCState *env)
> +{
> +    PowerPCCPU *cpu = env_archcpu(env);
> +
> +    timer_free(env->tb_env->hdecr_timer);
> +    env->tb_env->hdecr_timer = NULL;
> +
> +    cpu_ppc_hdecr_lower(cpu);
> +}


So these are called every time a L2 enters or exits ?

>   /* Specific helpers for POWER & PowerPC 601 RTC */
>   void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
>   {
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3d6ec309dd..f0c3f726f2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1273,6 +1273,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
>       if (msr_pr) {
>           hcall_dprintf("Hypercall made with MSR[PR]=1\n");
>           env->gpr[3] = H_PRIVILEGE;
> +    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
> +        spapr_enter_nested(cpu);

Can not this be in the hcall table ?

>       } else {
>           env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>       }
> @@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
>       return NULL;
>   }
>   
> +static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
> +{
> +    return cpu->in_spapr_nested;
> +}

This handler is not very much used.

> +static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid)
> +{
> +        SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +        return spapr->nested_ptcr;
> +}
> +
>   static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
>   {
>       SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> @@ -4573,6 +4586,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>       fwc->get_dev_path = spapr_get_fw_dev_path;
>       nc->nmi_monitor_handler = spapr_nmi;
>       smc->phb_placement = spapr_phb_placement;
> +    vhc->cpu_in_nested = spapr_cpu_in_nested;
> +    vhc->get_nested_ptcr = spapr_get_nested_ptcr;
> +    vhc->exit_nested = spapr_exit_nested;
>       vhc->hypercall = emulate_spapr_hypercall;
>       vhc->hpt_mask = spapr_hpt_mask;
>       vhc->map_hptes = spapr_map_hptes;
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index ed7c077a0d..a665245f6f 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -450,10 +450,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState *spapr,
>           return;
>       }
>   
> -    if (tcg_enabled()) {
> -        error_setg(errp, "No Nested KVM-HV support in TCG");
> -        error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n");
> -    } else if (kvm_enabled()) {
> +    if (!tcg_enabled()) {
>           if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
>                                 spapr->max_compat_pvr)) {
>               error_setg(errp, "Nested KVM-HV only supported on POWER9");
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 222c1b6bbd..8ffb13ada0 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -9,6 +9,7 @@
>   #include "qemu/error-report.h"
>   #include "exec/exec-all.h"
>   #include "helper_regs.h"
> +#include "hw/ppc/ppc.h"
>   #include "hw/ppc/spapr.h"
>   #include "hw/ppc/spapr_cpu_core.h"
>   #include "mmu-hash64.h"
> @@ -1497,6 +1498,317 @@ static void hypercall_register_softmmu(void)
>   }
>   #endif
>   
> +/* TCG only */
> +#define PRTS_MASK      0x1f
> +
> +static target_ulong h_set_ptbl(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    target_ulong ptcr = args[0];
> +
> +    if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) {
> +        return H_FUNCTION;
> +    }
> +
> +    if ((ptcr & PRTS_MASK) + 12 - 4 > 12) {
> +        return H_PARAMETER;
> +    }
> +
> +    spapr->nested_ptcr = ptcr; /* Save new partition table */
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_tlb_invalidate(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    CPUState *cs = CPU(cpu);
> +
> +    /*
> +     * The spapr virtual hypervisor nested HV implementation retains no
> +     * translation state except for TLB. This might be optimised to
> +     * invalidate fewer entries, but at the moment it's not important
> +     * because L1<->L2 transitions always flush the entire TLB for now.
> +     */
> +    tlb_flush(cs);
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
> +                               SpaprMachineState *spapr,
> +                               target_ulong opcode,
> +                               target_ulong *args)
> +{
> +    /*
> +     * This HCALL is not required, L1 KVM will take a slow path and walk the
> +     * page tables manually to do the data copy.
> +     */
> +    return H_FUNCTION;
> +}
> +
> +void spapr_enter_nested(PowerPCCPU *cpu)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    CPUState *cs = CPU(cpu);
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong hv_ptr = env->gpr[4];
> +    target_ulong regs_ptr = env->gpr[5];
> +    target_ulong hdec, now = cpu_ppc_load_tbl(env);
> +    struct kvmppc_hv_guest_state *hvstate;
> +    struct kvmppc_hv_guest_state hv_state;
> +    struct kvmppc_pt_regs *regs;
> +    hwaddr len;
> +    uint32_t cr;
> +    int i;
> +
> +    if (cpu->in_spapr_nested) {
> +        env->gpr[3] = H_FUNCTION;
> +        return;
> +    }
> +    if (spapr->nested_ptcr == 0) {
> +        env->gpr[3] = H_NOT_AVAILABLE;
> +        return;
> +    }
> +
> +    len = sizeof(*hvstate);
> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, 

Are you writing to the state ? address_space_map() is a better pratice.


> +    if (!hvstate || len != sizeof(*hvstate)) {
> +        env->gpr[3] = H_PARAMETER;
> +        return;
> +    }
> +
> +    memcpy(&hv_state, hvstate, len);
> +
> +    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
> +
> +    if (hv_state.version != HV_GUEST_STATE_VERSION) {
> +        env->gpr[3] = H_PARAMETER;
> +        return;
> +    }
> +
> +    cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState));
> +    if (!cpu->nested_host_state) {
> +        env->gpr[3] = H_NO_MEM;
> +        return;
> +    }
> +
> +    memcpy(cpu->nested_host_state, env, sizeof(CPUPPCState));
> +
> +    len = sizeof(*regs);
> +    regs = cpu_physical_memory_map(regs_ptr, &len, true);
> +    if (!regs || len != sizeof(*regs)) {
> +        g_free(cpu->nested_host_state);
> +        env->gpr[3] = H_P2;
> +        return;
> +    }
> +
> +    len = sizeof(env->gpr);
> +    assert(len == sizeof(regs->gpr));
> +    memcpy(env->gpr, regs->gpr, len);
> +
> +    env->lr = regs->link;
> +    env->ctr = regs->ctr;
> +    cpu_write_xer(env, regs->xer);
> +
> +    cr = regs->ccr;
> +    for (i = 7; i >= 0; i--) {
> +        env->crf[i] = cr & 15;
> +        cr >>= 4;
> +    }
> +
> +    env->msr = regs->msr;
> +    env->nip = regs->nip;
> +
> +    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
> +
> +    env->cfar = hv_state.cfar;
> +
> +    assert(env->spr[SPR_LPIDR] == 0);
> +    env->spr[SPR_LPCR] = hv_state.lpcr & pcc->lpcr_mask; // XXX any other mask?
> +    env->spr[SPR_LPIDR] = hv_state.lpid;
> +    env->spr[SPR_PCR] = hv_state.pcr;
> +//    env->spr[SPR_AMOR] = hv_state.amor;
> +    env->spr[SPR_DPDES] = hv_state.dpdes;
> +    env->spr[SPR_HFSCR] = hv_state.hfscr;
> +    hdec = hv_state.hdec_expiry - now;
> +    env->tb_env->tb_offset += hv_state.tb_offset; // XXX how to deal?
> +    // dec already set
> +//  DAWRetc, CIABR, [S]PURR, IC
> +    env->spr[SPR_VTB] = hv_state.vtb;
> +//    env->spr[SPR_HEIR] = hv_state.heir;
> +    env->spr[SPR_SRR0] = hv_state.srr0;
> +    env->spr[SPR_SRR1] = hv_state.srr1;
> +    env->spr[SPR_SPRG0] = hv_state.sprg[0];
> +    env->spr[SPR_SPRG1] = hv_state.sprg[1];
> +    env->spr[SPR_SPRG2] = hv_state.sprg[2];
> +    env->spr[SPR_SPRG3] = hv_state.sprg[3];
> +    env->spr[SPR_BOOKS_PID] = hv_state.pidr;
> +    env->spr[SPR_PPR] = hv_state.ppr;
> +
> +    cpu_ppc_hdecr_init(env);
> +    cpu_ppc_store_hdecr(env, hdec);
> +
> +    /*
> +     * The hv_state.vcpu_token is not needed. It is used by the KVM
> +     * implementation to remember which L2 vCPU last ran on which physical
> +     * CPU so as to invalidate process scope translations if it is moved
> +     * between physical CPUs. For now TLBs are always flushed on L1<->L2
> +     * transitions so this is not a problem.
> +     *
> +     * Could validate that the same vcpu_token does not attempt to run on
> +     * different L1 vCPUs at the same time, but that would be a L1 KVM bug
> +     * and it's not obviously worth a new data structure to do it.
> +     */
> +
> +    cpu->in_spapr_nested = true;
> +
> +    hreg_compute_hflags(env);
> +    tlb_flush(cs);
> +}
> +
> +void spapr_exit_nested(PowerPCCPU *cpu, int excp)
> +{
> +    CPUState *cs = CPU(cpu);
> +    CPUPPCState *env = &cpu->env;
> +    target_ulong r3_return = env->excp_vectors[excp]; // hcall return value
> +    target_ulong hv_ptr = cpu->nested_host_state->gpr[4];
> +    target_ulong regs_ptr = cpu->nested_host_state->gpr[5];
> +    struct kvmppc_hv_guest_state *hvstate;
> +    struct kvmppc_pt_regs *regs;
> +    hwaddr len;
> +    int i;
> +
> +    assert(cpu->in_spapr_nested);
> +    cpu->in_spapr_nested = false;
> +
> +    cpu_ppc_hdecr_exit(env);
> +
> +    len = sizeof(*hvstate);
> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
> +    if (!hvstate || len != sizeof(*hvstate)) {
> +        r3_return = H_PARAMETER;
> +        goto out_restore_l1;
> +    }
> +
> +    //XXX check linux kvm nested CTRL reg bug?
> +    //
> +    env->tb_env->tb_offset -= hvstate->tb_offset;
> +
> +    hvstate->cfar = env->cfar;
> +    hvstate->lpcr = env->spr[SPR_LPCR];
> +    hvstate->pcr = env->spr[SPR_PCR];
> +//    hvstate->amor = env->spr[SPR_AMOR];
> +    hvstate->dpdes = env->spr[SPR_DPDES];
> +    hvstate->hfscr = env->spr[SPR_HFSCR];
> +//    hvstate-> = // dec already set
> +//  DAWRetc, CIABR, [S]PURR, IC
> +    hvstate->vtb = env->spr[SPR_VTB];
> +
> +    if (excp == POWERPC_EXCP_HDSI) {
> +        hvstate->hdar = env->spr[SPR_HDAR];
> +        hvstate->hdsisr = env->spr[SPR_HDSISR];
> +        hvstate->asdr = env->spr[SPR_ASDR];
> +    } else if (excp == POWERPC_EXCP_HISI) {
> +        hvstate->asdr = env->spr[SPR_ASDR];
> +    }
> +
> +//    hvstate->heir = env->spr[SPR_HEIR]; XXX HEIR?
> +    hvstate->srr0 = env->spr[SPR_SRR0];
> +    hvstate->srr1 = env->spr[SPR_SRR1];
> +    hvstate->sprg[0] = env->spr[SPR_SPRG0];
> +    hvstate->sprg[1] = env->spr[SPR_SPRG1];
> +    hvstate->sprg[2] = env->spr[SPR_SPRG2];
> +    hvstate->sprg[3] = env->spr[SPR_SPRG3];
> +    hvstate->pidr = env->spr[SPR_BOOKS_PID];
> +    hvstate->ppr = env->spr[SPR_PPR];
> +
> +    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
> +
> +    len = sizeof(*regs);
> +    regs = cpu_physical_memory_map(regs_ptr, &len, true);
> +    if (!regs || len != sizeof(*regs)) {
> +        r3_return = H_P2;
> +        goto out_restore_l1;
> +    }
> +
> +    len = sizeof(env->gpr);
> +    assert(len == sizeof(regs->gpr));
> +    memcpy(regs->gpr, env->gpr, len);
> +
> +    regs->link = env->lr;
> +    regs->ctr = env->ctr;
> +    regs->xer = cpu_read_xer(env);
> +
> +    regs->ccr = 0;
> +    for (i = 0; i < 8; i++) {
> +        regs->ccr |= (env->crf[i] & 15) << (4 * (7 - i));
> +    }
> +
> +    if (excp == POWERPC_EXCP_MCHECK ||
> +        excp == POWERPC_EXCP_RESET ||
> +        excp == POWERPC_EXCP_SYSCALL) {
> +        regs->nip = env->spr[SPR_SRR0];
> +        regs->msr = env->spr[SPR_SRR1];
> +    } else {
> +        regs->nip = env->spr[SPR_HSRR0];
> +        regs->msr = env->spr[SPR_HSRR1];
> +    }
> +    // XXX must msr be masked?
> +
> +    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
> +
> +out_restore_l1:
> +    memcpy(env->gpr, cpu->nested_host_state->gpr, sizeof(env->gpr));
> +    env->lr = cpu->nested_host_state->lr;
> +    env->ctr = cpu->nested_host_state->ctr;
> +    memcpy(env->crf, cpu->nested_host_state->crf, sizeof(env->crf));
> +    env->cfar = cpu->nested_host_state->cfar;
> +    env->xer = cpu->nested_host_state->xer;
> +    env->so = cpu->nested_host_state->so;
> +    env->ov = cpu->nested_host_state->ov;
> +    env->ov32 = cpu->nested_host_state->ov32;
> +    env->ca32 = cpu->nested_host_state->ca32;
> +    env->msr = cpu->nested_host_state->msr;
> +    env->nip = cpu->nested_host_state->nip;
> +
> +    assert(env->spr[SPR_LPIDR] != 0);
> +    env->spr[SPR_LPCR] = cpu->nested_host_state->spr[SPR_LPCR];
> +    env->spr[SPR_LPIDR] = cpu->nested_host_state->spr[SPR_LPIDR];
> +    env->spr[SPR_PCR] = cpu->nested_host_state->spr[SPR_PCR];
> +//    env->spr[SPR_AMOR] = cpu->nested_host_state->spr[SPR_AMOR];
> +    env->spr[SPR_DPDES] = 0;
> +    env->spr[SPR_HFSCR] = cpu->nested_host_state->spr[SPR_HFSCR];
> +//  DAWRetc, CIABR, [S]PURR, IC
> +    env->spr[SPR_VTB] = cpu->nested_host_state->spr[SPR_VTB];
> +//    env->spr[SPR_HEIR] = hv_state.heir;
> +    env->spr[SPR_SRR0] = cpu->nested_host_state->spr[SPR_SRR0];
> +    env->spr[SPR_SRR1] = cpu->nested_host_state->spr[SPR_SRR1];
> +    env->spr[SPR_SPRG0] = cpu->nested_host_state->spr[SPR_SPRG0];
> +    env->spr[SPR_SPRG1] = cpu->nested_host_state->spr[SPR_SPRG1];
> +    env->spr[SPR_SPRG2] = cpu->nested_host_state->spr[SPR_SPRG2];
> +    env->spr[SPR_SPRG3] = cpu->nested_host_state->spr[SPR_SPRG3];
> +    env->spr[SPR_BOOKS_PID] = cpu->nested_host_state->spr[SPR_BOOKS_PID];
> +    env->spr[SPR_PPR] = cpu->nested_host_state->spr[SPR_PPR];
> +
> +    g_free(cpu->nested_host_state);
> +    cpu->nested_host_state = NULL;
> +
> +    /*
> +     * Return the interrupt vector address from H_ENTER_NESTED to the L1
> +     * (or error code).
> +     */
> +    env->gpr[3] = r3_return;
> +
> +    hreg_compute_hflags(env);
> +    tlb_flush(cs);
> +}
> +
>   static void hypercall_register_types(void)
>   {
>       hypercall_register_softmmu();
> @@ -1552,6 +1864,10 @@ static void hypercall_register_types(void)
>       spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>   
>       spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
> +
> +    spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
> +    spapr_register_hypercall(KVMPPC_H_TLB_INVALIDATE, h_tlb_invalidate);
> +    spapr_register_hypercall(KVMPPC_H_COPY_TOFROM_GUEST, h_copy_tofrom_guest);
>   }
>   
>   type_init(hypercall_register_types)
> diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
> index 93e614cffd..fcf9e495a0 100644
> --- a/include/hw/ppc/ppc.h
> +++ b/include/hw/ppc/ppc.h
> @@ -54,6 +54,9 @@ struct ppc_tb_t {
>   
>   uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
>   clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
> +void cpu_ppc_hdecr_init (CPUPPCState *env);
> +void cpu_ppc_hdecr_exit (CPUPPCState *env);
> +
>   /* Embedded PowerPC DCR management */
>   typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
>   typedef void (*dcr_write_cb)(void *opaque, int dcrn, uint32_t val);
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ee7504b976..3a9fa4c024 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -197,6 +197,9 @@ struct SpaprMachineState {
>       bool has_graphics;
>       uint32_t vsmt;       /* Virtual SMT mode (KVM's "core stride") */
>   
> +    /* Nested HV support (TCG only) */
> +    uint64_t nested_ptcr;
> +
>       Notifier epow_notifier;
>       QTAILQ_HEAD(, SpaprEventLogEntry) pending_events;
>       bool use_hotplug_event_source;
> @@ -577,7 +580,14 @@ struct SpaprMachineState {
>   #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
>   /* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
>   #define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
> -#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
> +
> +/* Platform-specific hcalls used for nested HV KVM */
> +#define KVMPPC_H_SET_PARTITION_TABLE   (KVMPPC_HCALL_BASE + 0x800)
> +#define KVMPPC_H_ENTER_NESTED          (KVMPPC_HCALL_BASE + 0x804)
> +#define KVMPPC_H_TLB_INVALIDATE        (KVMPPC_HCALL_BASE + 0x808)
> +#define KVMPPC_H_COPY_TOFROM_GUEST     (KVMPPC_HCALL_BASE + 0x80C)
> +
> +#define KVMPPC_HCALL_MAX        KVMPPC_H_COPY_TOFROM_GUEST
>   
>   /*
>    * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
> @@ -587,6 +597,65 @@ struct SpaprMachineState {
>   #define SVM_H_TPM_COMM              0xEF10
>   #define SVM_HCALL_MAX               SVM_H_TPM_COMM
>   
> +/*
> + * Register state for entering a nested guest with H_ENTER_NESTED.
> + * New member must be added at the end.
> + */
> +struct kvmppc_hv_guest_state {
> +    uint64_t version;        /* version of this structure layout, must be first */
> +    uint32_t lpid;
> +    uint32_t vcpu_token;
> +    /* These registers are hypervisor privileged (at least for writing) */
> +    uint64_t lpcr;
> +    uint64_t pcr;
> +    uint64_t amor;
> +    uint64_t dpdes;
> +    uint64_t hfscr;
> +    int64_t tb_offset;
> +    uint64_t dawr0;
> +    uint64_t dawrx0;
> +    uint64_t ciabr;
> +    uint64_t hdec_expiry;
> +    uint64_t purr;
> +    uint64_t spurr;
> +    uint64_t ic;
> +    uint64_t vtb;
> +    uint64_t hdar;
> +    uint64_t hdsisr;
> +    uint64_t heir;
> +    uint64_t asdr;
> +    /* These are OS privileged but need to be set late in guest entry */
> +    uint64_t srr0;
> +    uint64_t srr1;
> +    uint64_t sprg[4];
> +    uint64_t pidr;
> +    uint64_t cfar;
> +    uint64_t ppr;
> +    /* Version 1 ends here */
> +    uint64_t dawr1;
> +    uint64_t dawrx1;
> +    /* Version 2 ends here */
> +};
> +
> +/* Latest version of hv_guest_state structure */
> +#define HV_GUEST_STATE_VERSION  2
> +
> +/* 64-bit powerpc pt_regs struct used by nested HV */
> +struct kvmppc_pt_regs {
> +    uint64_t gpr[32];
> +    uint64_t nip;
> +    uint64_t msr;
> +    uint64_t orig_gpr3;    /* Used for restarting system calls */
> +    uint64_t ctr;
> +    uint64_t link;
> +    uint64_t xer;
> +    uint64_t ccr;
> +    uint64_t softe;        /* Soft enabled/disabled */
> +    uint64_t trap;         /* Reason for being here */
> +    uint64_t dar;          /* Fault registers */
> +    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
> +    uint64_t result;       /* Result of a system call */
> +};
>   
>   typedef struct SpaprDeviceTreeUpdateHeader {
>       uint32_t version_id;
> @@ -604,6 +673,10 @@ typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, SpaprMachineState *sm,
>   void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
>   target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>                                target_ulong *args);
> +
> +void spapr_enter_nested(PowerPCCPU *cpu);
> +void spapr_exit_nested(PowerPCCPU *cpu, int excp);
> +
>   target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                            target_ulong shift);
>   target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState *spapr,
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index dcd83b503c..1806a8e776 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1306,6 +1306,9 @@ struct PowerPCCPU {
>       bool pre_2_10_migration;
>       bool pre_3_0_migration;
>       int32_t mig_slb_nr;
> +
> +    bool in_spapr_nested;
> +    CPUPPCState *nested_host_state;
>   };
>   
>   
> @@ -1316,6 +1319,9 @@ PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc);
>   #ifndef CONFIG_USER_ONLY
>   struct PPCVirtualHypervisorClass {
>       InterfaceClass parent;
> +    bool (*cpu_in_nested)(PowerPCCPU *cpu);
> +    target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid);
> +    void (*exit_nested)(PowerPCCPU *cpu, int excp);
>       void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu);
>       hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp);
>       const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp,
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index c107953dec..239c253dbc 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -361,8 +361,8 @@ static void ppc_excp_apply_ail(PowerPCCPU *cpu, int excp_model, int excp,
>   #endif
>   }
>   
> -static void powerpc_set_excp_state(PowerPCCPU *cpu,
> -                                          target_ulong vector, target_ulong msr)
> +static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp,
> +                                   target_ulong vector, target_ulong msr)
>   {
>       CPUState *cs = CPU(cpu);
>       CPUPPCState *env = &cpu->env;
> @@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu,
>        * will prevent setting of the HV bit which some exceptions might need
>        * to do.
>        */
> -    env->msr = msr & env->msr_mask;
> -    hreg_compute_hflags(env);
> -    env->nip = vector;
> +    if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
> +        PPCVirtualHypervisorClass *vhc =
> +            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
> +        // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
> +        vhc->exit_nested(cpu, excp);
> +    } else {
> +        env->nip = vector;
> +        env->msr = msr & env->msr_mask;
> +        hreg_compute_hflags(env);
> +    }
> +
>       /* Reset exception state */
>       cs->exception_index = POWERPC_EXCP_NONE;
>       env->error_code = 0;
> @@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
>       /* Save MSR */
>       env->spr[srr1] = msr;
>   
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>   }
>   
>   static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
> @@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
>       /* Save MSR */
>       env->spr[SPR_SRR1] = msr;
>   
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>   }
>   
>   #ifdef TARGET_PPC64
> @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>           env->nip += 4;
>   
>           /* "PAPR mode" built-in hypercall emulation */
> -        if ((lev == 1) && cpu->vhyp) {
> +        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {

an helper for (cpu->vhyp && !cpu->in_spapr_nested) would help.

>               PPCVirtualHypervisorClass *vhc =
>                   PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
>               vhc->hypercall(cpu->vhyp, cpu);
> @@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>           break;
>       }
>   
> -    /* Sanity check */
> -    if (!(env->msr_mask & MSR_HVB)) {
> -        if (new_msr & MSR_HVB) {
> -            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> -                      "no HV support\n", excp);
> -        }
> -        if (srr0 == SPR_HSRR0) {
> -            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> -                      "no HV support\n", excp);
> -        }
> -    }
> -
>       /*
>        * Sort out endianness of interrupt, this differs depending on the
>        * CPU, the HV mode, etc...
> @@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>       /* This can update new_msr and vector if AIL applies */
>       ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>   
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
> +
> +    /* Sanity check */
> +    if (!(env->msr_mask & MSR_HVB)) {
> +        if (env->msr & MSR_HVB) {
> +            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> +                      "no HV support\n", excp);
> +        }
> +        if (0 && srr0 == SPR_HSRR0) {
> +            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> +                      "no HV support\n", excp);
> +        }
> +    }
>   }
>   #else
>   static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp)
> @@ -1517,7 +1525,7 @@ static inline void powerpc_excp_legacy(PowerPCCPU *cpu, int excp)
>       /* This can update new_msr and vector if AIL applies */
>       ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>   
> -    powerpc_set_excp_state(cpu, vector, new_msr);
> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>   }
>   
>   static void powerpc_excp(PowerPCCPU *cpu, int excp)
> @@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env)
>           /* HEIC blocks delivery to the hypervisor */
>           if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
>               (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
> -            powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> +            if (cpu->in_spapr_nested) {
> +                powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
> +            } else {
> +                powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> +            }
>               return;
>           }
>       }
> @@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector)
>           msr |= (1ULL << MSR_LE);
>       }
>   
> -    powerpc_set_excp_state(cpu, vector, msr);
> +    powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
>   }
>   
>   bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
> index 5b12cb03c9..7f92606522 100644
> --- a/target/ppc/helper_regs.c
> +++ b/target/ppc/helper_regs.c
> @@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
>           immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
>           dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
>       } else {
> +        /* Could have nested IDX instead of HV to avoid tlb flush on nested enter/exit? */

yes.

>           dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0;
>           immu_idx = dmmu_idx;
>           immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;
> diff --git a/target/ppc/mmu-book3s-v3.c b/target/ppc/mmu-book3s-v3.c
> index f4985bae78..0810be3668 100644
> --- a/target/ppc/mmu-book3s-v3.c
> +++ b/target/ppc/mmu-book3s-v3.c
> @@ -25,8 +25,23 @@
>   
>   bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry)
>   {
> -    uint64_t patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
> -    uint64_t pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
> +    uint64_t patb, pats;
> +
> +    if (cpu->vhyp) {
> +        PPCVirtualHypervisorClass *vhc =
> +            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
> +        target_ulong nested_ptcr;
> +
> +        assert(cpu->in_spapr_nested);
> +
> +        nested_ptcr = vhc->get_nested_ptcr(cpu, lpid);
> +
> +        patb = nested_ptcr & PTCR_PATB;
> +        pats = nested_ptcr & PTCR_PATS;
> +    } else {
> +        patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
> +        pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
> +    }
>   
>       /* Calculate number of entries */
>       pats = 1ull << (pats + 12 - 4);
> @@ -38,5 +53,6 @@ bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry)
>       patb += 16 * lpid;
>       entry->dw0 = ldq_phys(CPU(cpu)->as, patb);
>       entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8);
> +
>       return true;
>   }
> diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
> index 54fb3ce98d..6304a23d05 100644
> --- a/target/ppc/mmu-radix64.c
> +++ b/target/ppc/mmu-radix64.c
> @@ -304,7 +304,7 @@ static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, ppc_v3_pate_t *pate)
>       if (!(pate->dw0 & PATE0_HR)) {
>           return false;
>       }
> -    if (lpid == 0 && !msr_hv) {
> +    if (lpid == 0 && (!msr_hv && !(cpu->vhyp && cpu->in_spapr_nested))) {
>           return false;
>       }
>       if ((pate->dw0 & PATE1_R_PRTS) < 5) {
> @@ -336,6 +336,7 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
>                     g_raddr);
>   
>       *h_page_size = PRTBE_R_GET_RTS(pate.dw0);
> +
>       /* No valid pte or access denied due to protection */
>       if (ppc_radix64_walk_tree(CPU(cpu)->as, g_raddr, pate.dw0 & PRTBE_R_RPDB,
>                                 pate.dw0 & PRTBE_R_RPDS, h_raddr, h_page_size,
> @@ -389,7 +390,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>       }
>       prtbe_addr = (pate.dw1 & PATE1_R_PRTB) + offset;
>   
> -    if (cpu->vhyp) {
> +    if (cpu->vhyp && !cpu->in_spapr_nested) {
>           prtbe0 = ldq_phys(cs->as, prtbe_addr);
>       } else {
>           /*
> @@ -415,7 +416,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>       *g_page_size = PRTBE_R_GET_RTS(prtbe0);
>       base_addr = prtbe0 & PRTBE_R_RPDB;
>       nls = prtbe0 & PRTBE_R_RPDS;
> -    if (msr_hv || cpu->vhyp) {
> +    if (msr_hv || (cpu->vhyp && !cpu->in_spapr_nested)) {
>           /*
>            * Can treat process table addresses as real addresses
>            */
> @@ -519,7 +520,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>       relocation = !mmuidx_real(mmu_idx);
>   
>       /* HV or virtual hypervisor Real Mode Access */
> -    if (!relocation && (mmuidx_hv(mmu_idx) || cpu->vhyp)) {
> +    if (!relocation && (mmuidx_hv(mmu_idx) || (cpu->vhyp && !cpu->in_spapr_nested))) {
>           /* In real mode top 4 effective addr bits (mostly) ignored */
>           *raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
>   
> @@ -553,7 +554,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>       }
>   
>       /* Get Process Table */
> -    if (cpu->vhyp) {
> +    if (cpu->vhyp && !cpu->in_spapr_nested) {
>           PPCVirtualHypervisorClass *vhc;
>           vhc = PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
>           vhc->get_pate(cpu->vhyp, &pate);
> @@ -596,7 +597,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>           g_raddr = eaddr & R_EADDR_MASK;
>       }
>   
> -    if (cpu->vhyp) {
> +    if (cpu->vhyp && !cpu->in_spapr_nested) {
>           *raddr = g_raddr;
>       } else {
>           /*
> @@ -604,7 +605,7 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>            * quadrants 1 or 2. Translates a guest real address to a host
>            * real address.
>            */
> -        if (lpid || !mmuidx_hv(mmu_idx)) {
> +        if (lpid || !mmuidx_hv(mmu_idx) || cpu->in_spapr_nested) {
>               int ret;
>   
>               ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
>
Nicholas Piggin Feb. 14, 2022, 11:24 p.m. UTC | #3
Excerpts from Fabiano Rosas's message of February 15, 2022 12:32 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> This implements the nested-hv hcall API for spapr under TCG.
>> It's still a bit rough around the edges, concept seems to work.
>>
>> Some HV exceptions can be raised now in the TCG spapr machine when
>> running a nested guest. The main ones are the lev==1 syscall, the
>> hdecr, hdsi and hisi, and h_virt external interrupts. These are
>> dealt with in the interrupt delivery code by noticing MSR[HV] raised
>> and instead of switching the machine to HV mode, it exits the
>> H_ENTER_NESTED hcall with the interrupt vector as return value as
>> required by the hcall API.
>>
>> Address translation is provided by the 2-level page table walker
>> that is implemented for the pnv machine. The partition scope page
>> table is pointed to the L1's partition scope, and a few tests have
>> to take into account that nested-hv translations are 2-level. This
>> could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
>> but it's surprisingly little code.
>>
>> There is no TLB tagging between L1 and L2 translations at the moment
>> so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).
>>
>> XXX: stop doing atomic RC on page table walks (not for nested but in general)
>>
>> not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Hi Nick,
> 
> Sorry it took me so long to get to this. It think overall it looks
> good. I have some comments about code structure that you probably
> already know about.

Great thanks for the comments.

> I don't have answers to your XXX questions but I think it's ok to handle
> them later individually as they come up, after this is merged.

I've been able to remove most of them I think. I'll repost soon.

>> ---
>>  hw/ppc/ppc.c               |  20 +++
>>  hw/ppc/spapr.c             |  16 ++
>>  hw/ppc/spapr_caps.c        |   5 +-
>>  hw/ppc/spapr_hcall.c       | 316 +++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/ppc.h       |   3 +
>>  include/hw/ppc/spapr.h     |  75 ++++++++-
>>  target/ppc/cpu.h           |   6 +
>>  target/ppc/excp_helper.c   |  60 ++++---
>>  target/ppc/helper_regs.c   |   1 +
>>  target/ppc/mmu-book3s-v3.c |  20 ++-
>>  target/ppc/mmu-radix64.c   |  15 +-
>>  11 files changed, 499 insertions(+), 38 deletions(-)
>>
>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>> index a7c262db93..135900a6f4 100644
>> --- a/hw/ppc/ppc.c
>> +++ b/hw/ppc/ppc.c
>> @@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>      return &cpu_ppc_set_tb_clk;
>>  }
>>  
>> +void cpu_ppc_hdecr_init (CPUPPCState *env)
>> +{
>> +    PowerPCCPU *cpu = env_archcpu(env);
>> +
>> +    assert(env->tb_env->hdecr_timer == NULL);
>> +
>> +    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb,
>> +                                                 cpu);
>> +}
>> +
>> +void cpu_ppc_hdecr_exit (CPUPPCState *env)
>> +{
>> +    PowerPCCPU *cpu = env_archcpu(env);
>> +
>> +    timer_free(env->tb_env->hdecr_timer);
>> +    env->tb_env->hdecr_timer = NULL;
>> +
>> +    cpu_ppc_hdecr_lower(cpu);
>> +}
>> +
>>  /* Specific helpers for POWER & PowerPC 601 RTC */
>>  void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
>>  {
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 3d6ec309dd..f0c3f726f2 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -1273,6 +1273,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
>>      if (msr_pr) {
>>          hcall_dprintf("Hypercall made with MSR[PR]=1\n");
>>          env->gpr[3] = H_PRIVILEGE;
>> +    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
>> +        spapr_enter_nested(cpu);
> 
> This looks like it could be plumbed along with spapr_hypercall below.

Hmm. when "QEMU" returns from this hypercall, TCG should be running the
L2 guest with L2 registers but the L1 itself has not returned from the
hcall yet.

So when I wrote it first the idea was that we don't want to overwrite
the L2's r3 register with env->gpr[3] return value... But now you make
me think about it the hcall implementation could return the L2's r3
value here.

Interesting point. I'll try it.

>> +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
>> +                               SpaprMachineState *spapr,
>> +                               target_ulong opcode,
>> +                               target_ulong *args)
>> +{
>> +    /*
>> +     * This HCALL is not required, L1 KVM will take a slow path and walk the
>> +     * page tables manually to do the data copy.
>> +     */
> 
> Kind of unrelated to the patch itself but I thought the point of this
> hcall was less about performance and more about Ln not having access to
> the KVM memslots for the Ln+2.
> 
> KVM accesses a guest address space by using the GFN to find the memory
> slot and from there the userspace address that is part of the
> memslot. Accesses from Ln -> Ln+1 can use that mechanism because any Ln
> will have slots registered for its guests.
> 
> For L0 -> L2, we can go via radix quadrants because the L0 is the real
> HV (that's how L0 handles this hcall after all).
> 
> For L1 -> L3, we need the hcall since L1 cannot access quadrants 1 and 2
> and it does not have the memslot so it cannot use the QEMU address.
> 
> Does that make sense?

AFAIKS it's not used for Ln->Ln+2, only for Ln->Ln+1. In kvmppc_ld for
example the load_from_eaddr fast path uses this (or quadrants in case of
L0), and the fallback does kvmppc_xlate.

This patch did boot a L3 up to busybox, so it seems not to be required.

>> +void spapr_enter_nested(PowerPCCPU *cpu)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>> +    CPUState *cs = CPU(cpu);
>> +    CPUPPCState *env = &cpu->env;
>> +    target_ulong hv_ptr = env->gpr[4];
>> +    target_ulong regs_ptr = env->gpr[5];
>> +    target_ulong hdec, now = cpu_ppc_load_tbl(env);
>> +    struct kvmppc_hv_guest_state *hvstate;
>> +    struct kvmppc_hv_guest_state hv_state;
>> +    struct kvmppc_pt_regs *regs;
>> +    hwaddr len;
>> +    uint32_t cr;
>> +    int i;
>> +
>> +    if (cpu->in_spapr_nested) {
>> +        env->gpr[3] = H_FUNCTION;
>> +        return;
>> +    }
>> +    if (spapr->nested_ptcr == 0) {
>> +        env->gpr[3] = H_NOT_AVAILABLE;
>> +        return;
>> +    }
>> +
>> +    len = sizeof(*hvstate);
>> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
>> +    if (!hvstate || len != sizeof(*hvstate)) {
>> +        env->gpr[3] = H_PARAMETER;
>> +        return;
>> +    }
>> +
>> +    memcpy(&hv_state, hvstate, len);
>> +
>> +    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
>> +
>> +    if (hv_state.version != HV_GUEST_STATE_VERSION) {
> 
> KVM uses > here. Should we do the same? Daniel and I have been
> discussing if we need some sort of version negotiation for nested KVM
> since recently someone tried to use an L0 version 1 with an L1 version
> 2:
> 
> https://gitlab.com/qemu-project/qemu/-/issues/860

Good question, I'm not quite sure how that would all be handled.
Actually we don't use the version 2 registers AFAIKS so we could
support both.

> 
>> +        env->gpr[3] = H_PARAMETER;
>> +        return;
>> +    }
>> +
> 
> <snip>
> 
>> +/* 64-bit powerpc pt_regs struct used by nested HV */
>> +struct kvmppc_pt_regs {
>> +    uint64_t gpr[32];
>> +    uint64_t nip;
>> +    uint64_t msr;
>> +    uint64_t orig_gpr3;    /* Used for restarting system calls */
>> +    uint64_t ctr;
>> +    uint64_t link;
>> +    uint64_t xer;
>> +    uint64_t ccr;
>> +    uint64_t softe;        /* Soft enabled/disabled */
>> +    uint64_t trap;         /* Reason for being here */
>> +    uint64_t dar;          /* Fault registers */
>> +    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
> 
> There's no 4xx/booke support for spapr nested.

Yeah that's just copy/paste from Linux.

>> @@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu,
>>       * will prevent setting of the HV bit which some exceptions might need
>>       * to do.
>>       */
>> -    env->msr = msr & env->msr_mask;
>> -    hreg_compute_hflags(env);
>> -    env->nip = vector;
>> +    if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
>> +        PPCVirtualHypervisorClass *vhc =
>> +            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
>> +        // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
>> +        vhc->exit_nested(cpu, excp);
>> +    } else {
>> +        env->nip = vector;
>> +        env->msr = msr & env->msr_mask;
>> +        hreg_compute_hflags(env);
>> +    }
>> +
> 
> All of this should fit better at the end of powerpc_excp_books. Even if
> we need to duplicate it for fwnmi_machine_check.

Okay I'll try that.

>>      /* Reset exception state */
>>      cs->exception_index = POWERPC_EXCP_NONE;
>>      env->error_code = 0;
>> @@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
>>      /* Save MSR */
>>      env->spr[srr1] = msr;
>>  
>> -    powerpc_set_excp_state(cpu, vector, new_msr);
>> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>>  }
>>  
>>  static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
>> @@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
>>      /* Save MSR */
>>      env->spr[SPR_SRR1] = msr;
>>  
>> -    powerpc_set_excp_state(cpu, vector, new_msr);
>> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>>  }
>>  
>>  #ifdef TARGET_PPC64
>> @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>>          env->nip += 4;
>>  
>>          /* "PAPR mode" built-in hypercall emulation */
>> -        if ((lev == 1) && cpu->vhyp) {
>> +        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {
>>              PPCVirtualHypervisorClass *vhc =
>>                  PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
>>              vhc->hypercall(cpu->vhyp, cpu);
>> @@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>>          break;
>>      }
>>  
>> -    /* Sanity check */
>> -    if (!(env->msr_mask & MSR_HVB)) {
>> -        if (new_msr & MSR_HVB) {
>> -            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
>> -                      "no HV support\n", excp);
>> -        }
>> -        if (srr0 == SPR_HSRR0) {
>> -            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
>> -                      "no HV support\n", excp);
>> -        }
>> -    }
>> -
>>      /*
>>       * Sort out endianness of interrupt, this differs depending on the
>>       * CPU, the HV mode, etc...
>> @@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>>      /* This can update new_msr and vector if AIL applies */
>>      ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>>  
>> -    powerpc_set_excp_state(cpu, vector, new_msr);
>> +    powerpc_set_excp_state(cpu, excp, vector, new_msr);
>> +
>> +    /* Sanity check */
>> +    if (!(env->msr_mask & MSR_HVB)) {
>> +        if (env->msr & MSR_HVB) {
>> +            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
>> +                      "no HV support\n", excp);
>> +        }
> 
> FYI, this doesn't exist anymore.
> 
>> +        if (0 && srr0 == SPR_HSRR0) {
>> +            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
>> +                      "no HV support\n", excp);
>> +        }
> 
> msr_mask should have MSR_HVB independently of MSR_HV being actually set,
> so this shouldn't be causing any problems. Is it?

Hmm, I'll have to re-check that. It did crash but that might have been a
slightly different version of the test.

>> @@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env)
>>          /* HEIC blocks delivery to the hypervisor */
>>          if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
>>              (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
>> -            powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
>> +            if (cpu->in_spapr_nested) {
>> +                powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
>> +            } else {
>> +                powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
>> +            }
> 
> This function affects other CPUs, so it's better to leave this as it
> were and change the exception inside powerpc_excp_books, like we do for
> HV_EMUL.

Good point.

>>              return;
>>          }
>>      }
>> @@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector)
>>          msr |= (1ULL << MSR_LE);
>>      }
>>  
>> -    powerpc_set_excp_state(cpu, vector, msr);
>> +    powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
>>  }
>>  
>>  bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
>> diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
>> index 5b12cb03c9..7f92606522 100644
>> --- a/target/ppc/helper_regs.c
>> +++ b/target/ppc/helper_regs.c
>> @@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
>>          immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
>>          dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
>>      } else {
>> +        /* Could have nested IDX instead of HV to avoid tlb flush on nested enter/exit? */
> 
> Sounds like a good idea. However I don't know enough about softmmu stuff
> to point a direction here.

Yep, the trivial attempt (just add in_spapr_nested as a new bit in the 
mmu idx) failed :P Performance is not horrible as is though so it's not
necessary for the first version.

Thanks,
Nick
Nicholas Piggin Feb. 14, 2022, 11:28 p.m. UTC | #4
Excerpts from Cédric Le Goater's message of February 15, 2022 4:31 am:
> On 2/10/22 07:53, Nicholas Piggin wrote:
>> +void cpu_ppc_hdecr_init (CPUPPCState *env)
>> +{
>> +    PowerPCCPU *cpu = env_archcpu(env);
>> +
>> +    assert(env->tb_env->hdecr_timer == NULL);
>> +
>> +    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb,
>> +                                                 cpu);
>> +}
>> +
>> +void cpu_ppc_hdecr_exit (CPUPPCState *env)
>> +{
>> +    PowerPCCPU *cpu = env_archcpu(env);
>> +
>> +    timer_free(env->tb_env->hdecr_timer);
>> +    env->tb_env->hdecr_timer = NULL;
>> +
>> +    cpu_ppc_hdecr_lower(cpu);
>> +}
> 
> 
> So these are called every time a L2 enters or exits ?

Yes.

>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 3d6ec309dd..f0c3f726f2 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -1273,6 +1273,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
>>       if (msr_pr) {
>>           hcall_dprintf("Hypercall made with MSR[PR]=1\n");
>>           env->gpr[3] = H_PRIVILEGE;
>> +    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
>> +        spapr_enter_nested(cpu);
> 
> Can not this be in the hcall table ?

See reply to Fabiano, I think it can be.

>>       } else {
>>           env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
>>       }
>> @@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
>>       return NULL;
>>   }
>>   
>> +static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
>> +{
>> +    return cpu->in_spapr_nested;
>> +}
> 
> This handler is not very much used.

Yeah, I have improved that in the current code, hopefully doesn't 
open-code cpu->in_spapr_nested anywhere outside spapr specifics.

>> +void spapr_enter_nested(PowerPCCPU *cpu)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>> +    CPUState *cs = CPU(cpu);
>> +    CPUPPCState *env = &cpu->env;
>> +    target_ulong hv_ptr = env->gpr[4];
>> +    target_ulong regs_ptr = env->gpr[5];
>> +    target_ulong hdec, now = cpu_ppc_load_tbl(env);
>> +    struct kvmppc_hv_guest_state *hvstate;
>> +    struct kvmppc_hv_guest_state hv_state;
>> +    struct kvmppc_pt_regs *regs;
>> +    hwaddr len;
>> +    uint32_t cr;
>> +    int i;
>> +
>> +    if (cpu->in_spapr_nested) {
>> +        env->gpr[3] = H_FUNCTION;
>> +        return;
>> +    }
>> +    if (spapr->nested_ptcr == 0) {
>> +        env->gpr[3] = H_NOT_AVAILABLE;
>> +        return;
>> +    }
>> +
>> +    len = sizeof(*hvstate);
>> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, 
> 
> Are you writing to the state ? address_space_map() is a better pratice.

Yes, in exit_nested it gets written. I'll take a look at 
address_space_map().

>> @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
>>           env->nip += 4;
>>   
>>           /* "PAPR mode" built-in hypercall emulation */
>> -        if ((lev == 1) && cpu->vhyp) {
>> +        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {
> 
> an helper for (cpu->vhyp && !cpu->in_spapr_nested) would help.

Yeah, I put some helpers in the mmu code since this patch which is 
nicer. Will try to do the same for this.

Thanks,
Nick
Nicholas Piggin Feb. 15, 2022, 2:57 a.m. UTC | #5
Excerpts from Nicholas Piggin's message of February 15, 2022 9:28 am:
> Excerpts from Cédric Le Goater's message of February 15, 2022 4:31 am:
>> On 2/10/22 07:53, Nicholas Piggin wrote:
>>> +void spapr_enter_nested(PowerPCCPU *cpu)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>>> +    CPUState *cs = CPU(cpu);
>>> +    CPUPPCState *env = &cpu->env;
>>> +    target_ulong hv_ptr = env->gpr[4];
>>> +    target_ulong regs_ptr = env->gpr[5];
>>> +    target_ulong hdec, now = cpu_ppc_load_tbl(env);
>>> +    struct kvmppc_hv_guest_state *hvstate;
>>> +    struct kvmppc_hv_guest_state hv_state;
>>> +    struct kvmppc_pt_regs *regs;
>>> +    hwaddr len;
>>> +    uint32_t cr;
>>> +    int i;
>>> +
>>> +    if (cpu->in_spapr_nested) {
>>> +        env->gpr[3] = H_FUNCTION;
>>> +        return;
>>> +    }
>>> +    if (spapr->nested_ptcr == 0) {
>>> +        env->gpr[3] = H_NOT_AVAILABLE;
>>> +        return;
>>> +    }
>>> +
>>> +    len = sizeof(*hvstate);
>>> +    hvstate = cpu_physical_memory_map(hv_ptr, &len, 
>> 
>> Are you writing to the state ? address_space_map() is a better pratice.
> 
> Yes, in exit_nested it gets written. I'll take a look at 
> address_space_map().

Hmm, address_space_map() says only use it for reads OR writes. Some o 
these are doing both.

Why is it better practice to use address_space_map()? I could split the
operations out into read, then write if necessary. For now I will re
submit the series with cpu_physical_memory_map because there has been a
lot of changes and cleanups includng all your and Fabiano's suggestions
except this one so it should make it easier to review.

Thanks,
Nick
diff mbox series

Patch

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index a7c262db93..135900a6f4 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -1083,6 +1083,26 @@  clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
     return &cpu_ppc_set_tb_clk;
 }
 
+void cpu_ppc_hdecr_init (CPUPPCState *env)
+{
+    PowerPCCPU *cpu = env_archcpu(env);
+
+    assert(env->tb_env->hdecr_timer == NULL);
+
+    env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb,
+                                                 cpu);
+}
+
+void cpu_ppc_hdecr_exit (CPUPPCState *env)
+{
+    PowerPCCPU *cpu = env_archcpu(env);
+
+    timer_free(env->tb_env->hdecr_timer);
+    env->tb_env->hdecr_timer = NULL;
+
+    cpu_ppc_hdecr_lower(cpu);
+}
+
 /* Specific helpers for POWER & PowerPC 601 RTC */
 void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
 {
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..f0c3f726f2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1273,6 +1273,8 @@  static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
     if (msr_pr) {
         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
         env->gpr[3] = H_PRIVILEGE;
+    } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
+        spapr_enter_nested(cpu);
     } else {
         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
     }
@@ -4465,6 +4467,17 @@  PowerPCCPU *spapr_find_cpu(int vcpu_id)
     return NULL;
 }
 
+static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
+{
+    return cpu->in_spapr_nested;
+}
+
+static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid)
+{
+        SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+        return spapr->nested_ptcr;
+}
+
 static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
 {
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
@@ -4573,6 +4586,9 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
     fwc->get_dev_path = spapr_get_fw_dev_path;
     nc->nmi_monitor_handler = spapr_nmi;
     smc->phb_placement = spapr_phb_placement;
+    vhc->cpu_in_nested = spapr_cpu_in_nested;
+    vhc->get_nested_ptcr = spapr_get_nested_ptcr;
+    vhc->exit_nested = spapr_exit_nested;
     vhc->hypercall = emulate_spapr_hypercall;
     vhc->hpt_mask = spapr_hpt_mask;
     vhc->map_hptes = spapr_map_hptes;
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index ed7c077a0d..a665245f6f 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -450,10 +450,7 @@  static void cap_nested_kvm_hv_apply(SpaprMachineState *spapr,
         return;
     }
 
-    if (tcg_enabled()) {
-        error_setg(errp, "No Nested KVM-HV support in TCG");
-        error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n");
-    } else if (kvm_enabled()) {
+    if (!tcg_enabled()) {
         if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
                               spapr->max_compat_pvr)) {
             error_setg(errp, "Nested KVM-HV only supported on POWER9");
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 222c1b6bbd..8ffb13ada0 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -9,6 +9,7 @@ 
 #include "qemu/error-report.h"
 #include "exec/exec-all.h"
 #include "helper_regs.h"
+#include "hw/ppc/ppc.h"
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_cpu_core.h"
 #include "mmu-hash64.h"
@@ -1497,6 +1498,317 @@  static void hypercall_register_softmmu(void)
 }
 #endif
 
+/* TCG only */
+#define PRTS_MASK      0x1f
+
+static target_ulong h_set_ptbl(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    target_ulong ptcr = args[0];
+
+    if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) {
+        return H_FUNCTION;
+    }
+
+    if ((ptcr & PRTS_MASK) + 12 - 4 > 12) {
+        return H_PARAMETER;
+    }
+
+    spapr->nested_ptcr = ptcr; /* Save new partition table */
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_tlb_invalidate(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    CPUState *cs = CPU(cpu);
+
+    /*
+     * The spapr virtual hypervisor nested HV implementation retains no
+     * translation state except for TLB. This might be optimised to
+     * invalidate fewer entries, but at the moment it's not important
+     * because L1<->L2 transitions always flush the entire TLB for now.
+     */
+    tlb_flush(cs);
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
+                               SpaprMachineState *spapr,
+                               target_ulong opcode,
+                               target_ulong *args)
+{
+    /*
+     * This HCALL is not required, L1 KVM will take a slow path and walk the
+     * page tables manually to do the data copy.
+     */
+    return H_FUNCTION;
+}
+
+void spapr_enter_nested(PowerPCCPU *cpu)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    target_ulong hv_ptr = env->gpr[4];
+    target_ulong regs_ptr = env->gpr[5];
+    target_ulong hdec, now = cpu_ppc_load_tbl(env);
+    struct kvmppc_hv_guest_state *hvstate;
+    struct kvmppc_hv_guest_state hv_state;
+    struct kvmppc_pt_regs *regs;
+    hwaddr len;
+    uint32_t cr;
+    int i;
+
+    if (cpu->in_spapr_nested) {
+        env->gpr[3] = H_FUNCTION;
+        return;
+    }
+    if (spapr->nested_ptcr == 0) {
+        env->gpr[3] = H_NOT_AVAILABLE;
+        return;
+    }
+
+    len = sizeof(*hvstate);
+    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
+    if (!hvstate || len != sizeof(*hvstate)) {
+        env->gpr[3] = H_PARAMETER;
+        return;
+    }
+
+    memcpy(&hv_state, hvstate, len);
+
+    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
+
+    if (hv_state.version != HV_GUEST_STATE_VERSION) {
+        env->gpr[3] = H_PARAMETER;
+        return;
+    }
+
+    cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState));
+    if (!cpu->nested_host_state) {
+        env->gpr[3] = H_NO_MEM;
+        return;
+    }
+
+    memcpy(cpu->nested_host_state, env, sizeof(CPUPPCState));
+
+    len = sizeof(*regs);
+    regs = cpu_physical_memory_map(regs_ptr, &len, true);
+    if (!regs || len != sizeof(*regs)) {
+        g_free(cpu->nested_host_state);
+        env->gpr[3] = H_P2;
+        return;
+    }
+
+    len = sizeof(env->gpr);
+    assert(len == sizeof(regs->gpr));
+    memcpy(env->gpr, regs->gpr, len);
+
+    env->lr = regs->link;
+    env->ctr = regs->ctr;
+    cpu_write_xer(env, regs->xer);
+
+    cr = regs->ccr;
+    for (i = 7; i >= 0; i--) {
+        env->crf[i] = cr & 15;
+        cr >>= 4;
+    }
+
+    env->msr = regs->msr;
+    env->nip = regs->nip;
+
+    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
+
+    env->cfar = hv_state.cfar;
+
+    assert(env->spr[SPR_LPIDR] == 0);
+    env->spr[SPR_LPCR] = hv_state.lpcr & pcc->lpcr_mask; // XXX any other mask?
+    env->spr[SPR_LPIDR] = hv_state.lpid;
+    env->spr[SPR_PCR] = hv_state.pcr;
+//    env->spr[SPR_AMOR] = hv_state.amor;
+    env->spr[SPR_DPDES] = hv_state.dpdes;
+    env->spr[SPR_HFSCR] = hv_state.hfscr;
+    hdec = hv_state.hdec_expiry - now;
+    env->tb_env->tb_offset += hv_state.tb_offset; // XXX how to deal?
+    // dec already set
+//  DAWRetc, CIABR, [S]PURR, IC
+    env->spr[SPR_VTB] = hv_state.vtb;
+//    env->spr[SPR_HEIR] = hv_state.heir;
+    env->spr[SPR_SRR0] = hv_state.srr0;
+    env->spr[SPR_SRR1] = hv_state.srr1;
+    env->spr[SPR_SPRG0] = hv_state.sprg[0];
+    env->spr[SPR_SPRG1] = hv_state.sprg[1];
+    env->spr[SPR_SPRG2] = hv_state.sprg[2];
+    env->spr[SPR_SPRG3] = hv_state.sprg[3];
+    env->spr[SPR_BOOKS_PID] = hv_state.pidr;
+    env->spr[SPR_PPR] = hv_state.ppr;
+
+    cpu_ppc_hdecr_init(env);
+    cpu_ppc_store_hdecr(env, hdec);
+
+    /*
+     * The hv_state.vcpu_token is not needed. It is used by the KVM
+     * implementation to remember which L2 vCPU last ran on which physical
+     * CPU so as to invalidate process scope translations if it is moved
+     * between physical CPUs. For now TLBs are always flushed on L1<->L2
+     * transitions so this is not a problem.
+     *
+     * Could validate that the same vcpu_token does not attempt to run on
+     * different L1 vCPUs at the same time, but that would be a L1 KVM bug
+     * and it's not obviously worth a new data structure to do it.
+     */
+
+    cpu->in_spapr_nested = true;
+
+    hreg_compute_hflags(env);
+    tlb_flush(cs);
+}
+
+void spapr_exit_nested(PowerPCCPU *cpu, int excp)
+{
+    CPUState *cs = CPU(cpu);
+    CPUPPCState *env = &cpu->env;
+    target_ulong r3_return = env->excp_vectors[excp]; // hcall return value
+    target_ulong hv_ptr = cpu->nested_host_state->gpr[4];
+    target_ulong regs_ptr = cpu->nested_host_state->gpr[5];
+    struct kvmppc_hv_guest_state *hvstate;
+    struct kvmppc_pt_regs *regs;
+    hwaddr len;
+    int i;
+
+    assert(cpu->in_spapr_nested);
+    cpu->in_spapr_nested = false;
+
+    cpu_ppc_hdecr_exit(env);
+
+    len = sizeof(*hvstate);
+    hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
+    if (!hvstate || len != sizeof(*hvstate)) {
+        r3_return = H_PARAMETER;
+        goto out_restore_l1;
+    }
+
+    //XXX check linux kvm nested CTRL reg bug?
+    //
+    env->tb_env->tb_offset -= hvstate->tb_offset;
+
+    hvstate->cfar = env->cfar;
+    hvstate->lpcr = env->spr[SPR_LPCR];
+    hvstate->pcr = env->spr[SPR_PCR];
+//    hvstate->amor = env->spr[SPR_AMOR];
+    hvstate->dpdes = env->spr[SPR_DPDES];
+    hvstate->hfscr = env->spr[SPR_HFSCR];
+//    hvstate-> = // dec already set
+//  DAWRetc, CIABR, [S]PURR, IC
+    hvstate->vtb = env->spr[SPR_VTB];
+
+    if (excp == POWERPC_EXCP_HDSI) {
+        hvstate->hdar = env->spr[SPR_HDAR];
+        hvstate->hdsisr = env->spr[SPR_HDSISR];
+        hvstate->asdr = env->spr[SPR_ASDR];
+    } else if (excp == POWERPC_EXCP_HISI) {
+        hvstate->asdr = env->spr[SPR_ASDR];
+    }
+
+//    hvstate->heir = env->spr[SPR_HEIR]; XXX HEIR?
+    hvstate->srr0 = env->spr[SPR_SRR0];
+    hvstate->srr1 = env->spr[SPR_SRR1];
+    hvstate->sprg[0] = env->spr[SPR_SPRG0];
+    hvstate->sprg[1] = env->spr[SPR_SPRG1];
+    hvstate->sprg[2] = env->spr[SPR_SPRG2];
+    hvstate->sprg[3] = env->spr[SPR_SPRG3];
+    hvstate->pidr = env->spr[SPR_BOOKS_PID];
+    hvstate->ppr = env->spr[SPR_PPR];
+
+    cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len */);
+
+    len = sizeof(*regs);
+    regs = cpu_physical_memory_map(regs_ptr, &len, true);
+    if (!regs || len != sizeof(*regs)) {
+        r3_return = H_P2;
+        goto out_restore_l1;
+    }
+
+    len = sizeof(env->gpr);
+    assert(len == sizeof(regs->gpr));
+    memcpy(regs->gpr, env->gpr, len);
+
+    regs->link = env->lr;
+    regs->ctr = env->ctr;
+    regs->xer = cpu_read_xer(env);
+
+    regs->ccr = 0;
+    for (i = 0; i < 8; i++) {
+        regs->ccr |= (env->crf[i] & 15) << (4 * (7 - i));
+    }
+
+    if (excp == POWERPC_EXCP_MCHECK ||
+        excp == POWERPC_EXCP_RESET ||
+        excp == POWERPC_EXCP_SYSCALL) {
+        regs->nip = env->spr[SPR_SRR0];
+        regs->msr = env->spr[SPR_SRR1];
+    } else {
+        regs->nip = env->spr[SPR_HSRR0];
+        regs->msr = env->spr[SPR_HSRR1];
+    }
+    // XXX must msr be masked?
+
+    cpu_physical_memory_unmap(regs, len, 0 /* read */, len /* access len */);
+
+out_restore_l1:
+    memcpy(env->gpr, cpu->nested_host_state->gpr, sizeof(env->gpr));
+    env->lr = cpu->nested_host_state->lr;
+    env->ctr = cpu->nested_host_state->ctr;
+    memcpy(env->crf, cpu->nested_host_state->crf, sizeof(env->crf));
+    env->cfar = cpu->nested_host_state->cfar;
+    env->xer = cpu->nested_host_state->xer;
+    env->so = cpu->nested_host_state->so;
+    env->ov = cpu->nested_host_state->ov;
+    env->ov32 = cpu->nested_host_state->ov32;
+    env->ca32 = cpu->nested_host_state->ca32;
+    env->msr = cpu->nested_host_state->msr;
+    env->nip = cpu->nested_host_state->nip;
+
+    assert(env->spr[SPR_LPIDR] != 0);
+    env->spr[SPR_LPCR] = cpu->nested_host_state->spr[SPR_LPCR];
+    env->spr[SPR_LPIDR] = cpu->nested_host_state->spr[SPR_LPIDR];
+    env->spr[SPR_PCR] = cpu->nested_host_state->spr[SPR_PCR];
+//    env->spr[SPR_AMOR] = cpu->nested_host_state->spr[SPR_AMOR];
+    env->spr[SPR_DPDES] = 0;
+    env->spr[SPR_HFSCR] = cpu->nested_host_state->spr[SPR_HFSCR];
+//  DAWRetc, CIABR, [S]PURR, IC
+    env->spr[SPR_VTB] = cpu->nested_host_state->spr[SPR_VTB];
+//    env->spr[SPR_HEIR] = hv_state.heir;
+    env->spr[SPR_SRR0] = cpu->nested_host_state->spr[SPR_SRR0];
+    env->spr[SPR_SRR1] = cpu->nested_host_state->spr[SPR_SRR1];
+    env->spr[SPR_SPRG0] = cpu->nested_host_state->spr[SPR_SPRG0];
+    env->spr[SPR_SPRG1] = cpu->nested_host_state->spr[SPR_SPRG1];
+    env->spr[SPR_SPRG2] = cpu->nested_host_state->spr[SPR_SPRG2];
+    env->spr[SPR_SPRG3] = cpu->nested_host_state->spr[SPR_SPRG3];
+    env->spr[SPR_BOOKS_PID] = cpu->nested_host_state->spr[SPR_BOOKS_PID];
+    env->spr[SPR_PPR] = cpu->nested_host_state->spr[SPR_PPR];
+
+    g_free(cpu->nested_host_state);
+    cpu->nested_host_state = NULL;
+
+    /*
+     * Return the interrupt vector address from H_ENTER_NESTED to the L1
+     * (or error code).
+     */
+    env->gpr[3] = r3_return;
+
+    hreg_compute_hflags(env);
+    tlb_flush(cs);
+}
+
 static void hypercall_register_types(void)
 {
     hypercall_register_softmmu();
@@ -1552,6 +1864,10 @@  static void hypercall_register_types(void)
     spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
 
     spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt);
+
+    spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl);
+    spapr_register_hypercall(KVMPPC_H_TLB_INVALIDATE, h_tlb_invalidate);
+    spapr_register_hypercall(KVMPPC_H_COPY_TOFROM_GUEST, h_copy_tofrom_guest);
 }
 
 type_init(hypercall_register_types)
diff --git a/include/hw/ppc/ppc.h b/include/hw/ppc/ppc.h
index 93e614cffd..fcf9e495a0 100644
--- a/include/hw/ppc/ppc.h
+++ b/include/hw/ppc/ppc.h
@@ -54,6 +54,9 @@  struct ppc_tb_t {
 
 uint64_t cpu_ppc_get_tb(ppc_tb_t *tb_env, uint64_t vmclk, int64_t tb_offset);
 clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq);
+void cpu_ppc_hdecr_init (CPUPPCState *env);
+void cpu_ppc_hdecr_exit (CPUPPCState *env);
+
 /* Embedded PowerPC DCR management */
 typedef uint32_t (*dcr_read_cb)(void *opaque, int dcrn);
 typedef void (*dcr_write_cb)(void *opaque, int dcrn, uint32_t val);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ee7504b976..3a9fa4c024 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -197,6 +197,9 @@  struct SpaprMachineState {
     bool has_graphics;
     uint32_t vsmt;       /* Virtual SMT mode (KVM's "core stride") */
 
+    /* Nested HV support (TCG only) */
+    uint64_t nested_ptcr;
+
     Notifier epow_notifier;
     QTAILQ_HEAD(, SpaprEventLogEntry) pending_events;
     bool use_hotplug_event_source;
@@ -577,7 +580,14 @@  struct SpaprMachineState {
 #define KVMPPC_H_UPDATE_DT      (KVMPPC_HCALL_BASE + 0x3)
 /* 0x4 was used for KVMPPC_H_UPDATE_PHANDLE in SLOF */
 #define KVMPPC_H_VOF_CLIENT     (KVMPPC_HCALL_BASE + 0x5)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_VOF_CLIENT
+
+/* Platform-specific hcalls used for nested HV KVM */
+#define KVMPPC_H_SET_PARTITION_TABLE   (KVMPPC_HCALL_BASE + 0x800)
+#define KVMPPC_H_ENTER_NESTED          (KVMPPC_HCALL_BASE + 0x804)
+#define KVMPPC_H_TLB_INVALIDATE        (KVMPPC_HCALL_BASE + 0x808)
+#define KVMPPC_H_COPY_TOFROM_GUEST     (KVMPPC_HCALL_BASE + 0x80C)
+
+#define KVMPPC_HCALL_MAX        KVMPPC_H_COPY_TOFROM_GUEST
 
 /*
  * The hcall range 0xEF00 to 0xEF80 is reserved for use in facilitating
@@ -587,6 +597,65 @@  struct SpaprMachineState {
 #define SVM_H_TPM_COMM              0xEF10
 #define SVM_HCALL_MAX               SVM_H_TPM_COMM
 
+/*
+ * Register state for entering a nested guest with H_ENTER_NESTED.
+ * New member must be added at the end.
+ */
+struct kvmppc_hv_guest_state {
+    uint64_t version;        /* version of this structure layout, must be first */
+    uint32_t lpid;
+    uint32_t vcpu_token;
+    /* These registers are hypervisor privileged (at least for writing) */
+    uint64_t lpcr;
+    uint64_t pcr;
+    uint64_t amor;
+    uint64_t dpdes;
+    uint64_t hfscr;
+    int64_t tb_offset;
+    uint64_t dawr0;
+    uint64_t dawrx0;
+    uint64_t ciabr;
+    uint64_t hdec_expiry;
+    uint64_t purr;
+    uint64_t spurr;
+    uint64_t ic;
+    uint64_t vtb;
+    uint64_t hdar;
+    uint64_t hdsisr;
+    uint64_t heir;
+    uint64_t asdr;
+    /* These are OS privileged but need to be set late in guest entry */
+    uint64_t srr0;
+    uint64_t srr1;
+    uint64_t sprg[4];
+    uint64_t pidr;
+    uint64_t cfar;
+    uint64_t ppr;
+    /* Version 1 ends here */
+    uint64_t dawr1;
+    uint64_t dawrx1;
+    /* Version 2 ends here */
+};
+
+/* Latest version of hv_guest_state structure */
+#define HV_GUEST_STATE_VERSION  2
+
+/* 64-bit powerpc pt_regs struct used by nested HV */
+struct kvmppc_pt_regs {
+    uint64_t gpr[32];
+    uint64_t nip;
+    uint64_t msr;
+    uint64_t orig_gpr3;    /* Used for restarting system calls */
+    uint64_t ctr;
+    uint64_t link;
+    uint64_t xer;
+    uint64_t ccr;
+    uint64_t softe;        /* Soft enabled/disabled */
+    uint64_t trap;         /* Reason for being here */
+    uint64_t dar;          /* Fault registers */
+    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
+    uint64_t result;       /* Result of a system call */
+};
 
 typedef struct SpaprDeviceTreeUpdateHeader {
     uint32_t version_id;
@@ -604,6 +673,10 @@  typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, SpaprMachineState *sm,
 void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
 target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
                              target_ulong *args);
+
+void spapr_enter_nested(PowerPCCPU *cpu);
+void spapr_exit_nested(PowerPCCPU *cpu, int excp);
+
 target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                          target_ulong shift);
 target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState *spapr,
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index dcd83b503c..1806a8e776 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1306,6 +1306,9 @@  struct PowerPCCPU {
     bool pre_2_10_migration;
     bool pre_3_0_migration;
     int32_t mig_slb_nr;
+
+    bool in_spapr_nested;
+    CPUPPCState *nested_host_state;
 };
 
 
@@ -1316,6 +1319,9 @@  PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc);
 #ifndef CONFIG_USER_ONLY
 struct PPCVirtualHypervisorClass {
     InterfaceClass parent;
+    bool (*cpu_in_nested)(PowerPCCPU *cpu);
+    target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid);
+    void (*exit_nested)(PowerPCCPU *cpu, int excp);
     void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu);
     hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp);
     const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp,
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index c107953dec..239c253dbc 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -361,8 +361,8 @@  static void ppc_excp_apply_ail(PowerPCCPU *cpu, int excp_model, int excp,
 #endif
 }
 
-static void powerpc_set_excp_state(PowerPCCPU *cpu,
-                                          target_ulong vector, target_ulong msr)
+static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp,
+                                   target_ulong vector, target_ulong msr)
 {
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
@@ -375,9 +375,17 @@  static void powerpc_set_excp_state(PowerPCCPU *cpu,
      * will prevent setting of the HV bit which some exceptions might need
      * to do.
      */
-    env->msr = msr & env->msr_mask;
-    hreg_compute_hflags(env);
-    env->nip = vector;
+    if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
+        PPCVirtualHypervisorClass *vhc =
+            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
+        // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
+        vhc->exit_nested(cpu, excp);
+    } else {
+        env->nip = vector;
+        env->msr = msr & env->msr_mask;
+        hreg_compute_hflags(env);
+    }
+
     /* Reset exception state */
     cs->exception_index = POWERPC_EXCP_NONE;
     env->error_code = 0;
@@ -548,7 +556,7 @@  static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
     /* Save MSR */
     env->spr[srr1] = msr;
 
-    powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
 }
 
 static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
@@ -742,7 +750,7 @@  static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
     /* Save MSR */
     env->spr[SPR_SRR1] = msr;
 
-    powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
 }
 
 #ifdef TARGET_PPC64
@@ -916,7 +924,7 @@  static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
         env->nip += 4;
 
         /* "PAPR mode" built-in hypercall emulation */
-        if ((lev == 1) && cpu->vhyp) {
+        if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {
             PPCVirtualHypervisorClass *vhc =
                 PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
             vhc->hypercall(cpu->vhyp, cpu);
@@ -1004,18 +1012,6 @@  static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
         break;
     }
 
-    /* Sanity check */
-    if (!(env->msr_mask & MSR_HVB)) {
-        if (new_msr & MSR_HVB) {
-            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
-                      "no HV support\n", excp);
-        }
-        if (srr0 == SPR_HSRR0) {
-            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
-                      "no HV support\n", excp);
-        }
-    }
-
     /*
      * Sort out endianness of interrupt, this differs depending on the
      * CPU, the HV mode, etc...
@@ -1037,7 +1033,19 @@  static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
     /* This can update new_msr and vector if AIL applies */
     ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
 
-    powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
+
+    /* Sanity check */
+    if (!(env->msr_mask & MSR_HVB)) {
+        if (env->msr & MSR_HVB) {
+            cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
+                      "no HV support\n", excp);
+        }
+        if (0 && srr0 == SPR_HSRR0) {
+            cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
+                      "no HV support\n", excp);
+        }
+    }
 }
 #else
 static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp)
@@ -1517,7 +1525,7 @@  static inline void powerpc_excp_legacy(PowerPCCPU *cpu, int excp)
     /* This can update new_msr and vector if AIL applies */
     ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
 
-    powerpc_set_excp_state(cpu, vector, new_msr);
+    powerpc_set_excp_state(cpu, excp, vector, new_msr);
 }
 
 static void powerpc_excp(PowerPCCPU *cpu, int excp)
@@ -1613,7 +1621,11 @@  static void ppc_hw_interrupt(CPUPPCState *env)
         /* HEIC blocks delivery to the hypervisor */
         if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
             (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
-            powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+            if (cpu->in_spapr_nested) {
+                powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
+            } else {
+                powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
+            }
             return;
         }
     }
@@ -1723,7 +1735,7 @@  void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector)
         msr |= (1ULL << MSR_LE);
     }
 
-    powerpc_set_excp_state(cpu, vector, msr);
+    powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
 }
 
 bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index 5b12cb03c9..7f92606522 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -163,6 +163,7 @@  static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
         immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
         dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
     } else {
+        /* Could have nested IDX instead of HV to avoid tlb flush on nested enter/exit? */
         dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0;
         immu_idx = dmmu_idx;
         immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;
diff --git a/target/ppc/mmu-book3s-v3.c b/target/ppc/mmu-book3s-v3.c
index f4985bae78..0810be3668 100644
--- a/target/ppc/mmu-book3s-v3.c
+++ b/target/ppc/mmu-book3s-v3.c
@@ -25,8 +25,23 @@ 
 
 bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry)
 {
-    uint64_t patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
-    uint64_t pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
+    uint64_t patb, pats;
+
+    if (cpu->vhyp) {
+        PPCVirtualHypervisorClass *vhc =
+            PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
+        target_ulong nested_ptcr;
+
+        assert(cpu->in_spapr_nested);
+
+        nested_ptcr = vhc->get_nested_ptcr(cpu, lpid);
+
+        patb = nested_ptcr & PTCR_PATB;
+        pats = nested_ptcr & PTCR_PATS;
+    } else {
+        patb = cpu->env.spr[SPR_PTCR] & PTCR_PATB;
+        pats = cpu->env.spr[SPR_PTCR] & PTCR_PATS;
+    }
 
     /* Calculate number of entries */
     pats = 1ull << (pats + 12 - 4);
@@ -38,5 +53,6 @@  bool ppc64_v3_get_pate(PowerPCCPU *cpu, target_ulong lpid, ppc_v3_pate_t *entry)
     patb += 16 * lpid;
     entry->dw0 = ldq_phys(CPU(cpu)->as, patb);
     entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8);
+
     return true;
 }
diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 54fb3ce98d..6304a23d05 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -304,7 +304,7 @@  static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, ppc_v3_pate_t *pate)
     if (!(pate->dw0 & PATE0_HR)) {
         return false;
     }
-    if (lpid == 0 && !msr_hv) {
+    if (lpid == 0 && (!msr_hv && !(cpu->vhyp && cpu->in_spapr_nested))) {
         return false;
     }
     if ((pate->dw0 & PATE1_R_PRTS) < 5) {
@@ -336,6 +336,7 @@  static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
                   g_raddr);
 
     *h_page_size = PRTBE_R_GET_RTS(pate.dw0);
+
     /* No valid pte or access denied due to protection */
     if (ppc_radix64_walk_tree(CPU(cpu)->as, g_raddr, pate.dw0 & PRTBE_R_RPDB,
                               pate.dw0 & PRTBE_R_RPDS, h_raddr, h_page_size,
@@ -389,7 +390,7 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
     }
     prtbe_addr = (pate.dw1 & PATE1_R_PRTB) + offset;
 
-    if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
         prtbe0 = ldq_phys(cs->as, prtbe_addr);
     } else {
         /*
@@ -415,7 +416,7 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
     *g_page_size = PRTBE_R_GET_RTS(prtbe0);
     base_addr = prtbe0 & PRTBE_R_RPDB;
     nls = prtbe0 & PRTBE_R_RPDS;
-    if (msr_hv || cpu->vhyp) {
+    if (msr_hv || (cpu->vhyp && !cpu->in_spapr_nested)) {
         /*
          * Can treat process table addresses as real addresses
          */
@@ -519,7 +520,7 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
     relocation = !mmuidx_real(mmu_idx);
 
     /* HV or virtual hypervisor Real Mode Access */
-    if (!relocation && (mmuidx_hv(mmu_idx) || cpu->vhyp)) {
+    if (!relocation && (mmuidx_hv(mmu_idx) || (cpu->vhyp && !cpu->in_spapr_nested))) {
         /* In real mode top 4 effective addr bits (mostly) ignored */
         *raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
 
@@ -553,7 +554,7 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
     }
 
     /* Get Process Table */
-    if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
         PPCVirtualHypervisorClass *vhc;
         vhc = PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
         vhc->get_pate(cpu->vhyp, &pate);
@@ -596,7 +597,7 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
         g_raddr = eaddr & R_EADDR_MASK;
     }
 
-    if (cpu->vhyp) {
+    if (cpu->vhyp && !cpu->in_spapr_nested) {
         *raddr = g_raddr;
     } else {
         /*
@@ -604,7 +605,7 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
          * quadrants 1 or 2. Translates a guest real address to a host
          * real address.
          */
-        if (lpid || !mmuidx_hv(mmu_idx)) {
+        if (lpid || !mmuidx_hv(mmu_idx) || cpu->in_spapr_nested) {
             int ret;
 
             ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,