
[v4,12/25] KVM: arm64: Add infrastructure to create and track pKVM instances at EL2

Message ID 20221017115209.2099-13-will@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: Introduce pKVM hyp VM and vCPU state at EL2

Commit Message

Will Deacon Oct. 17, 2022, 11:51 a.m. UTC
From: Fuad Tabba <tabba@google.com>

Introduce a global table (and lock) to track pKVM instances at EL2, and
provide hypercalls that can be used by the untrusted host to create and
destroy pKVM VMs and their vCPUs. pKVM VM/vCPU state is directly
accessible only by the trusted hypervisor (EL2).

Each pKVM VM is directly associated with an untrusted host KVM instance,
and is referenced by the host using an opaque handle. Future patches
will provide hypercalls to allow the host to initialize/set/get pKVM
VM/vCPU state using the opaque handle.
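
For illustration, the host side (added by later patches in this series) is
expected to drive these hypercalls roughly as follows. This is a sketch of
the calling convention only; the variable names and error handling here are
illustrative rather than taken from those patches:

  /* Create the hyp VM and get back an opaque handle for it. */
  handle = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, vm_hva, pgd_hva);
  if (handle < 0)
          return handle;

  /* Donate and initialise one hyp vCPU structure per host vCPU. */
  ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, vcpu_hva);

  /* On destruction, tear the hyp state down and reclaim the memory. */
  WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, handle));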

Tested-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h              |   3 +
 arch/arm64/include/asm/kvm_host.h             |   8 +
 arch/arm64/include/asm/kvm_pgtable.h          |   8 +
 arch/arm64/include/asm/kvm_pkvm.h             |   8 +
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   3 +
 arch/arm64/kvm/hyp/include/nvhe/pkvm.h        |  64 +++
 arch/arm64/kvm/hyp/nvhe/hyp-main.c            |  31 ++
 arch/arm64/kvm/hyp/nvhe/mem_protect.c         |  14 +
 arch/arm64/kvm/hyp/nvhe/pkvm.c                | 389 ++++++++++++++++++
 arch/arm64/kvm/hyp/nvhe/setup.c               |   8 +
 arch/arm64/kvm/hyp/pgtable.c                  |   9 +
 arch/arm64/kvm/pkvm.c                         |   1 +
 12 files changed, 546 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/include/nvhe/pkvm.h

Comments

Quentin Perret Oct. 18, 2022, 3:13 p.m. UTC | #1
On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> +/* Maximum number of protected VMs that can be created. */
> +#define KVM_MAX_PVMS 255

Nit: I think that limit will apply to non-protected VMs too, at least
initially, so it'd be worth rewording the comment.

Cheers,
Quentin
Quentin Perret Oct. 18, 2022, 4:21 p.m. UTC | #2
On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> +struct pkvm_hyp_vm {
> +	struct kvm kvm;
> +
> +	/* Backpointer to the host's (untrusted) KVM instance. */
> +	struct kvm *host_kvm;
> +
> +	/*
> +	 * Total amount of memory donated by the host for maintaining
> +	 * this 'struct pkvm_hyp_vm' in the hypervisor.
> +	 */
> +	size_t donated_memory_size;

I think you could get rid of that member. IIUC, all you need to
re-compute it in the teardown path is the number of created vCPUs on
the host, which we should have safely stored in
pkvm_hyp_vm::kvm::created_vcpus.
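
For illustration, the teardown path could then recompute it along these lines
(a sketch only, reusing pkvm_get_hyp_vm_size() and unmap_donated_memory()
from this patch):

  /* Derive the donated size from the number of vCPUs created by the host. */
  size_t vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus);

  unmap_donated_memory(hyp_vm, vm_size);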

> +	/* The guest's stage-2 page-table managed by the hypervisor. */
> +	struct kvm_pgtable pgt;
> +
> +	/*
> +	 * The number of vcpus initialized and ready to run.
> +	 * Modifying this is protected by 'vm_table_lock'.
> +	 */
> +	unsigned int nr_vcpus;
> +
> +	/* Array of the hyp vCPU structures for this VM. */
> +	struct pkvm_hyp_vcpu *vcpus[];
> +};

Cheers,
Quentin
Quentin Perret Oct. 18, 2022, 4:33 p.m. UTC | #3
On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> +void pkvm_hyp_vm_table_init(void *tbl)
> +{
> +	WARN_ON(vm_table);
> +	vm_table = tbl;
> +}

Uh, why does this one need to be exposed outside pkvm.c ?
Quentin Perret Oct. 18, 2022, 4:40 p.m. UTC | #4
On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> +static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
> +{
> +	void *va = (void *)kern_hyp_va(host_va);
> +
> +	if (!PAGE_ALIGNED(va))
> +		return NULL;
> +
> +	if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
> +				   PAGE_ALIGN(size) >> PAGE_SHIFT))
> +		return NULL;
> +
> +	return va;
> +}
> +
> +static void *map_donated_memory(unsigned long host_va, size_t size)
> +{
> +	void *va = map_donated_memory_noclear(host_va, size);
> +
> +	if (va)
> +		memset(va, 0, size);
> +
> +	return va;
> +}
> +
> +static void __unmap_donated_memory(void *va, size_t size)
> +{
> +	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
> +				       PAGE_ALIGN(size) >> PAGE_SHIFT));
> +}
> +
> +static void unmap_donated_memory(void *va, size_t size)
> +{
> +	if (!va)
> +		return;
> +
> +	memset(va, 0, size);
> +	__unmap_donated_memory(va, size);
> +}
> +
> +static void unmap_donated_memory_noclear(void *va, size_t size)
> +{
> +	if (!va)
> +		return;
> +
> +	__unmap_donated_memory(va, size);
> +}

Nit: I'm not a huge fan of the naming here, these do more than just
map/unmap random pages. This only works for host pages, the donation
path has permission checks, etc. Maybe {admit,return}_host_memory()?
Quentin Perret Oct. 18, 2022, 4:45 p.m. UTC | #5
On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> +static int find_free_vm_table_entry(struct kvm *host_kvm)
> +{
> +	int i, ret = -ENOMEM;
> +
> +	for (i = 0; i < KVM_MAX_PVMS; ++i) {
> +		struct pkvm_hyp_vm *vm = vm_table[i];
> +
> +		if (!vm) {
> +			if (ret < 0)
> +				ret = i;
> +			continue;
> +		}
> +
> +		if (unlikely(vm->host_kvm == host_kvm)) {
> +			ret = -EEXIST;
> +			break;
> +		}

That would be funny if the host passed the same struct twice, but do we
care? If the host wants to shoot itself in the foot, it's not our
problem I guess :) ? Also, we don't do the same check for vCPUs so ...
Will Deacon Oct. 19, 2022, 11:57 a.m. UTC | #6
On Tue, Oct 18, 2022 at 04:33:45PM +0000, Quentin Perret wrote:
> On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > +void pkvm_hyp_vm_table_init(void *tbl)
> > +{
> > +	WARN_ON(vm_table);
> > +	vm_table = tbl;
> > +}
> 
> Uh, why does this one need to be exposed outside pkvm.c ?

We need to initialise the table using the memory donated by the host
on the __pkvm_init path. That's all private to nvhe/setup.c, so rather
than expose the raw pointers (of either the table or the donated memory),
we've got this initialisation function instead which is invoked by
__pkvm_init_finalise() on the deprivilege path.
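
Concretely, in this patch that boils down to the following (see the
nvhe/setup.c hunk below):

  /* divide_memory_pool(), on the __pkvm_init path */
  nr_pages = hyp_vm_table_pages();
  vm_table_base = hyp_early_alloc_contig(nr_pages);

  /* __pkvm_init_finalise(), once EL2 owns that memory */
  pkvm_hyp_vm_table_init(vm_table_base);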

Happy to repaint it if you have a patch?

Will
Fuad Tabba Oct. 19, 2022, 12:18 p.m. UTC | #7
Hi,

On Tue, Oct 18, 2022 at 5:45 PM Quentin Perret <qperret@google.com> wrote:
>
> On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > +static int find_free_vm_table_entry(struct kvm *host_kvm)
> > +{
> > +     int i, ret = -ENOMEM;
> > +
> > +     for (i = 0; i < KVM_MAX_PVMS; ++i) {
> > +             struct pkvm_hyp_vm *vm = vm_table[i];
> > +
> > +             if (!vm) {
> > +                     if (ret < 0)
> > +                             ret = i;
> > +                     continue;
> > +             }
> > +
> > +             if (unlikely(vm->host_kvm == host_kvm)) {
> > +                     ret = -EEXIST;
> > +                     break;
> > +             }
>
> That would be funny if the host passed the same struct twice, but do we
> care? If the host wants to shoot itself in the foot, it's not our
> problem I guess :) ? Also, we don't do the same check for vCPUs so ...

You're right, the host can shoot itself in the foot if it wants to.
The reason why we don't have it for vcpus is that that code was
factored out later, and as you said, a similar check isn't necessary.

TLDR: we'll remove it :)
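
For the record, the lookup would then reduce to something like this (just a
sketch; presumably the host_kvm parameter goes away along with the check):

  static int find_free_vm_table_entry(void)
  {
          int i;

          for (i = 0; i < KVM_MAX_PVMS; ++i) {
                  if (!vm_table[i])
                          return i;
          }

          return -ENOMEM;
  }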

Thanks,
/fuad
Will Deacon Oct. 19, 2022, 12:35 p.m. UTC | #8
On Tue, Oct 18, 2022 at 03:13:36PM +0000, Quentin Perret wrote:
> On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > +/* Maximum number of protected VMs that can be created. */
> > +#define KVM_MAX_PVMS 255
> 
> Nit: I think that limit will apply to non-protected VMs too, at least
> initially, so it'd be worth rewording the comment.

Good point, I've changed this to:

  /* Maximum number of VMs that can co-exist under pKVM. */

Will
Will Deacon Oct. 19, 2022, 12:44 p.m. UTC | #9
On Tue, Oct 18, 2022 at 04:40:11PM +0000, Quentin Perret wrote:
> On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > +static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
> > +{
> > +	void *va = (void *)kern_hyp_va(host_va);
> > +
> > +	if (!PAGE_ALIGNED(va))
> > +		return NULL;
> > +
> > +	if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
> > +				   PAGE_ALIGN(size) >> PAGE_SHIFT))
> > +		return NULL;
> > +
> > +	return va;
> > +}
> > +
> > +static void *map_donated_memory(unsigned long host_va, size_t size)
> > +{
> > +	void *va = map_donated_memory_noclear(host_va, size);
> > +
> > +	if (va)
> > +		memset(va, 0, size);
> > +
> > +	return va;
> > +}
> > +
> > +static void __unmap_donated_memory(void *va, size_t size)
> > +{
> > +	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
> > +				       PAGE_ALIGN(size) >> PAGE_SHIFT));
> > +}
> > +
> > +static void unmap_donated_memory(void *va, size_t size)
> > +{
> > +	if (!va)
> > +		return;
> > +
> > +	memset(va, 0, size);
> > +	__unmap_donated_memory(va, size);
> > +}
> > +
> > +static void unmap_donated_memory_noclear(void *va, size_t size)
> > +{
> > +	if (!va)
> > +		return;
> > +
> > +	__unmap_donated_memory(va, size);
> > +}
> 
> Nit: I'm not a huge fan of the naming here, these do more than just
> map/unmap random pages. This only works for host pages, the donation
> path has permission checks, etc. Maybe {admit,return}_host_memory()?

Hmm, so I made this change locally and I found return_host_memory() to be
really hard to read, particularly on error paths where we 'goto' an error
label and have calls to 'return_host_memory' before an actual 'return'
statement.

So I've left it as-is, not because I'm tied to the existing names, but because
I did struggle with your suggestion. At the end of the day, these are static
functions, so it's really easy to change the name later on if we can think
of something better.
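
(Roughly the pattern in question, with a made-up label name:)

  err_return_memory:
          return_host_memory(hyp_vcpu, sizeof(*hyp_vcpu));
          return ret;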

Will
Will Deacon Oct. 19, 2022, 12:45 p.m. UTC | #10
On Tue, Oct 18, 2022 at 04:21:11PM +0000, Quentin Perret wrote:
> On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > +struct pkvm_hyp_vm {
> > +	struct kvm kvm;
> > +
> > +	/* Backpointer to the host's (untrusted) KVM instance. */
> > +	struct kvm *host_kvm;
> > +
> > +	/*
> > +	 * Total amount of memory donated by the host for maintaining
> > +	 * this 'struct pkvm_hyp_vm' in the hypervisor.
> > +	 */
> > +	size_t donated_memory_size;
> 
> I think you could get rid of that member. IIUC, all you need to
> re-compute it in the teardown path is the number of created vCPUs on
> the host, which we should have safely stored in
> pkvm_hyp_vm::kvm::created_vcpus.

Oh, well spotted! I've dropped this as you have suggested.

Will
Quentin Perret Oct. 19, 2022, 1:35 p.m. UTC | #11
On Wednesday 19 Oct 2022 at 12:57:24 (+0100), Will Deacon wrote:
> On Tue, Oct 18, 2022 at 04:33:45PM +0000, Quentin Perret wrote:
> > On Monday 17 Oct 2022 at 12:51:56 (+0100), Will Deacon wrote:
> > > +void pkvm_hyp_vm_table_init(void *tbl)
> > > +{
> > > +	WARN_ON(vm_table);
> > > +	vm_table = tbl;
> > > +}
> > 
> > Uh, why does this one need to be exposed outside pkvm.c ?
> 
> We need to initialise the table using the memory donated by the host
> on the __pkvm_init path. That's all private to nvhe/setup.c, so rather
> than expose the raw pointers (of either the table or the donated memory),
> we've got this initialisation function instead which is invoked by
> __pkvm_init_finalise() on the deprivilege path.
> 
> Happy to repaint it if you have a patch?

I don't; I just got confused, maybe because in an older (possibly quite old)
version of this the table was statically allocated? Anyway, it's all fine
as-is, thanks for the reply.

Patch

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 53035763e48e..de52ba775d48 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -76,6 +76,9 @@  enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
 	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
+	__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
+	__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
+	__KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 45e2136322ba..d3dd7ab9c79e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -115,6 +115,8 @@  struct kvm_smccc_features {
 	unsigned long vendor_hyp_bmap;
 };
 
+typedef unsigned int pkvm_handle_t;
+
 struct kvm_arch {
 	struct kvm_s2_mmu mmu;
 
@@ -166,6 +168,12 @@  struct kvm_arch {
 
 	/* Hypercall features firmware registers' descriptor */
 	struct kvm_smccc_features smccc_feat;
+
+	/*
+	 * For an untrusted host VM, 'pkvm_handle' is used to lookup
+	 * the associated pKVM instance in the hypervisor.
+	 */
+	pkvm_handle_t pkvm_handle;
 };
 
 struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 1b098bd4cd37..4f6d79fe4352 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -288,6 +288,14 @@  u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
  */
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
 
+/**
+ * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
+ * @vtcr:	Content of the VTCR register.
+ *
+ * Return: the size (in bytes) of the stage-2 PGD
+ */
+size_t kvm_pgtable_stage2_pgd_size(u64 vtcr);
+
 /**
  * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
  * @pgt:	Uninitialised page-table structure to initialise.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 8f7b8a2314bb..e22856cf4207 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -9,6 +9,9 @@ 
 #include <linux/memblock.h>
 #include <asm/kvm_pgtable.h>
 
+/* Maximum number of protected VMs that can be created. */
+#define KVM_MAX_PVMS 255
+
 #define HYP_MEMBLOCK_REGIONS 128
 
 extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
@@ -40,6 +43,11 @@  static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
 	return res >> PAGE_SHIFT;
 }
 
+static inline unsigned long hyp_vm_table_pages(void)
+{
+	return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT;
+}
+
 static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
 {
 	unsigned long total = 0, i;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 0a6d3e7f2a43..ce9a796a85ee 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -11,6 +11,7 @@ 
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_pgtable.h>
 #include <asm/virt.h>
+#include <nvhe/pkvm.h>
 #include <nvhe/spinlock.h>
 
 /*
@@ -68,10 +69,12 @@  bool addr_is_memory(phys_addr_t phys);
 int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
 int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
 int kvm_host_prepare_stage2(void *pgt_pool_base);
+int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd);
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 
 int hyp_pin_shared_mem(void *from, void *to);
 void hyp_unpin_shared_mem(void *from, void *to);
+void reclaim_guest_pages(struct pkvm_hyp_vm *vm);
 
 static __always_inline void __load_host_stage2(void)
 {
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
new file mode 100644
index 000000000000..d176399dbfb5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -0,0 +1,64 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google LLC
+ * Author: Fuad Tabba <tabba@google.com>
+ */
+
+#ifndef __ARM64_KVM_NVHE_PKVM_H__
+#define __ARM64_KVM_NVHE_PKVM_H__
+
+#include <asm/kvm_pkvm.h>
+
+/*
+ * Holds the relevant data for maintaining the vcpu state completely at hyp.
+ */
+struct pkvm_hyp_vcpu {
+	struct kvm_vcpu vcpu;
+
+	/* Backpointer to the host's (untrusted) vCPU instance. */
+	struct kvm_vcpu *host_vcpu;
+};
+
+/*
+ * Holds the relevant data for running a protected vm.
+ */
+struct pkvm_hyp_vm {
+	struct kvm kvm;
+
+	/* Backpointer to the host's (untrusted) KVM instance. */
+	struct kvm *host_kvm;
+
+	/*
+	 * Total amount of memory donated by the host for maintaining
+	 * this 'struct pkvm_hyp_vm' in the hypervisor.
+	 */
+	size_t donated_memory_size;
+
+	/* The guest's stage-2 page-table managed by the hypervisor. */
+	struct kvm_pgtable pgt;
+
+	/*
+	 * The number of vcpus initialized and ready to run.
+	 * Modifying this is protected by 'vm_table_lock'.
+	 */
+	unsigned int nr_vcpus;
+
+	/* Array of the hyp vCPU structures for this VM. */
+	struct pkvm_hyp_vcpu *vcpus[];
+};
+
+static inline struct pkvm_hyp_vm *
+pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+	return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm);
+}
+
+void pkvm_hyp_vm_table_init(void *tbl);
+
+int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
+		   unsigned long pgd_hva);
+int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+		     unsigned long vcpu_hva);
+int __pkvm_teardown_vm(pkvm_handle_t handle);
+
+#endif /* __ARM64_KVM_NVHE_PKVM_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 3cea4b6ac23e..b5f3fcfe9135 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -15,6 +15,7 @@ 
 
 #include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
 #include <nvhe/trap_handler.h>
 
 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -191,6 +192,33 @@  static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
 	__pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
 }
 
+static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
+	DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2);
+	DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3);
+
+	host_kvm = kern_hyp_va(host_kvm);
+	cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva);
+}
+
+static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+	DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2);
+	DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3);
+
+	host_vcpu = kern_hyp_va(host_vcpu);
+	cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
+}
+
+static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -220,6 +248,9 @@  static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__vgic_v3_save_aprs),
 	HANDLE_FUNC(__vgic_v3_restore_aprs),
 	HANDLE_FUNC(__pkvm_vcpu_init_traps),
+	HANDLE_FUNC(__pkvm_init_vm),
+	HANDLE_FUNC(__pkvm_init_vcpu),
+	HANDLE_FUNC(__pkvm_teardown_vm),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index bec8306c2392..2ef6aaa21ba5 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -141,6 +141,20 @@  int kvm_host_prepare_stage2(void *pgt_pool_base)
 	return 0;
 }
 
+int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
+{
+	vm->pgt.pgd = pgd;
+	return 0;
+}
+
+void reclaim_guest_pages(struct pkvm_hyp_vm *vm)
+{
+	unsigned long nr_pages;
+
+	nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
+	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages));
+}
+
 int __pkvm_prot_finalize(void)
 {
 	struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu;
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 85d3b7ae720f..7e6f7d2f7713 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -7,6 +7,9 @@ 
 #include <linux/kvm_host.h>
 #include <linux/mm.h>
 #include <nvhe/fixed_config.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/pkvm.h>
 #include <nvhe/trap_handler.h>
 
 /*
@@ -183,3 +186,389 @@  void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
 	pvm_init_traps_aa64mmfr0(vcpu);
 	pvm_init_traps_aa64mmfr1(vcpu);
 }
+
+/*
+ * Start the VM table handle at the offset defined instead of at 0.
+ * Mainly for sanity checking and debugging.
+ */
+#define HANDLE_OFFSET 0x1000
+
+static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
+{
+	return handle - HANDLE_OFFSET;
+}
+
+static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
+{
+	return idx + HANDLE_OFFSET;
+}
+
+/*
+ * Spinlock for protecting state related to the VM table. Protects writes
+ * to 'vm_table' and 'nr_table_entries' as well as reads and writes to
+ * 'last_hyp_vcpu_lookup'.
+ */
+static DEFINE_HYP_SPINLOCK(vm_table_lock);
+
+/*
+ * The table of VM entries for protected VMs in hyp.
+ * Allocated at hyp initialization and setup.
+ */
+static struct pkvm_hyp_vm **vm_table;
+
+void pkvm_hyp_vm_table_init(void *tbl)
+{
+	WARN_ON(vm_table);
+	vm_table = tbl;
+}
+
+/*
+ * Return the hyp vm structure corresponding to the handle.
+ */
+static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+{
+	unsigned int idx = vm_handle_to_idx(handle);
+
+	if (unlikely(idx >= KVM_MAX_PVMS))
+		return NULL;
+
+	return vm_table[idx];
+}
+
+static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
+{
+	if (host_vcpu)
+		hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
+}
+
+static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
+			     unsigned int nr_vcpus)
+{
+	int i;
+
+	for (i = 0; i < nr_vcpus; i++)
+		unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+}
+
+static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
+			     unsigned int nr_vcpus)
+{
+	hyp_vm->host_kvm = host_kvm;
+	hyp_vm->kvm.created_vcpus = nr_vcpus;
+	hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
+}
+
+static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
+			      struct pkvm_hyp_vm *hyp_vm,
+			      struct kvm_vcpu *host_vcpu,
+			      unsigned int vcpu_idx)
+{
+	int ret = 0;
+
+	if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
+		return -EBUSY;
+
+	if (host_vcpu->vcpu_idx != vcpu_idx) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	hyp_vcpu->host_vcpu = host_vcpu;
+
+	hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
+	hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
+	hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+
+	hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
+done:
+	if (ret)
+		unpin_host_vcpu(host_vcpu);
+	return ret;
+}
+
+static int find_free_vm_table_entry(struct kvm *host_kvm)
+{
+	int i, ret = -ENOMEM;
+
+	for (i = 0; i < KVM_MAX_PVMS; ++i) {
+		struct pkvm_hyp_vm *vm = vm_table[i];
+
+		if (!vm) {
+			if (ret < 0)
+				ret = i;
+			continue;
+		}
+
+		if (unlikely(vm->host_kvm == host_kvm)) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Allocate a VM table entry and insert a pointer to the new vm.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
+					   struct pkvm_hyp_vm *hyp_vm,
+					   size_t donated_memory_size)
+{
+	struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+	int idx;
+
+	hyp_assert_lock_held(&vm_table_lock);
+
+	/*
+	 * Initializing protected state might have failed, yet a malicious
+	 * host could trigger this function. Thus, ensure that 'vm_table'
+	 * exists.
+	 */
+	if (unlikely(!vm_table))
+		return -EINVAL;
+
+	idx = find_free_vm_table_entry(host_kvm);
+	if (idx < 0)
+		return idx;
+
+	hyp_vm->kvm.arch.pkvm_handle = idx_to_vm_handle(idx);
+	hyp_vm->donated_memory_size = donated_memory_size;
+
+	/* VMID 0 is reserved for the host */
+	atomic64_set(&mmu->vmid.id, idx + 1);
+
+	mmu->arch = &hyp_vm->kvm.arch;
+	mmu->pgt = &hyp_vm->pgt;
+
+	vm_table[idx] = hyp_vm;
+	return hyp_vm->kvm.arch.pkvm_handle;
+}
+
+/*
+ * Deallocate and remove the VM table entry corresponding to the handle.
+ */
+static void remove_vm_table_entry(pkvm_handle_t handle)
+{
+	hyp_assert_lock_held(&vm_table_lock);
+	vm_table[vm_handle_to_idx(handle)] = NULL;
+}
+
+static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus)
+{
+	return size_add(sizeof(struct pkvm_hyp_vm),
+		size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus));
+}
+
+static void *map_donated_memory_noclear(unsigned long host_va, size_t size)
+{
+	void *va = (void *)kern_hyp_va(host_va);
+
+	if (!PAGE_ALIGNED(va))
+		return NULL;
+
+	if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va),
+				   PAGE_ALIGN(size) >> PAGE_SHIFT))
+		return NULL;
+
+	return va;
+}
+
+static void *map_donated_memory(unsigned long host_va, size_t size)
+{
+	void *va = map_donated_memory_noclear(host_va, size);
+
+	if (va)
+		memset(va, 0, size);
+
+	return va;
+}
+
+static void __unmap_donated_memory(void *va, size_t size)
+{
+	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va),
+				       PAGE_ALIGN(size) >> PAGE_SHIFT));
+}
+
+static void unmap_donated_memory(void *va, size_t size)
+{
+	if (!va)
+		return;
+
+	memset(va, 0, size);
+	__unmap_donated_memory(va, size);
+}
+
+static void unmap_donated_memory_noclear(void *va, size_t size)
+{
+	if (!va)
+		return;
+
+	__unmap_donated_memory(va, size);
+}
+
+/*
+ * Initialize the hypervisor copy of the protected VM state using the
+ * memory donated by the host.
+ *
+ * Unmaps the donated memory from the host at stage 2.
+ *
+ * kvm: A pointer to the host's struct kvm.
+ * vm_hva: The host va of the area being donated for the VM state.
+ *	   Must be page aligned.
+ * pgd_hva: The host va of the area being donated for the stage-2 PGD for
+ *	    the VM. Must be page aligned. Its size is implied by the VM's
+ *	    VTCR.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
+		   unsigned long pgd_hva)
+{
+	struct pkvm_hyp_vm *hyp_vm = NULL;
+	size_t vm_size, pgd_size;
+	unsigned int nr_vcpus;
+	void *pgd = NULL;
+	int ret;
+
+	ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
+	if (ret)
+		return ret;
+
+	nr_vcpus = READ_ONCE(host_kvm->created_vcpus);
+	if (nr_vcpus < 1) {
+		ret = -EINVAL;
+		goto err_unpin_kvm;
+	}
+
+	vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
+	pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
+
+	ret = -ENOMEM;
+
+	hyp_vm = map_donated_memory(vm_hva, vm_size);
+	if (!hyp_vm)
+		goto err_remove_mappings;
+
+	pgd = map_donated_memory_noclear(pgd_hva, pgd_size);
+	if (!pgd)
+		goto err_remove_mappings;
+
+	init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
+
+	hyp_spin_lock(&vm_table_lock);
+	ret = insert_vm_table_entry(host_kvm, hyp_vm, vm_size);
+	if (ret < 0)
+		goto err_unlock;
+
+	ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
+	if (ret)
+		goto err_remove_vm_table_entry;
+	hyp_spin_unlock(&vm_table_lock);
+
+	return hyp_vm->kvm.arch.pkvm_handle;
+
+err_remove_vm_table_entry:
+	remove_vm_table_entry(hyp_vm->kvm.arch.pkvm_handle);
+err_unlock:
+	hyp_spin_unlock(&vm_table_lock);
+err_remove_mappings:
+	unmap_donated_memory(hyp_vm, vm_size);
+	unmap_donated_memory(pgd, pgd_size);
+err_unpin_kvm:
+	hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
+	return ret;
+}
+
+/*
+ * Initialize the hypervisor copy of the protected vCPU state using the
+ * memory donated by the host.
+ *
+ * handle: The handle for the protected vm.
+ * host_vcpu: A pointer to the corresponding host vcpu.
+ * vcpu_hva: The host va of the area being donated for the vcpu state.
+ *	     Must be page aligned. The size of the area must be equal to
+ *	     the page-aligned size of 'struct pkvm_hyp_vcpu'.
+ * Return 0 on success, negative error code on failure.
+ */
+int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
+		     unsigned long vcpu_hva)
+{
+	struct pkvm_hyp_vcpu *hyp_vcpu;
+	struct pkvm_hyp_vm *hyp_vm;
+	unsigned int idx;
+	int ret;
+
+	hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
+	if (!hyp_vcpu)
+		return -ENOMEM;
+
+	hyp_spin_lock(&vm_table_lock);
+
+	hyp_vm = get_vm_by_handle(handle);
+	if (!hyp_vm) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	idx = hyp_vm->nr_vcpus;
+	if (idx >= hyp_vm->kvm.created_vcpus) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
+	if (ret)
+		goto unlock;
+
+	hyp_vm->vcpus[idx] = hyp_vcpu;
+	hyp_vm->nr_vcpus++;
+unlock:
+	hyp_spin_unlock(&vm_table_lock);
+
+	if (ret)
+		unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
+
+	return ret;
+}
+
+int __pkvm_teardown_vm(pkvm_handle_t handle)
+{
+	struct pkvm_hyp_vm *hyp_vm;
+	int err;
+
+	hyp_spin_lock(&vm_table_lock);
+	hyp_vm = get_vm_by_handle(handle);
+	if (!hyp_vm) {
+		err = -ENOENT;
+		goto err_unlock;
+	}
+
+	if (WARN_ON(hyp_page_count(hyp_vm))) {
+		err = -EBUSY;
+		goto err_unlock;
+	}
+
+	/* Ensure the VMID is clean before it can be reallocated */
+	__kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu);
+	remove_vm_table_entry(handle);
+	hyp_spin_unlock(&vm_table_lock);
+
+	/* Reclaim guest pages (including page-table pages) */
+	reclaim_guest_pages(hyp_vm);
+	unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+
+	/* Push the metadata pages to the teardown memcache */
+	hyp_unpin_shared_mem(hyp_vm->host_kvm, hyp_vm->host_kvm + 1);
+
+	unmap_donated_memory(hyp_vm, hyp_vm->donated_memory_size);
+	return 0;
+
+err_unlock:
+	hyp_spin_unlock(&vm_table_lock);
+	return err;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 0312c9c74a5a..2be72fbe7279 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -16,6 +16,7 @@ 
 #include <nvhe/memory.h>
 #include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
 #include <nvhe/trap_handler.h>
 
 unsigned long hyp_nr_cpus;
@@ -24,6 +25,7 @@  unsigned long hyp_nr_cpus;
 			 (unsigned long)__per_cpu_start)
 
 static void *vmemmap_base;
+static void *vm_table_base;
 static void *hyp_pgt_base;
 static void *host_s2_pgt_base;
 static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
@@ -40,6 +42,11 @@  static int divide_memory_pool(void *virt, unsigned long size)
 	if (!vmemmap_base)
 		return -ENOMEM;
 
+	nr_pages = hyp_vm_table_pages();
+	vm_table_base = hyp_early_alloc_contig(nr_pages);
+	if (!vm_table_base)
+		return -ENOMEM;
+
 	nr_pages = hyp_s1_pgtable_pages();
 	hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
 	if (!hyp_pgt_base)
@@ -314,6 +321,7 @@  void __noreturn __pkvm_init_finalise(void)
 	if (ret)
 		goto out;
 
+	pkvm_hyp_vm_table_init(vm_table_base);
 out:
 	/*
 	 * We tail-called to here from handle___pkvm_init() and will not return,
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index cdf8e76b0be1..a1a27f88a312 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1200,6 +1200,15 @@  int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 	return 0;
 }
 
+size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
+{
+	u32 ia_bits = VTCR_EL2_IPA(vtcr);
+	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+}
+
 static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 			      enum kvm_pgtable_walk_flags flag,
 			      void * const arg)
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 34229425b25d..71493136e59c 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -71,6 +71,7 @@  void __init kvm_hyp_reserve(void)
 
 	hyp_mem_pages += hyp_s1_pgtable_pages();
 	hyp_mem_pages += host_s2_pgtable_pages();
+	hyp_mem_pages += hyp_vm_table_pages();
 	hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
 
 	/*