diff mbox series

[13/21] KVM: TDX: Handle TLB tracking for TDX

Message ID 20240904030751.117579-14-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX MMU Part 2 | expand

Commit Message

Rick Edgecombe Sept. 4, 2024, 3:07 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Handle TLB tracking for TDX by introducing function tdx_track() for private
memory TLB tracking and implementing flush_tlb* hooks to flush TLBs for
shared memory.

Introduce function tdx_track() to do TLB tracking on private memory, which
basically does two things: calling TDH.MEM.TRACK to increase TD epoch and
kicking off all vCPUs. The private EPT will then be flushed when each vCPU
re-enters the TD. This function is unused temporarily in this patch and
will be called on a page-by-page basis on removal of private guest page in
a later patch.

In earlier revisions, tdx_track() relied on an atomic counter to coordinate
the synchronization between the actions of kicking off vCPUs, incrementing
the TD epoch, and the vCPUs waiting for the incremented TD epoch after
being kicked off.

However, the core MMU only actually needs to call tdx_track() while
aleady under a write mmu_lock. So this sychnonization can be made to be
unneeded. vCPUs are kicked off only after the successful execution of
TDH.MEM.TRACK, eliminating the need for vCPUs to wait for TDH.MEM.TRACK
completion after being kicked off. tdx_track() is therefore able to send
requests KVM_REQ_OUTSIDE_GUEST_MODE rather than KVM_REQ_TLB_FLUSH.

Hooks for flush_remote_tlb and flush_remote_tlbs_range are not necessary
for TDX, as tdx_track() will handle TLB tracking of private memory on
page-by-page basis when private guest pages are removed. There is no need
to invoke tdx_track() again in kvm_flush_remote_tlbs() even after changes
to the mirrored page table.

For hooks flush_tlb_current and flush_tlb_all, which are invoked during
kvm_mmu_load() and vcpu load for normal VMs, let VMM to flush all EPTs in
the two hooks for simplicity, since TDX does not depend on the two
hooks to notify TDX module to flush private EPT in those cases.

Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
TDX MMU part 2 v1:
 - Split from the big patch "KVM: TDX: TDP MMU TDX support".
 - Modification of synchronization mechanism in tdx_track().
 - Dropped hooks flush_remote_tlb and flush_remote_tlbs_range.
 - Let VMM to flush all EPTs in hooks flush_tlb_all and flush_tlb_current.
 - Dropped KVM_BUG_ON() in vt_flush_tlb_gva(). (Rick)
---
 arch/x86/kvm/vmx/main.c    | 52 ++++++++++++++++++++++++++++++++---
 arch/x86/kvm/vmx/tdx.c     | 55 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  2 ++
 3 files changed, 105 insertions(+), 4 deletions(-)

Comments

Paolo Bonzini Sept. 10, 2024, 8:16 a.m. UTC | #1
On 9/4/24 05:07, Rick Edgecombe wrote:
> +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> +{
> +	/*
> +	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> +	 * private EPT will be flushed on the next TD enter.
> +	 * No need to call tdx_track() here again even when this callback is as
> +	 * a result of zapping private EPT.
> +	 * Just invoke invept() directly here to work for both shared EPT and
> +	 * private EPT.
> +	 */
> +	if (is_td_vcpu(vcpu)) {
> +		ept_sync_global();
> +		return;
> +	}
> +
> +	vmx_flush_tlb_all(vcpu);
> +}
> +
> +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
> +{
> +	if (is_td_vcpu(vcpu)) {
> +		tdx_flush_tlb_current(vcpu);
> +		return;
> +	}
> +
> +	vmx_flush_tlb_current(vcpu);
> +}
> +

I'd do it slightly different:

static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	if (is_td_vcpu(vcpu)) {
		tdx_flush_tlb_all(vcpu);
		return;
	}

	vmx_flush_tlb_all(vcpu);
}

static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
{
	if (is_td_vcpu(vcpu)) {
		/*
		 * flush_tlb_current() is used only the first time for
		 * the vcpu runs, since TDX supports neither shadow
		 * nested paging nor SMM.  Keep this function simple.
		 */
		tdx_flush_tlb_all(vcpu);
		return;
	}

	vmx_flush_tlb_current(vcpu);
}

and put the implementation details close to tdx_track:

void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to
	 * ensure private EPT will be flushed on the next TD enter.
	 * No need to call tdx_track() here again, even when this
	 * callback is a result of zapping private EPT.  Just
	 * invoke invept() directly here, which works for both shared
	 * EPT and private EPT.
	 */
	ept_sync_global();
}
Rick Edgecombe Sept. 10, 2024, 11:49 p.m. UTC | #2
On Tue, 2024-09-10 at 10:16 +0200, Paolo Bonzini wrote:
> 
> I'd do it slightly different:

Fair enough, thanks.
Xu Yilun Sept. 11, 2024, 6:25 a.m. UTC | #3
> +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> +{
> +	/*
> +	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> +	 * private EPT will be flushed on the next TD enter.
> +	 * No need to call tdx_track() here again even when this callback is as
> +	 * a result of zapping private EPT.
> +	 * Just invoke invept() directly here to work for both shared EPT and
> +	 * private EPT.

IIUC, private EPT is already flushed in .remove_private_spte(), so in
theory we don't have to invept() for private EPT?

Thanks,
Yilun

> +	 */
> +	if (is_td_vcpu(vcpu)) {
> +		ept_sync_global();
> +		return;
> +	}
> +
> +	vmx_flush_tlb_all(vcpu);
> +}
Rick Edgecombe Sept. 11, 2024, 5:28 p.m. UTC | #4
On Wed, 2024-09-11 at 14:25 +0800, Xu Yilun wrote:
> > +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> > +{
> > +       /*
> > +        * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> > +        * private EPT will be flushed on the next TD enter.
> > +        * No need to call tdx_track() here again even when this callback is
> > as
> > +        * a result of zapping private EPT.
> > +        * Just invoke invept() directly here to work for both shared EPT
> > and
> > +        * private EPT.
> 
> IIUC, private EPT is already flushed in .remove_private_spte(), so in
> theory we don't have to invept() for private EPT?

I think you are talking about the comment, and not an optimization. So changing:
"Just invoke invept() directly here to work for both shared EPT and private EPT"
to just "Just invoke invept() directly here to work for shared EPT".

Seems good to me.
Yan Zhao Sept. 12, 2024, 4:54 a.m. UTC | #5
On Thu, Sep 12, 2024 at 01:28:18AM +0800, Edgecombe, Rick P wrote:
> On Wed, 2024-09-11 at 14:25 +0800, Xu Yilun wrote:
> > > +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> > > +{
> > > +       /*
> > > +        * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> > > +        * private EPT will be flushed on the next TD enter.
> > > +        * No need to call tdx_track() here again even when this callback is
> > > as
> > > +        * a result of zapping private EPT.
> > > +        * Just invoke invept() directly here to work for both shared EPT
> > > and
> > > +        * private EPT.
> > 
> > IIUC, private EPT is already flushed in .remove_private_spte(), so in
> > theory we don't have to invept() for private EPT?
> 
> I think you are talking about the comment, and not an optimization. So changing:
> "Just invoke invept() directly here to work for both shared EPT and private EPT"
> to just "Just invoke invept() directly here to work for shared EPT".
> 
> Seems good to me.
Hmm, what about just adding
"Due to the lack of context within this callback function, it cannot
  determine which EPT has been affected by zapping."?

as blow:

"TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
private EPT will be flushed on the next TD enter.
No need to call tdx_track() here again even when this callback is
as a result of zapping private EPT.

Due to the lack of context within this callback function, it cannot
determine which EPT has been affected by zapping.
Just invoke invept() directly here to work for both shared EPT and
private EPT for simplicity."
Xu Yilun Sept. 12, 2024, 7:47 a.m. UTC | #6
On Wed, Sep 11, 2024 at 05:28:18PM +0000, Edgecombe, Rick P wrote:
> On Wed, 2024-09-11 at 14:25 +0800, Xu Yilun wrote:
> > > +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> > > +{
> > > +       /*
> > > +        * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> > > +        * private EPT will be flushed on the next TD enter.
> > > +        * No need to call tdx_track() here again even when this callback is
> > > as
> > > +        * a result of zapping private EPT.
> > > +        * Just invoke invept() directly here to work for both shared EPT
> > > and
> > > +        * private EPT.
> > 
> > IIUC, private EPT is already flushed in .remove_private_spte(), so in
> > theory we don't have to invept() for private EPT?
> 
> I think you are talking about the comment, and not an optimization. So changing:

Yes, just the comment.

> "Just invoke invept() directly here to work for both shared EPT and private EPT"
> to just "Just invoke invept() directly here to work for shared EPT".

Maybe also remind invept() is redundant for private EPT in some cases,
but we implement like this for simplicity.

Thanks,
Yilun

> 
> Seems good to me.
Rick Edgecombe Sept. 12, 2024, 2:44 p.m. UTC | #7
On Thu, 2024-09-12 at 12:54 +0800, Yan Zhao wrote:
> "TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> private EPT will be flushed on the next TD enter.
> No need to call tdx_track() here again even when this callback is
> as a result of zapping private EPT.
> 
> Due to the lack of context within this callback function, it cannot
> determine which EPT has been affected by zapping.
> Just invoke invept() directly here to work for both shared EPT and
> private EPT for simplicity."

Yes, I think agree this is better.
Yan Zhao Oct. 14, 2024, 6:34 a.m. UTC | #8
On Tue, Sep 10, 2024 at 10:16:27AM +0200, Paolo Bonzini wrote:
> On 9/4/24 05:07, Rick Edgecombe wrote:
> > +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> > +{
> > +	/*
> > +	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
> > +	 * private EPT will be flushed on the next TD enter.
> > +	 * No need to call tdx_track() here again even when this callback is as
> > +	 * a result of zapping private EPT.
> > +	 * Just invoke invept() directly here to work for both shared EPT and
> > +	 * private EPT.
> > +	 */
> > +	if (is_td_vcpu(vcpu)) {
> > +		ept_sync_global();
> > +		return;
> > +	}
> > +
> > +	vmx_flush_tlb_all(vcpu);
> > +}
> > +
> > +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
> > +{
> > +	if (is_td_vcpu(vcpu)) {
> > +		tdx_flush_tlb_current(vcpu);
> > +		return;
> > +	}
> > +
> > +	vmx_flush_tlb_current(vcpu);
> > +}
> > +
> 
> I'd do it slightly different:
> 
> static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
> {
> 	if (is_td_vcpu(vcpu)) {
> 		tdx_flush_tlb_all(vcpu);
> 		return;
> 	}
> 
> 	vmx_flush_tlb_all(vcpu);
> }
Thanks!
This is better.

> 
> static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
> {
> 	if (is_td_vcpu(vcpu)) {
> 		/*
> 		 * flush_tlb_current() is used only the first time for
> 		 * the vcpu runs, since TDX supports neither shadow
> 		 * nested paging nor SMM.  Keep this function simple.
> 		 */
> 		tdx_flush_tlb_all(vcpu);
Could we still keep tdx_flush_tlb_current()?
Though both tdx_flush_tlb_all() and tdx_flush_tlb_current() simply invoke
ept_sync_global(), their purposes are different:

- The ept_sync_global() in tdx_flush_tlb_current() is intended to avoid
  retrieving private EPTP required for the single-context invalidation for
  shared EPT;
- while the ept_sync_global() in tdx_flush_tlb_all() is right for shared EPT.

Adding a tdx_flush_tlb_current() can help document the differences in tdx.c.

like this:

void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
        /*
         * flush_tlb_current() is invoked when the first time for the vcpu to
         * run or when root of shared EPT is invalidated.
         * KVM only needs to flush the TLB for shared EPT because the TDX module
         * handles TLB invalidation for private EPT in tdh_vp_enter();
         *
         * A single context invalidation for shared EPT can be performed here.
         * However, this single context invalidation requires the private EPTP
         * rather than the shared EPTP to flush TLB for shared EPT, as shared
         * EPT uses private EPTP as its ASID for TLB invalidation.
         *
         * To avoid reading back private EPTP, perform a global invalidation for
         * shared EPT instead to keep this function simple.
         */
        ept_sync_global();
}

void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        /*
         * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
         * ensure that private EPT will be flushed on the next TD enter. No need
         * to call tdx_track() here again even when this callback is a result of
         * zapping private EPT.
         *
         * Due to the lack of the context to determine which EPT has been
         * affected by zapping, invoke invept() directly here for both shared
         * EPT and private EPT for simplicity, though it's not necessary for
         * private EPT.          *
         */
        ept_sync_global();
}



> 		return;
> 	}
> 
> 	vmx_flush_tlb_current(vcpu);
> }
> 

> and put the implementation details close to tdx_track:
> void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
> {
> 	/*
> 	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to
> 	 * ensure private EPT will be flushed on the next TD enter.
> 	 * No need to call tdx_track() here again, even when this
> 	 * callback is a result of zapping private EPT.  Just
> 	 * invoke invept() directly here, which works for both shared
> 	 * EPT and private EPT.
> 	 */
> 	ept_sync_global();
> }
Got it!
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2cc29d0fc279..1c86849680a3 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -101,6 +101,50 @@  static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmx_vcpu_reset(vcpu, init_event);
 }
 
+static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * TDX calls tdx_track() in tdx_sept_remove_private_spte() to ensure
+	 * private EPT will be flushed on the next TD enter.
+	 * No need to call tdx_track() here again even when this callback is as
+	 * a result of zapping private EPT.
+	 * Just invoke invept() directly here to work for both shared EPT and
+	 * private EPT.
+	 */
+	if (is_td_vcpu(vcpu)) {
+		ept_sync_global();
+		return;
+	}
+
+	vmx_flush_tlb_all(vcpu);
+}
+
+static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+	if (is_td_vcpu(vcpu)) {
+		tdx_flush_tlb_current(vcpu);
+		return;
+	}
+
+	vmx_flush_tlb_current(vcpu);
+}
+
+static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_flush_tlb_gva(vcpu, addr);
+}
+
+static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_flush_tlb_guest(vcpu);
+}
+
 static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			int pgd_level)
 {
@@ -190,10 +234,10 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 	.set_rflags = vmx_set_rflags,
 	.get_if_flag = vmx_get_if_flag,
 
-	.flush_tlb_all = vmx_flush_tlb_all,
-	.flush_tlb_current = vmx_flush_tlb_current,
-	.flush_tlb_gva = vmx_flush_tlb_gva,
-	.flush_tlb_guest = vmx_flush_tlb_guest,
+	.flush_tlb_all = vt_flush_tlb_all,
+	.flush_tlb_current = vt_flush_tlb_current,
+	.flush_tlb_gva = vt_flush_tlb_gva,
+	.flush_tlb_guest = vt_flush_tlb_guest,
 
 	.vcpu_pre_run = vmx_vcpu_pre_run,
 	.vcpu_run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9da71782660f..6feb3ab96926 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -6,6 +6,7 @@ 
 #include "mmu.h"
 #include "tdx.h"
 #include "tdx_ops.h"
+#include "vmx.h"
 #include "mmu/spte.h"
 
 #undef pr_fmt
@@ -446,6 +447,51 @@  void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
 }
 
+/*
+ * Ensure shared and private EPTs to be flushed on all vCPUs.
+ * tdh_mem_track() is the only caller that increases TD epoch. An increase in
+ * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+ * running in guest mode with the value "N - 1".
+ *
+ * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
+ * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
+ * being increased to "N + 1".
+ *
+ * Kicking off all vCPUs after that further results in no vCPUs can run in guest
+ * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
+ * to increase TD epoch to "N + 2").
+ *
+ * TDX module will flush EPT on the next TD enter and make vCPUs to run in
+ * guest mode with TD epoch value "N + 1".
+ *
+ * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+ * waiting empty IPI handler ack_kick().
+ *
+ * No action is required to the vCPUs being kicked off since the kicking off
+ * occurs certainly after TD epoch increment and before the next
+ * tdh_mem_track().
+ */
+static void __always_unused tdx_track(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	u64 err;
+
+	/* If TD isn't finalized, it's before any vcpu running. */
+	if (unlikely(!is_td_finalized(kvm_tdx)))
+		return;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	do {
+		err = tdh_mem_track(kvm_tdx);
+	} while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
+
+	if (KVM_BUG_ON(err, kvm))
+		pr_tdx_error(TDH_MEM_TRACK, err);
+
+	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 {
 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
@@ -947,6 +993,15 @@  static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	return ret;
 }
 
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * flush_tlb_current() is used only the first time for the vcpu to run.
+	 * As it isn't performance critical, keep this function simple.
+	 */
+	ept_sync_global();
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index dcf2b36efbb9..28fda93f0b27 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -131,6 +131,7 @@  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 
 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
 
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 #else
 static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
@@ -145,6 +146,7 @@  static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
 
 static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
 
+static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
 static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
 #endif