
[4/7] KVM: TDX: Kick off vCPUs when SEAMCALL is busy during TD page removal

Message ID 20250113021250.18948-1-yan.y.zhao@intel.com (mailing list archive)
State New, archived
Series KVM: TDX SEPT SEAMCALL retry

Commit Message

Yan Zhao Jan. 13, 2025, 2:12 a.m. UTC
Kick off all vCPUs and prevent tdh_vp_enter() from executing whenever
tdh_mem_range_block()/tdh_mem_track()/tdh_mem_page_remove() encounters
contention, since the page removal path does not expect errors and is less
sensitive to the performance penalty caused by kicking off vCPUs.

Although KVM protects the SEPT zap-related SEAMCALLs with kvm->mmu_lock,
KVM may still encounter TDX_OPERAND_BUSY due to contention within the TDX
module.
- tdh_mem_track() may contend with tdh_vp_enter().
- tdh_mem_range_block()/tdh_mem_page_remove() may contend with
  tdh_vp_enter() and TDCALLs.

Resources     SHARED users      EXCLUSIVE users
------------------------------------------------------------
TDCS epoch    tdh_vp_enter      tdh_mem_track
------------------------------------------------------------
SEPT tree  tdh_mem_page_remove  tdh_vp_enter (0-step mitigation)
                                tdh_mem_range_block
------------------------------------------------------------
SEPT entry                      tdh_mem_range_block (Host lock)
                                tdh_mem_page_remove (Host lock)
                                tdg_mem_page_accept (Guest lock)
                                tdg_mem_page_attr_rd (Guest lock)
                                tdg_mem_page_attr_wr (Guest lock)

Use a TDX-specific per-VM flag wait_for_sept_zap along with
KVM_REQ_OUTSIDE_GUEST_MODE to kick off vCPUs and prevent them from entering
the TD, thereby avoiding the potential contention. Apply the kick-off and the
TD-entry block only after a SEAMCALL returns busy, to minimize the window
during which no vCPU can enter the TD, as contention due to 0-step mitigation
or TDCALLs is expected to be rare.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 arch/x86/kvm/vmx/tdx.c | 62 ++++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/vmx/tdx.h |  7 +++++
 2 files changed, 60 insertions(+), 9 deletions(-)
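
For orientation, the core of the mechanism described above, condensed from
the diff at the bottom of this page (a sketch, not the complete change), is:

static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
	/* Flag checked by tdx_vcpu_run() before every TD entry. */
	WRITE_ONCE(to_kvm_tdx(kvm)->wait_for_sept_zap, true);

	/* Kick all vCPUs out of guest mode so they observe the flag. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
{
	WRITE_ONCE(to_kvm_tdx(kvm)->wait_for_sept_zap, false);
}

In tdx_vcpu_run(), TD entry is declined while a zap retry is in flight:

	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
		return EXIT_FASTPATH_EXIT_HANDLED;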

Comments

Binbin Wu Jan. 16, 2025, 6:23 a.m. UTC | #1
On 1/13/2025 10:12 AM, Yan Zhao wrote:
[...]
> +
>   /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
>   static int __tdx_reclaim_page(hpa_t pa)
>   {
> @@ -979,6 +999,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>   		return EXIT_FASTPATH_NONE;
>   	}
>   
> +	/*
> +	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
> +	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
> +	 * TDCALLs.
> +	 */
> +	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
> +		return EXIT_FASTPATH_EXIT_HANDLED;
> +
>   	trace_kvm_entry(vcpu, force_immediate_exit);
>   
>   	if (pi_test_on(&tdx->pi_desc)) {
> @@ -1647,15 +1675,23 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
>   	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
>   		return -EINVAL;
>   
> -	do {
> +	/*
> +	 * When zapping private page, write lock is held. So no race condition
> +	 * with other vcpu sept operation.
> +	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
> +	 */
> +	err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
> +				  &level_state);
> +	if ((err & TDX_OPERAND_BUSY)) {

It is not safe to use "err & TDX_OPERAND_BUSY".
E.g., if the error is TDX_EPT_WALK_FAILED, "err & TDX_OPERAND_BUSY" will be true.

Maybe you can add a helper to check it.

static inline bool tdx_operand_busy(u64 err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}
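
With such a helper, the call sites could then read (illustrative sketch only,
not from the posted patch):

	err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
				  &level_state);
	if (unlikely(tdx_operand_busy(err))) {
		/* Block TD entry, retry once, then allow TD entry again. */
		tdx_no_vcpus_enter_start(kvm);
		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
					  &level_state);
		tdx_no_vcpus_enter_stop(kvm);
	}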


>   		/*
> -		 * When zapping private page, write lock is held. So no race
> -		 * condition with other vcpu sept operation.  Race only with
> -		 * TDH.VP.ENTER.
> +		 * The second retry is expected to succeed after kicking off all
> +		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
>   		 */
> +		tdx_no_vcpus_enter_start(kvm);
>   		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
>   					  &level_state);
> -	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
> +		tdx_no_vcpus_enter_stop(kvm);
> +	}
>   
>   	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
>   		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
> @@ -1726,8 +1762,12 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
>   	WARN_ON_ONCE(level != PG_LEVEL_4K);
>   
>   	err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> -	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
> -		return -EAGAIN;
> +	if (unlikely(err & TDX_OPERAND_BUSY)) {
Ditto.

> +		/* After no vCPUs enter, the second retry is expected to succeed */
> +		tdx_no_vcpus_enter_start(kvm);
> +		err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> +		tdx_no_vcpus_enter_stop(kvm);
> +	}
>   	if (KVM_BUG_ON(err, kvm)) {
>   		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
>   		return -EIO;
> @@ -1770,9 +1810,13 @@ static void tdx_track(struct kvm *kvm)
>   
>   	lockdep_assert_held_write(&kvm->mmu_lock);
>   
> -	do {
> +	err = tdh_mem_track(kvm_tdx->tdr_pa);
> +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
> +		/* After no vCPUs enter, the second retry is expected to succeed */
> +		tdx_no_vcpus_enter_start(kvm);
>   		err = tdh_mem_track(kvm_tdx->tdr_pa);
> -	} while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
> +		tdx_no_vcpus_enter_stop(kvm);
> +	}
>   
>
[...]
Binbin Wu Jan. 16, 2025, 6:28 a.m. UTC | #2
On 1/16/2025 2:23 PM, Binbin Wu wrote:
>
>
>
> On 1/13/2025 10:12 AM, Yan Zhao wrote:
> [...]
>> +
>>   /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
>>   static int __tdx_reclaim_page(hpa_t pa)
>>   {
>> @@ -979,6 +999,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
>>           return EXIT_FASTPATH_NONE;
>>       }
>>   +    /*
>> +     * Wait until retry of SEPT-zap-related SEAMCALL completes before
>> +     * allowing vCPU entry to avoid contention with tdh_vp_enter() and
>> +     * TDCALLs.
>> +     */
>> +    if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
>> +        return EXIT_FASTPATH_EXIT_HANDLED;
>> +
>>       trace_kvm_entry(vcpu, force_immediate_exit);
>>         if (pi_test_on(&tdx->pi_desc)) {
>> @@ -1647,15 +1675,23 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
>>       if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
>>           return -EINVAL;
>>   -    do {
>> +    /*
>> +     * When zapping private page, write lock is held. So no race condition
>> +     * with other vcpu sept operation.
>> +     * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
>> +     */
>> +    err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
>> +                  &level_state);
>> +    if ((err & TDX_OPERAND_BUSY)) {
>
> It is not safe to use "err & TDX_OPERAND_BUSY".
> E.g., if the error is TDX_EPT_WALK_FAILED, "err & TDX_OPERAND_BUSY" will be true.
>
> Maybe you can add a helper to check it.
>
> static inline bool tdx_operand_busy(u64 err)
> {
> 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
> }
>
Don't know why some spaces were dropped by thunderbird. :-(

>
>>           /*
>> -         * When zapping private page, write lock is held. So no race
>> -         * condition with other vcpu sept operation.  Race only with
>> -         * TDH.VP.ENTER.
>> +         * The second retry is expected to succeed after kicking off all
>> +         * other vCPUs and prevent them from invoking TDH.VP.ENTER.
>>            */
>> +        tdx_no_vcpus_enter_start(kvm);
>>           err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
>>                         &level_state);
>> -    } while (unlikely(err == TDX_ERROR_SEPT_BUSY));
>> +        tdx_no_vcpus_enter_stop(kvm);
>> +    }
>>         if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
>>                err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
>> @@ -1726,8 +1762,12 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
>>       WARN_ON_ONCE(level != PG_LEVEL_4K);
>>         err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
>> -    if (unlikely(err == TDX_ERROR_SEPT_BUSY))
>> -        return -EAGAIN;
>> +    if (unlikely(err & TDX_OPERAND_BUSY)) {
> Ditto.
>
>> +        /* After no vCPUs enter, the second retry is expected to succeed */
>> +        tdx_no_vcpus_enter_start(kvm);
>> +        err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
>> +        tdx_no_vcpus_enter_stop(kvm);
>> +    }
>>       if (KVM_BUG_ON(err, kvm)) {
>>           pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
>>           return -EIO;
>> @@ -1770,9 +1810,13 @@ static void tdx_track(struct kvm *kvm)
>>         lockdep_assert_held_write(&kvm->mmu_lock);
>>   -    do {
>> +    err = tdh_mem_track(kvm_tdx->tdr_pa);
>> +    if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
>> +        /* After no vCPUs enter, the second retry is expected to succeed */
>> +        tdx_no_vcpus_enter_start(kvm);
>>           err = tdh_mem_track(kvm_tdx->tdr_pa);
>> -    } while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
>> +        tdx_no_vcpus_enter_stop(kvm);
>> +    }
>>
> [...]
>
Yan Zhao Jan. 16, 2025, 8:18 a.m. UTC | #3
On Thu, Jan 16, 2025 at 02:23:24PM +0800, Binbin Wu wrote:
> 
> 
> 
> On 1/13/2025 10:12 AM, Yan Zhao wrote:
> [...]
> > +
> >   /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
> >   static int __tdx_reclaim_page(hpa_t pa)
> >   {
> > @@ -979,6 +999,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
> >   		return EXIT_FASTPATH_NONE;
> >   	}
> > +	/*
> > +	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
> > +	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
> > +	 * TDCALLs.
> > +	 */
> > +	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
> > +		return EXIT_FASTPATH_EXIT_HANDLED;
> > +
> >   	trace_kvm_entry(vcpu, force_immediate_exit);
> >   	if (pi_test_on(&tdx->pi_desc)) {
> > @@ -1647,15 +1675,23 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> >   	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
> >   		return -EINVAL;
> > -	do {
> > +	/*
> > +	 * When zapping private page, write lock is held. So no race condition
> > +	 * with other vcpu sept operation.
> > +	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
> > +	 */
> > +	err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
> > +				  &level_state);
> > +	if ((err & TDX_OPERAND_BUSY)) {
> 
> It is not safe to use "err & TDX_OPERAND_BUSY".
> E.g., if the error is TDX_EPT_WALK_FAILED, "err & TDX_OPERAND_BUSY" will be true.
> 
> Maybe you can add a helper to check it.
> 
> static inline bool tdx_operand_busy(u64 err)
> {
> 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
> }
> 
Good catch!
Thanks!

> 
> >   		/*
> > -		 * When zapping private page, write lock is held. So no race
> > -		 * condition with other vcpu sept operation.  Race only with
> > -		 * TDH.VP.ENTER.
> > +		 * The second retry is expected to succeed after kicking off all
> > +		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
> >   		 */
> > +		tdx_no_vcpus_enter_start(kvm);
> >   		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
> >   					  &level_state);
> > -	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> >   	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
> >   		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
> > @@ -1726,8 +1762,12 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
> >   	WARN_ON_ONCE(level != PG_LEVEL_4K);
> >   	err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> > -	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
> > -		return -EAGAIN;
> > +	if (unlikely(err & TDX_OPERAND_BUSY)) {
> Ditto.
> 
> > +		/* After no vCPUs enter, the second retry is expected to succeed */
> > +		tdx_no_vcpus_enter_start(kvm);
> > +		err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> >   	if (KVM_BUG_ON(err, kvm)) {
> >   		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
> >   		return -EIO;
> > @@ -1770,9 +1810,13 @@ static void tdx_track(struct kvm *kvm)
> >   	lockdep_assert_held_write(&kvm->mmu_lock);
> > -	do {
> > +	err = tdh_mem_track(kvm_tdx->tdr_pa);
> > +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
> > +		/* After no vCPUs enter, the second retry is expected to succeed */
> > +		tdx_no_vcpus_enter_start(kvm);
> >   		err = tdh_mem_track(kvm_tdx->tdr_pa);
> > -	} while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> > 
> [...]

Patch

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index bb9d914765fc..09677a4cd605 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -312,6 +312,26 @@  static void tdx_clear_page(unsigned long page_pa)
 	__mb();
 }
 
+static void tdx_no_vcpus_enter_start(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
+
+	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
+}
+
 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
 static int __tdx_reclaim_page(hpa_t pa)
 {
@@ -979,6 +999,14 @@  fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 		return EXIT_FASTPATH_NONE;
 	}
 
+	/*
+	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
+	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+	 * TDCALLs.
+	 */
+	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+		return EXIT_FASTPATH_EXIT_HANDLED;
+
 	trace_kvm_entry(vcpu, force_immediate_exit);
 
 	if (pi_test_on(&tdx->pi_desc)) {
@@ -1647,15 +1675,23 @@  static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
 		return -EINVAL;
 
-	do {
+	/*
+	 * When zapping private page, write lock is held. So no race condition
+	 * with other vcpu sept operation.
+	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+	 */
+	err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
+				  &level_state);
+	if ((err & TDX_OPERAND_BUSY)) {
 		/*
-		 * When zapping private page, write lock is held. So no race
-		 * condition with other vcpu sept operation.  Race only with
-		 * TDH.VP.ENTER.
+		 * The second retry is expected to succeed after kicking off all
+		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
 		 */
+		tdx_no_vcpus_enter_start(kvm);
 		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
 					  &level_state);
-	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
+		tdx_no_vcpus_enter_stop(kvm);
+	}
 
 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
 		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
@@ -1726,8 +1762,12 @@  static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 	WARN_ON_ONCE(level != PG_LEVEL_4K);
 
 	err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
-	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
-		return -EAGAIN;
+	if (unlikely(err & TDX_OPERAND_BUSY)) {
+		/* After no vCPUs enter, the second retry is expected to succeed */
+		tdx_no_vcpus_enter_start(kvm);
+		err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
+		tdx_no_vcpus_enter_stop(kvm);
+	}
 	if (KVM_BUG_ON(err, kvm)) {
 		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
 		return -EIO;
@@ -1770,9 +1810,13 @@  static void tdx_track(struct kvm *kvm)
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
-	do {
+	err = tdh_mem_track(kvm_tdx->tdr_pa);
+	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
+		/* After no vCPUs enter, the second retry is expected to succeed */
+		tdx_no_vcpus_enter_start(kvm);
 		err = tdh_mem_track(kvm_tdx->tdr_pa);
-	} while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
+		tdx_no_vcpus_enter_stop(kvm);
+	}
 
 	if (KVM_BUG_ON(err, kvm))
 		pr_tdx_error(TDH_MEM_TRACK, err);
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 0833d1084331..e369a6f8721b 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -48,6 +48,13 @@  struct kvm_tdx {
 
 	/* For KVM_TDX_INIT_MEM_REGION. */
 	atomic64_t nr_premapped;
+
+	/*
+	 * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
+	 * not contend with tdh_vp_enter() and TDCALLs.
+	 * Set/unset is protected with kvm->mmu_lock.
+	 */
+	bool wait_for_sept_zap;
 };
 
 /* TDX module vCPU states */