diff mbox series

[RFC,2/2] arch/powerpc/kvm: Fix doorbells for nested KVM guests on PowerNV

Message ID 20240627180342.110238-3-gautam@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series Fix doorbell emulation for nested KVM guests in V1 API | expand

Commit Message

Gautam Menghani June 27, 2024, 6:03 p.m. UTC
commit 6398326b9ba1("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
introduced an optimization to use only vcpu->doorbell_request for SMT
emulation for Power9 and above guests, but the code for nested guests 
still relies on the old way of handling doorbells, due to which an L2
guest cannot be booted with XICS with SMT>1. The command to repro
this issue is:

qemu-system-ppc64 \
	-drive file=rhel.qcow2,format=qcow2 \
	-m 20G \
	-smp 8,cores=1,threads=8 \
	-cpu  host \
	-nographic \
	-machine pseries,ic-mode=xics -accel kvm

Fix the plumbing to utilize vcpu->doorbell_request instead of vcore->dpdes 
on P9 and above.

Fixes: 6398326b9ba1 ("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
Signed-off-by: Gautam Menghani <gautam@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv.c        |  9 ++++++++-
 arch/powerpc/kvm/book3s_hv_nested.c | 20 ++++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

Comments

Nicholas Piggin July 4, 2024, 12:10 p.m. UTC | #1
On Fri Jun 28, 2024 at 4:03 AM AEST, Gautam Menghani wrote:
> commit 6398326b9ba1("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
> introduced an optimization to use only vcpu->doorbell_request for SMT
> emulation for Power9 and above guests, but the code for nested guests 
> still relies on the old way of handling doorbells, due to which an L2
> guest cannot be booted with XICS with SMT>1. The command to repro
> this issue is:
>
> qemu-system-ppc64 \
> 	-drive file=rhel.qcow2,format=qcow2 \
> 	-m 20G \
> 	-smp 8,cores=1,threads=8 \
> 	-cpu  host \
> 	-nographic \
> 	-machine pseries,ic-mode=xics -accel kvm
>
> Fix the plumbing to utilize vcpu->doorbell_request instead of vcore->dpdes 
> on P9 and above.
>
> Fixes: 6398326b9ba1 ("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
> Signed-off-by: Gautam Menghani <gautam@linux.ibm.com>
> ---
>  arch/powerpc/kvm/book3s_hv.c        |  9 ++++++++-
>  arch/powerpc/kvm/book3s_hv_nested.c | 20 ++++++++++++++++----
>  2 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index cea28ac05923..0586fa636707 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4178,6 +4178,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
>  	}
>  	hvregs.hdec_expiry = time_limit;
>  
> +	// clear doorbell bit as hvregs already has the info
> +	vcpu->arch.doorbell_request = 0;
> +
>  	/*
>  	 * When setting DEC, we must always deal with irq_work_raise
>  	 * via NMI vs setting DEC. The problem occurs right as we
> @@ -4694,6 +4697,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
>  	struct kvm_nested_guest *nested = vcpu->arch.nested;
>  	unsigned long flags;
>  	u64 tb;
> +	bool doorbell_pending;
>  
>  	trace_kvmppc_run_vcpu_enter(vcpu);
>  
> @@ -4752,6 +4756,9 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
>  	 */
>  	smp_mb();
>  
> +	doorbell_pending = !cpu_has_feature(CPU_FTR_ARCH_300) &&
> +				vcpu->arch.doorbell_request;

Hmm... is the feature test flipped here?

> +
>  	if (!nested) {
>  		kvmppc_core_prepare_to_enter(vcpu);
>  		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
> @@ -4769,7 +4776,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
>  				lpcr |= LPCR_MER;
>  		}
>  	} else if (vcpu->arch.pending_exceptions ||
> -		   vcpu->arch.doorbell_request ||
> +		   doorbell_pending ||
>  		   xive_interrupt_pending(vcpu)) {
>  		vcpu->arch.ret = RESUME_HOST;
>  		goto out;
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 05f5220960c6..b34eefa6b268 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -32,7 +32,10 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  
>  	hr->pcr = vc->pcr | PCR_MASK;
> -	hr->dpdes = vc->dpdes;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		hr->dpdes = vcpu->arch.doorbell_request;
> +	else
> +		hr->dpdes = vc->dpdes;
>  	hr->hfscr = vcpu->arch.hfscr;
>  	hr->tb_offset = vc->tb_offset;
>  	hr->dawr0 = vcpu->arch.dawr0;

Great find.

Nested is all POWER9 and later only, so I think you can just
change to using doorbell_request always.

And probably don't have to do anything for book3s_hv.c unless
I'm mistaken about the feature test.

Thanks,
Nick

> @@ -105,7 +108,10 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu,
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  
> -	hr->dpdes = vc->dpdes;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		hr->dpdes = vcpu->arch.doorbell_request;
> +	else
> +		hr->dpdes = vc->dpdes;
>  	hr->purr = vcpu->arch.purr;
>  	hr->spurr = vcpu->arch.spurr;
>  	hr->ic = vcpu->arch.ic;
> @@ -143,7 +149,10 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  
>  	vc->pcr = hr->pcr | PCR_MASK;
> -	vc->dpdes = hr->dpdes;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		vcpu->arch.doorbell_request = hr->dpdes;
> +	else
> +		vc->dpdes = hr->dpdes;
>  	vcpu->arch.hfscr = hr->hfscr;
>  	vcpu->arch.dawr0 = hr->dawr0;
>  	vcpu->arch.dawrx0 = hr->dawrx0;
> @@ -170,7 +179,10 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  
> -	vc->dpdes = hr->dpdes;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300) && !vcpu->arch.doorbell_request)
> +		vcpu->arch.doorbell_request = hr->dpdes;
> +	else
> +		vc->dpdes = hr->dpdes;
>  	vcpu->arch.hfscr = hr->hfscr;
>  	vcpu->arch.purr = hr->purr;
>  	vcpu->arch.spurr = hr->spurr;
Gautam Menghani Oct. 30, 2024, 1:33 p.m. UTC | #2
On Thu, Jul 04, 2024 at 10:10:05PM +1000, Nicholas Piggin wrote:
> On Fri Jun 28, 2024 at 4:03 AM AEST, Gautam Menghani wrote:
> > commit 6398326b9ba1("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
> > introduced an optimization to use only vcpu->doorbell_request for SMT
> > emulation for Power9 and above guests, but the code for nested guests 
> > still relies on the old way of handling doorbells, due to which an L2
> > guest cannot be booted with XICS with SMT>1. The command to repro
> > this issue is:
> >
> > qemu-system-ppc64 \
> > 	-drive file=rhel.qcow2,format=qcow2 \
> > 	-m 20G \
> > 	-smp 8,cores=1,threads=8 \
> > 	-cpu  host \
> > 	-nographic \
> > 	-machine pseries,ic-mode=xics -accel kvm
> >
> > Fix the plumbing to utilize vcpu->doorbell_request instead of vcore->dpdes 
> > on P9 and above.
> >
> > Fixes: 6398326b9ba1 ("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")
> > Signed-off-by: Gautam Menghani <gautam@linux.ibm.com>
> > ---
> >  arch/powerpc/kvm/book3s_hv.c        |  9 ++++++++-
> >  arch/powerpc/kvm/book3s_hv_nested.c | 20 ++++++++++++++++----
> >  2 files changed, 24 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index cea28ac05923..0586fa636707 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -4178,6 +4178,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
> >  	}
> >  	hvregs.hdec_expiry = time_limit;
> >  
> > +	// clear doorbell bit as hvregs already has the info
> > +	vcpu->arch.doorbell_request = 0;
> > +
> >  	/*
> >  	 * When setting DEC, we must always deal with irq_work_raise
> >  	 * via NMI vs setting DEC. The problem occurs right as we
> > @@ -4694,6 +4697,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
> >  	struct kvm_nested_guest *nested = vcpu->arch.nested;
> >  	unsigned long flags;
> >  	u64 tb;
> > +	bool doorbell_pending;
> >  
> >  	trace_kvmppc_run_vcpu_enter(vcpu);
> >  
> > @@ -4752,6 +4756,9 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
> >  	 */
> >  	smp_mb();
> >  
> > +	doorbell_pending = !cpu_has_feature(CPU_FTR_ARCH_300) &&
> > +				vcpu->arch.doorbell_request;
> 
> Hmm... is the feature test flipped here?

Sorry for responding late, I got involved in some other things. 
Yes I think I got that part wrong, I guess it should've been

doorbell_pending = !cpu_has_feature(CPU_FTR_HVMODE) &&
                        vcpu->arch.doorbell_request;

The objective of introducing this is to avoid returning to L1 midway
when L0 is about to run L2. The issue is that if L1 does H_ENTER_NESTED
and there is a doorbell for L2, this condition in kvmhv_run_single_vcpu
will cause L0 to abort and go back to L1:

	} else if (vcpu->arch.pending_exceptions ||
		   vcpu->arch.doorbell_request ||
		   xive_interrupt_pending(vcpu)) {
		vcpu->arch.ret = RESUME_HOST;
		goto out;
	}

Earlier, vc->dpdes was used to pass around doorbell state, that's why
this condition did not cause problems, until 
6398326b9ba1 ("KVM: PPC: Book3S HV P9: Stop using vc->dpdes")

> 
> > +
> >  	if (!nested) {
> >  		kvmppc_core_prepare_to_enter(vcpu);
> >  		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
> > @@ -4769,7 +4776,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
> >  				lpcr |= LPCR_MER;
> >  		}
> >  	} else if (vcpu->arch.pending_exceptions ||
> > -		   vcpu->arch.doorbell_request ||
> > +		   doorbell_pending ||
> >  		   xive_interrupt_pending(vcpu)) {
> >  		vcpu->arch.ret = RESUME_HOST;
> >  		goto out;
> > diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> > index 05f5220960c6..b34eefa6b268 100644
> > --- a/arch/powerpc/kvm/book3s_hv_nested.c
> > +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> > @@ -32,7 +32,10 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> >  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> >  
> >  	hr->pcr = vc->pcr | PCR_MASK;
> > -	hr->dpdes = vc->dpdes;
> > +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> > +		hr->dpdes = vcpu->arch.doorbell_request;
> > +	else
> > +		hr->dpdes = vc->dpdes;
> >  	hr->hfscr = vcpu->arch.hfscr;
> >  	hr->tb_offset = vc->tb_offset;
> >  	hr->dawr0 = vcpu->arch.dawr0;
> 
> Great find.
> 
> Nested is all POWER9 and later only, so I think you can just
> change to using doorbell_request always.

Noted.

> 
> And probably don't have to do anything for book3s_hv.c unless
> I'm mistaken about the feature test.
>

As pointed out above, the intention was to avoid the "else if" part in
kvmhv_run_single_vcpu(). Please do  point out if I missed something here.

Thanks,
Gautam
diff mbox series

Patch

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cea28ac05923..0586fa636707 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4178,6 +4178,9 @@  static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
 	}
 	hvregs.hdec_expiry = time_limit;
 
+	// clear doorbell bit as hvregs already has the info
+	vcpu->arch.doorbell_request = 0;
+
 	/*
 	 * When setting DEC, we must always deal with irq_work_raise
 	 * via NMI vs setting DEC. The problem occurs right as we
@@ -4694,6 +4697,7 @@  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 	struct kvm_nested_guest *nested = vcpu->arch.nested;
 	unsigned long flags;
 	u64 tb;
+	bool doorbell_pending;
 
 	trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -4752,6 +4756,9 @@  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 	 */
 	smp_mb();
 
+	doorbell_pending = !cpu_has_feature(CPU_FTR_ARCH_300) &&
+				vcpu->arch.doorbell_request;
+
 	if (!nested) {
 		kvmppc_core_prepare_to_enter(vcpu);
 		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
@@ -4769,7 +4776,7 @@  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 				lpcr |= LPCR_MER;
 		}
 	} else if (vcpu->arch.pending_exceptions ||
-		   vcpu->arch.doorbell_request ||
+		   doorbell_pending ||
 		   xive_interrupt_pending(vcpu)) {
 		vcpu->arch.ret = RESUME_HOST;
 		goto out;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 05f5220960c6..b34eefa6b268 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -32,7 +32,10 @@  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
 	hr->pcr = vc->pcr | PCR_MASK;
-	hr->dpdes = vc->dpdes;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		hr->dpdes = vcpu->arch.doorbell_request;
+	else
+		hr->dpdes = vc->dpdes;
 	hr->hfscr = vcpu->arch.hfscr;
 	hr->tb_offset = vc->tb_offset;
 	hr->dawr0 = vcpu->arch.dawr0;
@@ -105,7 +108,10 @@  static void save_hv_return_state(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	hr->dpdes = vc->dpdes;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		hr->dpdes = vcpu->arch.doorbell_request;
+	else
+		hr->dpdes = vc->dpdes;
 	hr->purr = vcpu->arch.purr;
 	hr->spurr = vcpu->arch.spurr;
 	hr->ic = vcpu->arch.ic;
@@ -143,7 +149,10 @@  static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
 	vc->pcr = hr->pcr | PCR_MASK;
-	vc->dpdes = hr->dpdes;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		vcpu->arch.doorbell_request = hr->dpdes;
+	else
+		vc->dpdes = hr->dpdes;
 	vcpu->arch.hfscr = hr->hfscr;
 	vcpu->arch.dawr0 = hr->dawr0;
 	vcpu->arch.dawrx0 = hr->dawrx0;
@@ -170,7 +179,10 @@  void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	vc->dpdes = hr->dpdes;
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && !vcpu->arch.doorbell_request)
+		vcpu->arch.doorbell_request = hr->dpdes;
+	else
+		vc->dpdes = hr->dpdes;
 	vcpu->arch.hfscr = hr->hfscr;
 	vcpu->arch.purr = hr->purr;
 	vcpu->arch.spurr = hr->spurr;