[v2,08/11] powerpc/kvm/xive: Keep escalation interrupt masked unless ceded

Message ID 20171123043619.15301-8-benh@kernel.crashing.org

Commit Message

Benjamin Herrenschmidt Nov. 23, 2017, 4:36 a.m. UTC
This works on top of the single escalation support. When in single
escalation mode, with this change, we will keep the escalation interrupt
masked unless the VCPU is in H_CEDE (idle). In any other case, we
know the VCPU will be rescheduled and thus there is no need to take
escalation interrupts in the host whenever a guest interrupt fires.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/kvm_host.h     |  2 ++
 arch/powerpc/kernel/asm-offsets.c       |  3 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 64 +++++++++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_xive.c          | 30 ++++++++++++++++
 4 files changed, 97 insertions(+), 2 deletions(-)
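
[Editorial note] For readers who want the shape of the scheme without parsing
the assembly, here is a rough C-level sketch of the guest-entry half. It is
illustrative only: the real logic is the assembly added to
book3s_hv_rmhandlers.S below, and esb_load() is a made-up stand-in for the
cache-inhibited MMIO load the entry code performs on the escalation ESB page.

	/* Sketch only -- not the actual implementation. esb_load() stands
	 * in for a load from the ESB page at the given offset; such a load
	 * changes the PQ state and returns the previous PQ bits. The real
	 * address is used because the entry code runs in real mode.
	 */
	static void entry_mask_escalation_sketch(struct kvm_vcpu *vcpu)
	{
		u64 old_pq;

		if (!vcpu->arch.xive_esc_on)
			return;		/* escalation already masked */

		/* Mask the escalation: PQ <- 01 */
		old_pq = esb_load(vcpu->arch.xive_esc_raddr,
				  XIVE_ESB_SET_PQ_01);

		/*
		 * If P was already set, the escalation fired and is on its
		 * way to the host queue; leave xive_esc_on set so that the
		 * interrupt handler (or a later cede) deals with it.
		 */
		if (!(old_pq & XIVE_ESB_VAL_P))
			vcpu->arch.xive_esc_on = false;
	}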

Comments

Paul Mackerras Nov. 25, 2017, 4:56 a.m. UTC | #1
On Thu, Nov 23, 2017 at 03:36:16PM +1100, Benjamin Herrenschmidt wrote:
> This works on top of the single escalation support. When in single
> escalation mode, with this change, we will keep the escalation interrupt
> masked unless the VCPU is in H_CEDE (idle). In any other case, we
> know the VCPU will be rescheduled and thus there is no need to take
> escalation interrupts in the host whenever a guest interrupt fires.

Some comments below...

> @@ -2705,7 +2740,32 @@ kvm_cede_prodded:
>  	/* we've ceded but we want to give control to the host */
>  kvm_cede_exit:
>  	ld	r9, HSTATE_KVM_VCPU(r13)
> -	b	guest_exit_cont
> +#ifdef CONFIG_KVM_XICS
> +	/* Abort if we still have a pending escalation */

This comment might be clearer if you say "Cancel cede" rather than
"Abort".

> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index eef9ccafdc09..7a047bc88f11 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
>  	if (vcpu->arch.ceded)
>  		kvmppc_fast_vcpu_kick(vcpu);
>  
> +	/* Since we have the no-EOI flag, the interrupt is effectively
> +	 * disabled now. Clearing xive_esc_on means we won't bother
> +	 * doing so on the next entry.
> +	 *
> +	 * This also allows the entry code to know that if a PQ combination
> +	 * of 10 is observed while xive_esc_on is true, it means the queue
> +	 * contains an unprocessed escalation interrupt. We don't make use of
> +	 * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)

Is "We don't make use of that knowledge" actually true?  I thought we
did make use of it in the assembly code in book3s_hv_rmhandlers.S (in
the code that this patch adds).  In what way don't we make use of it?

Paul.
Benjamin Herrenschmidt Nov. 26, 2017, 9:55 p.m. UTC | #2
On Sat, 2017-11-25 at 15:56 +1100, Paul Mackerras wrote:
> On Thu, Nov 23, 2017 at 03:36:16PM +1100, Benjamin Herrenschmidt wrote:
> > This works on top of the single escalation support. When in single
> > escalation mode, with this change, we will keep the escalation interrupt
> > masked unless the VCPU is in H_CEDE (idle). In any other case, we
> > know the VCPU will be rescheduled and thus there is no need to take
> > escalation interrupts in the host whenever a guest interrupt fires.
> 
> Some comments below...
> 
> > @@ -2705,7 +2740,32 @@ kvm_cede_prodded:
> >  	/* we've ceded but we want to give control to the host */
> >  kvm_cede_exit:
> >  	ld	r9, HSTATE_KVM_VCPU(r13)
> > -	b	guest_exit_cont
> > +#ifdef CONFIG_KVM_XICS
> > +	/* Abort if we still have a pending escalation */
> 
> This comment might be clearer if you say "Cancel cede" rather than
> "Abort".
> 
> > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> > index eef9ccafdc09..7a047bc88f11 100644
> > --- a/arch/powerpc/kvm/book3s_xive.c
> > +++ b/arch/powerpc/kvm/book3s_xive.c
> > @@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
> >  	if (vcpu->arch.ceded)
> >  		kvmppc_fast_vcpu_kick(vcpu);
> >  
> > +	/* Since we have the no-EOI flag, the interrupt is effectively
> > +	 * disabled now. Clearing xive_esc_on means we won't bother
> > +	 * doing so on the next entry.
> > +	 *
> > +	 * This also allows the entry code to know that if a PQ combination
> > +	 * of 10 is observed while xive_esc_on is true, it means the queue
> > +	 * contains an unprocessed escalation interrupt. We don't make use of
> > +	 * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
> 
> Is "We don't make use of that knowledge" actually true?  I thought we
> did make use of it in the assembly code in book3s_hv_rmhandlers.S (in
> the code that this patch adds).  In what way don't we make use of it?

Obsolete comment, sorry.

Cheers,
Ben.
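
[Editorial note] For context on the PQ discussion above: each XIVE interrupt
source exposes a two-bit P/Q state through loads at special offsets in its
ESB page. The constants used by this patch come from
arch/powerpc/include/asm/xive-regs.h; the summary below is informal and only
as precise as needed to follow the thread.

	/* ESB load offsets (per asm/xive-regs.h at the time of this patch):
	 *
	 *   XIVE_ESB_SET_PQ_00  0xc00   PQ <- 00: unmasked, idle
	 *   XIVE_ESB_SET_PQ_01  0xd00   PQ <- 01: masked
	 *   XIVE_ESB_SET_PQ_10  0xe00   PQ <- 10: pending
	 *   XIVE_ESB_SET_PQ_11  0xf00   PQ <- 11: pending, trigger coalesced
	 *
	 * Each such load also returns the *previous* PQ bits:
	 *
	 *   XIVE_ESB_VAL_P      0x2     P: an event was sent to a queue
	 *   XIVE_ESB_VAL_Q      0x1     Q: a trigger arrived while P was set
	 *
	 * So observing PQ == 10 while xive_esc_on is true means the
	 * escalation fired and its event sits unprocessed in the queue.
	 */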

Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8a4e77273b07..e433fe2ce4b7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -744,6 +744,8 @@ struct kvm_vcpu_arch {
 	u8 xive_pushed;		 /* Is the VP pushed on the physical CPU ? */
 	u8 xive_esc_on;		 /* Is the escalation irq enabled ? */
 	union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */
+	u64 xive_esc_raddr;	 /* Escalation interrupt ESB real addr */
+	u64 xive_esc_vaddr;	 /* Escalation interrupt ESB virt addr */
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0dc911a1feac..eff521c67ec3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -731,6 +731,9 @@ int main(void)
 	DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
 					    arch.xive_cam_word));
 	DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
+	DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on));
+	DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr));
+	DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr));
 #endif
 
 #ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 286bcc4a73c2..ec66b0e07f47 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1006,7 +1006,42 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	 * issue.
 	 */
 	li	r0,0
-	stb	r0, VCPU_IRQ_PENDING(r4);
+	stb	r0, VCPU_IRQ_PENDING(r4)
+
+	/*
+	 * In single escalation mode, if the escalation interrupt is
+	 * on, we mask it.
+	 */
+	lbz	r0, VCPU_XIVE_ESC_ON(r4)
+	cmpwi	r0,0
+	beq	1f
+	ld	r10, VCPU_XIVE_ESC_RADDR(r4)
+	li	r9, XIVE_ESB_SET_PQ_01
+	ldcix	r0, r10, r9
+	sync
+
+	/* We have a possible subtle race here: The escalation interrupt might
+	 * have fired and be on its way to the host queue while we mask it,
+	 * and if we unmask it early enough (re-cede right away), there is
+	 * a theoretical possibility that it fires again, thus landing in the
+	 * target queue more than once, which is a big no-no.
+	 *
+	 * Fortunately, solving this is rather easy. If the above load setting
+	 * PQ to 01 returns a previous value where P is set, then we know the
+	 * escalation interrupt is somewhere on its way to the host. In that
+	 * case we simply don't clear the xive_esc_on flag below. It will be
+	 * eventually cleared by the handler for the escalation interrupt.
+	 *
+	 * Then, when doing a cede, we check that flag again before re-enabling
+	 * the escalation interrupt, and if set, we abort the cede.
+	 */
+	andi.	r0, r0, XIVE_ESB_VAL_P
+	bne-	1f
+
+	/* Now P is 0, we can clear the flag */
+	li	r0, 0
+	stb	r0, VCPU_XIVE_ESC_ON(r4)
+1:
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
@@ -2705,7 +2740,32 @@ kvm_cede_prodded:
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
 	ld	r9, HSTATE_KVM_VCPU(r13)
-	b	guest_exit_cont
+#ifdef CONFIG_KVM_XICS
+	/* Abort if we still have a pending escalation */
+	lbz	r5, VCPU_XIVE_ESC_ON(r9)
+	cmpwi	r5, 0
+	beq	1f
+	li	r0, 0
+	stb	r0, VCPU_CEDED(r9)
+1:	/* Enable XIVE escalation */
+	li	r5, XIVE_ESB_SET_PQ_00
+	mfmsr	r0
+	andi.	r0, r0, MSR_DR		/* in real mode? */
+	beq	1f
+	ld	r10, VCPU_XIVE_ESC_VADDR(r9)
+	cmpdi	r10, 0
+	beq	3f
+	ldx	r0, r10, r5
+	b	2f
+1:	ld	r10, VCPU_XIVE_ESC_RADDR(r9)
+	cmpdi	r10, 0
+	beq	3f
+	ldcix	r0, r10, r5
+2:	sync
+	li	r0, 1
+	stb	r0, VCPU_XIVE_ESC_ON(r9)
+#endif /* CONFIG_KVM_XICS */
+3:	b	guest_exit_cont
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
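
[Editorial note] For symmetry with the entry-side sketch earlier, here is a
loose C rendering of the cede-exit hunk above. Again a sketch, not the real
code: in_real_mode() and esb_load() are invented names, and the MSR_DR test
in the assembly is what picks between the cached virtual and real ESB
addresses.

	static void cede_exit_sketch(struct kvm_vcpu *vcpu)
	{
		u64 esb;

		/* Escalation already in flight? Then cancel the cede. */
		if (vcpu->arch.xive_esc_on)
			vcpu->arch.ceded = 0;

		/* Re-enable the escalation interrupt: PQ <- 00 */
		esb = in_real_mode() ? vcpu->arch.xive_esc_raddr
				     : vcpu->arch.xive_esc_vaddr;
		if (esb) {
			esb_load(esb, XIVE_ESB_SET_PQ_00);
			vcpu->arch.xive_esc_on = true;
		}
		/* ... and fall through to guest_exit_cont */
	}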
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index eef9ccafdc09..7a047bc88f11 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
 	if (vcpu->arch.ceded)
 		kvmppc_fast_vcpu_kick(vcpu);
 
+	/* Since we have the no-EOI flag, the interrupt is effectively
+	 * disabled now. Clearing xive_esc_on means we won't bother
+	 * doing so on the next entry.
+	 *
+	 * This also allows the entry code to know that if a PQ combination
+	 * of 10 is observed while xive_esc_on is true, it means the queue
+	 * contains an unprocessed escalation interrupt. We don't make use of
+	 * that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
+	 */
+	vcpu->arch.xive_esc_on = false;
+
 	return IRQ_HANDLED;
 }
 
@@ -134,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
 		goto error;
 	}
 	xc->esc_virq_names[prio] = name;
+
+	/* In single escalation mode, we grab the ESB MMIO of the
+	 * interrupt and mask it. Also populate the VCPU v/raddr
+	 * of the ESB page for use by asm entry/exit code. Finally
+	 * set the XIVE_IRQ_NO_EOI flag which will prevent the
+	 * core code from performing an EOI on the escalation
+	 * interrupt, thus leaving it effectively masked after
+	 * it fires once.
+	 */
+	if (xc->xive->single_escalation) {
+		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
+		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+		vcpu->arch.xive_esc_raddr = xd->eoi_page;
+		vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
+		xd->flags |= XIVE_IRQ_NO_EOI;
+	}
+
 	return 0;
 error:
 	irq_dispose_mapping(xc->esc_virq[prio]);
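
[Editorial note] Taken together, the pieces of this patch give the escalation
interrupt the following life cycle in single escalation mode (an informal
summary of the hunks above):

1. xive_attach_escalation() masks the escalation (PQ <- 01), caches the ESB
   real and virtual addresses in the vcpu, and sets XIVE_IRQ_NO_EOI so the
   generic flow handler never EOIs (i.e. never re-enables) it.
2. On guest entry, if xive_esc_on is set, the interrupt is re-masked
   (PQ <- 01) and xive_esc_on is cleared, unless the returned P bit shows
   the escalation is already in flight.
3. On H_CEDE, before handing control to the host, the escalation is unmasked
   (PQ <- 00) and xive_esc_on is set; if xive_esc_on was still set from an
   in-flight escalation, the cede is cancelled first.
4. When the escalation fires, xive_esc_irq() kicks the ceded vcpu and clears
   xive_esc_on; because of XIVE_IRQ_NO_EOI the interrupt remains effectively
   masked until the next cede.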