Message ID | 5718BBB202000078000E4484@prv-mh.provo.novell.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
>>> On 21.04.16 at 11:38, <JBeulich@suse.com> wrote: > Recent changes to Linux result in there just being a single unmask > operation prior to expecting the first interrupts to arrive. However, > we've had a chicken-and-egg problem here: Qemu invokes > xc_domain_update_msi_irq(), ultimately leading to > msixtbl_pt_register(), upon seeing that first unmask operation. Yet > for msixtbl_range() to return true (in order to msixtbl_write() to get > invoked at all) msixtbl_pt_register() must have completed. > > Deal with this by snooping suitable writes in msixtbl_range() and > triggering the invocation of msix_write_completion() from > msixtbl_pt_register() when that happens in the context of a still in > progress vector control field write. > > Note that the seemingly unrelated deletion of the redundant > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to > any compiler version used that the "msi_desc" local variable isn't > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is > just for consistency reasons.) > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())? One option I just now thought of might be to also look at batches of up to 4 dword or 2 qword writes there, tracking the address of the one that writes to the vector control field. One might even limit this to taking into consideration only the last iteration on such batches, on the assumption that if at all it should be that last one which writes the field of interest. (The fundamental assumption then would be that no OS uses REP MOVS to fill data for more than one table entry at a time.) This would then also cover various Windows versions. Jan
> -----Original Message----- > From: Jan Beulich [mailto:JBeulich@suse.com] > Sent: 21 April 2016 10:38 > To: xen-devel > Cc: Andrew Cooper; Paul Durrant; Wei Liu; Keir (Xen.org) > Subject: [PATCH] x86/vMSI-X: avoid missing first unmask of vectors > > Recent changes to Linux result in there just being a single unmask > operation prior to expecting the first interrupts to arrive. However, > we've had a chicken-and-egg problem here: Qemu invokes > xc_domain_update_msi_irq(), ultimately leading to > msixtbl_pt_register(), upon seeing that first unmask operation. Yet > for msixtbl_range() to return true (in order to msixtbl_write() to get > invoked at all) msixtbl_pt_register() must have completed. > > Deal with this by snooping suitable writes in msixtbl_range() and > triggering the invocation of msix_write_completion() from > msixtbl_pt_register() when that happens in the context of a still in > progress vector control field write. > > Note that the seemingly unrelated deletion of the redundant > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to > any compiler version used that the "msi_desc" local variable isn't > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is > just for consistency reasons.) > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())? > > --- a/xen/arch/x86/hvm/vmsi.c > +++ b/xen/arch/x86/hvm/vmsi.c > @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v, > desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr); > rcu_read_unlock(&msixtbl_rcu_lock); > > - return !!desc; > + if ( desc ) > + return 1; > + > + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == > + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > + { > + const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req; > + If you need to start digging into the ioreq here then I think it would be better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops. 
That way it gets the ioreq passed the accept method without any need to dig. Paul > + ASSERT(r->type == IOREQ_TYPE_COPY); > + if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr > && !(r->data & PCI_MSIX_VECTOR_BITMASK) ) > + v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr; > + } > + > + return 0; > } > > static const struct hvm_mmio_ops msixtbl_mmio_ops = { > @@ -410,9 +434,6 @@ int msixtbl_pt_register(struct domain *d > return r; > } > > - if ( !irq_desc->msi_desc ) > - goto out; > - > msi_desc = irq_desc->msi_desc; > if ( !msi_desc ) > goto out; > @@ -437,6 +458,21 @@ found: > out: > spin_unlock_irq(&irq_desc->lock); > xfree(new_entry); > + > + if ( !r ) > + { > + struct vcpu *v; > + > + for_each_vcpu ( d, v ) > + { > + if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address == > + (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE > + > + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) ) > + v->arch.hvm_vcpu.hvm_io.msix_unmask_address = > + v->arch.hvm_vcpu.hvm_io.msix_snoop_address; > + } > + } > + > return r; > } > > @@ -457,9 +496,6 @@ void msixtbl_pt_unregister(struct domain > if ( !irq_desc ) > return; > > - if ( !irq_desc->msi_desc ) > - goto out; > - > msi_desc = irq_desc->msi_desc; > if ( !msi_desc ) > goto out; > @@ -522,6 +558,8 @@ void msix_write_completion(struct vcpu * > { > unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address; > > + v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0; > + > if ( !ctrl_address ) > return; > > --- a/xen/include/asm-x86/hvm/vcpu.h > +++ b/xen/include/asm-x86/hvm/vcpu.h > @@ -85,6 +85,7 @@ struct hvm_vcpu_io { > bool_t mmio_retry; > > unsigned long msix_unmask_address; > + unsigned long msix_snoop_address; > > const struct g2m_ioport *g2m_ioport; > }; > >
>>> On 21.04.16 at 13:33, <Paul.Durrant@citrix.com> wrote: >> From: Jan Beulich [mailto:JBeulich@suse.com] >> Sent: 21 April 2016 10:38 >> --- a/xen/arch/x86/hvm/vmsi.c >> +++ b/xen/arch/x86/hvm/vmsi.c >> @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v, >> desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr); >> rcu_read_unlock(&msixtbl_rcu_lock); >> >> - return !!desc; >> + if ( desc ) >> + return 1; >> + >> + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == >> + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) >> + { >> + const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req; >> + > > If you need to start digging into the ioreq here then I think it would be > better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops. That > way it gets the ioreq passed the accept method without any need to dig. I can certainly try and see how this works out, but it's relatively clear that it'll make the patch bigger and backporting more difficult. So maybe this would better be left as a subsequent cleanup exercise? Jan
> -----Original Message----- > From: Jan Beulich [mailto:JBeulich@suse.com] > Sent: 21 April 2016 12:44 > To: Paul Durrant > Cc: Andrew Cooper; Wei Liu; xen-devel; Keir (Xen.org) > Subject: RE: [PATCH] x86/vMSI-X: avoid missing first unmask of vectors > > >>> On 21.04.16 at 13:33, <Paul.Durrant@citrix.com> wrote: > >> From: Jan Beulich [mailto:JBeulich@suse.com] > >> Sent: 21 April 2016 10:38 > >> --- a/xen/arch/x86/hvm/vmsi.c > >> +++ b/xen/arch/x86/hvm/vmsi.c > >> @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v, > >> desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr); > >> rcu_read_unlock(&msixtbl_rcu_lock); > >> > >> - return !!desc; > >> + if ( desc ) > >> + return 1; > >> + > >> + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == > >> + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) > >> + { > >> + const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req; > >> + > > > > If you need to start digging into the ioreq here then I think it would be > > better to have vmsi register a full hvm_io_ops rather than hvm_mmio_ops. > That > > way it gets the ioreq passed the accept method without any need to dig. > > I can certainly try and see how this works out, but it's relatively > clear that it'll make the patch bigger and backporting more difficult. > So maybe this would better be left as a subsequent cleanup > exercise? > True, it would make backporting more tricky so maybe a follow-up patch would be the better approach. In which case... Reviewed-by: Paul Durrant <paul.durrant@citrix.com> > Jan
On Thu, Apr 21, 2016 at 03:38:26AM -0600, Jan Beulich wrote: > Recent changes to Linux result in there just being a single unmask > operation prior to expecting the first interrupts to arrive. However, > we've had a chicken-and-egg problem here: Qemu invokes > xc_domain_update_msi_irq(), ultimately leading to > msixtbl_pt_register(), upon seeing that first unmask operation. Yet > for msixtbl_range() to return true (in order to msixtbl_write() to get > invoked at all) msixtbl_pt_register() must have completed. > > Deal with this by snooping suitable writes in msixtbl_range() and > triggering the invocation of msix_write_completion() from > msixtbl_pt_register() when that happens in the context of a still in > progress vector control field write. > > Note that the seemingly unrelated deletion of the redundant > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to > any compiler version used that the "msi_desc" local variable isn't > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is > just for consistency reasons.) > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())? > As I understand it, this should be future improvement. The patch itself is an improvement in its own right so it can go in: Release-acked-by: Wei Liu <wei.liu2@citrix.com>
>>> On 25.04.16 at 15:25, <wei.liu2@citrix.com> wrote: > On Thu, Apr 21, 2016 at 03:38:26AM -0600, Jan Beulich wrote: >> Recent changes to Linux result in there just being a single unmask >> operation prior to expecting the first interrupts to arrive. However, >> we've had a chicken-and-egg problem here: Qemu invokes >> xc_domain_update_msi_irq(), ultimately leading to >> msixtbl_pt_register(), upon seeing that first unmask operation. Yet >> for msixtbl_range() to return true (in order to msixtbl_write() to get >> invoked at all) msixtbl_pt_register() must have completed. >> >> Deal with this by snooping suitable writes in msixtbl_range() and >> triggering the invocation of msix_write_completion() from >> msixtbl_pt_register() when that happens in the context of a still in >> progress vector control field write. >> >> Note that the seemingly unrelated deletion of the redundant >> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to >> any compiler version used that the "msi_desc" local variable isn't >> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is >> just for consistency reasons.) >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> --- >> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())? >> > > As I understand it, this should be future improvement. Yes. I've actually been working on this the last couple of hours. > The patch itself > is an improvement in its own right so it can go in: > > Release-acked-by: Wei Liu <wei.liu2@citrix.com> Thanks, Jan
--- a/xen/arch/x86/hvm/vmsi.c +++ b/xen/arch/x86/hvm/vmsi.c @@ -341,7 +352,21 @@ static int msixtbl_range(struct vcpu *v, desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr); rcu_read_unlock(&msixtbl_rcu_lock); - return !!desc; + if ( desc ) + return 1; + + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) + { + const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req; + + ASSERT(r->type == IOREQ_TYPE_COPY); + if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr + && !(r->data & PCI_MSIX_VECTOR_BITMASK) ) + v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr; + } + + return 0; } static const struct hvm_mmio_ops msixtbl_mmio_ops = { @@ -410,9 +434,6 @@ int msixtbl_pt_register(struct domain *d return r; } - if ( !irq_desc->msi_desc ) - goto out; - msi_desc = irq_desc->msi_desc; if ( !msi_desc ) goto out; @@ -437,6 +458,21 @@ found: out: spin_unlock_irq(&irq_desc->lock); xfree(new_entry); + + if ( !r ) + { + struct vcpu *v; + + for_each_vcpu ( d, v ) + { + if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address == + (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) ) + v->arch.hvm_vcpu.hvm_io.msix_unmask_address = + v->arch.hvm_vcpu.hvm_io.msix_snoop_address; + } + } + return r; } @@ -457,9 +496,6 @@ void msixtbl_pt_unregister(struct domain if ( !irq_desc ) return; - if ( !irq_desc->msi_desc ) - goto out; - msi_desc = irq_desc->msi_desc; if ( !msi_desc ) goto out; @@ -522,6 +558,8 @@ void msix_write_completion(struct vcpu * { unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address; + v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0; + if ( !ctrl_address ) return; --- a/xen/include/asm-x86/hvm/vcpu.h +++ b/xen/include/asm-x86/hvm/vcpu.h @@ -85,6 +85,7 @@ struct hvm_vcpu_io { bool_t mmio_retry; unsigned long msix_unmask_address; + unsigned long msix_snoop_address; const struct g2m_ioport *g2m_ioport; };