diff mbox

[v8,4/7] VT-d: Use one function to update both remapped and posted IRTE

Message ID 1479434244-10223-5-git-send-email-feng.wu@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wu, Feng Nov. 18, 2016, 1:57 a.m. UTC
Use one function to update both remapped IRTE and posted IRET.

Signed-off-by: Feng Wu <feng.wu@intel.com>
---
v8:
- Newly added

 xen/drivers/passthrough/vtd/intremap.c | 162 ++++++++++++++-------------------
 1 file changed, 66 insertions(+), 96 deletions(-)

Comments

Tian, Kevin Nov. 18, 2016, 4:31 a.m. UTC | #1
> From: Wu, Feng
> Sent: Friday, November 18, 2016 9:57 AM
> 
> Use one function to update both remapped IRTE and posted IRET.
> 
> Signed-off-by: Feng Wu <feng.wu@intel.com>
> ---
> v8:
> - Newly added
> 
>  xen/drivers/passthrough/vtd/intremap.c | 162 ++++++++++++++-------------------
>  1 file changed, 66 insertions(+), 96 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/vtd/intremap.c
> b/xen/drivers/passthrough/vtd/intremap.c
> index bfd468b..fd2a49a 100644
> --- a/xen/drivers/passthrough/vtd/intremap.c
> +++ b/xen/drivers/passthrough/vtd/intremap.c
> @@ -420,7 +420,7 @@ void io_apic_write_remap_rte(
>          __ioapic_write_entry(apic, ioapic_pin, 1, old_rte);
>  }
> 
> -static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
> +static void set_msi_source_id(const struct pci_dev *pdev, struct iremap_entry *ire)
>  {
>      u16 seg;
>      u8 bus, devfn, secbus;
> @@ -548,11 +548,12 @@ static int remap_entry_to_msi_msg(
>  }
> 
>  static int msi_msg_to_remap_entry(
> -    struct iommu *iommu, struct pci_dev *pdev,
> -    struct msi_desc *msi_desc, struct msi_msg *msg)
> +    struct iommu *iommu, const struct pci_dev *pdev,
> +    struct msi_desc *msi_desc, struct msi_msg *msg,
> +    const struct pi_desc *pi_desc, const uint8_t gvec)
>  {
>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
> -    struct iremap_entry new_ire;
> +    struct iremap_entry new_ire, old_ire;

old_ire is used only in later 'else' branch. move definition there.

>      struct msi_msg_remap_entry *remap_rte;
>      unsigned int index, i, nr = 1;
>      unsigned long flags;
> @@ -597,31 +598,50 @@ static int msi_msg_to_remap_entry(
> 
>      memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
> 
> -    /* Set interrupt remapping table entry */
> -    new_ire.remap.fpd = 0;
> -    new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
> -    new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> -    new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
> -    /* Hardware require RH = 1 for LPR delivery mode */
> -    new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> -    new_ire.remap.avail = 0;
> -    new_ire.remap.res_1 = 0;
> -    new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> -                            MSI_DATA_VECTOR_MASK;
> -    new_ire.remap.res_2 = 0;
> -    if ( x2apic_enabled )
> -        new_ire.remap.dst = msg->dest32;
> +    if ( !pi_desc )
> +    {
> +        /* Set interrupt remapping table entry */
> +        new_ire.remap.fpd = 0;
> +        new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) &
> 0x1;
> +        new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> +        new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) &
> 0x1;
> +        /* Hardware require RH = 1 for LPR delivery mode */
> +        new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> +        new_ire.remap.avail = 0;
> +        new_ire.remap.res_1 = 0;
> +        new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> +                                MSI_DATA_VECTOR_MASK;
> +        new_ire.remap.res_2 = 0;
> +        if ( x2apic_enabled )
> +            new_ire.remap.dst = msg->dest32;
> +        else
> +            new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> +                                 & 0xff) << 8;
> +
> +        new_ire.remap.res_3 = 0;
> +        new_ire.remap.res_4 = 0;
> +        new_ire.remap.p = 1;    /* finally, set present bit */
> +    }
>      else
> -        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> -                             & 0xff) << 8;
> +    {
> +        new_ire.post.fpd = 0;
> +        new_ire.post.res_1 = 0;
> +        new_ire.post.res_2 = 0;
> +        new_ire.post.urg = 0;
> +        new_ire.post.im = 1;
> +        new_ire.post.vector = gvec;
> +        new_ire.post.res_3 = 0;
> +        new_ire.post.res_4 = 0;
> +        new_ire.post.res_5 = 0;
> +        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
> +        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
> +        new_ire.post.p = 1;    /* finally, set present bit */
> +    }

It's a little confusing here:

- you first copy current remapping entry to new_ire and then do some initialization.
Is it necessary? If there are some fields we'd like to inherit, better to describe them
in comment so others can have a clear understanding of what exactly fields must
be updated here;

- in original code of setup_posted_irte, you actually do conditional assignment 
based on format of old_ire. Above you changed to non-conditional assignment,
not relying on old_ire. Is it desired? 

> 
>      if ( pdev )
>          set_msi_source_id(pdev, &new_ire);
>      else
>          set_hpet_source_id(msi_desc->hpet_id, &new_ire);
> -    new_ire.remap.res_3 = 0;
> -    new_ire.remap.res_4 = 0;
> -    new_ire.remap.p = 1;    /* finally, set present bit */
> 
>      /* now construct new MSI/MSI-X rte entry */
>      remap_rte = (struct msi_msg_remap_entry *)msg;
> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>      remap_rte->address_hi = 0;
>      remap_rte->data = index - i;
> 
> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
> +    if ( !pi_desc )
> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
> +    else
> +    {
> +        __uint128_t ret;
> +
> +        old_ire = *iremap_entry;
> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
> +
> +        /*
> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
> +         * and the hardware cannot update the IRTE behind us, so the return value
> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
> +         */
> +        ASSERT(ret == old_ire.val);
> +    }
> +
>      iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry));
>      iommu_flush_iec_index(iommu, 0, index);
> 
> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
> 
>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>                  : hpet_to_drhd(msi_desc->hpet_id);
> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
> +                                         msi_desc, msg, NULL, 0)
>                  : -EINVAL;
>  }
> 
> @@ -902,42 +939,6 @@ void iommu_disable_x2apic_IR(void)
>          disable_qinval(drhd->iommu);
>  }
> 
> -static void setup_posted_irte(
> -    struct iremap_entry *new_ire, const struct iremap_entry *old_ire,
> -    const struct pi_desc *pi_desc, const uint8_t gvec)
> -{
> -    memset(new_ire, 0, sizeof(*new_ire));
> -
> -    /*
> -     * 'im' filed decides whether the irte is in posted format (with value 1)
> -     * or remapped format (with value 0), if the old irte is in remapped format,
> -     * we copy things from remapped part in 'struct iremap_entry', otherwise,
> -     * we copy from posted part.
> -     */
> -    if ( !old_ire->remap.im )
> -    {
> -        new_ire->post.p = old_ire->remap.p;
> -        new_ire->post.fpd = old_ire->remap.fpd;
> -        new_ire->post.sid = old_ire->remap.sid;
> -        new_ire->post.sq = old_ire->remap.sq;
> -        new_ire->post.svt = old_ire->remap.svt;
> -    }
> -    else
> -    {
> -        new_ire->post.p = old_ire->post.p;
> -        new_ire->post.fpd = old_ire->post.fpd;
> -        new_ire->post.sid = old_ire->post.sid;
> -        new_ire->post.sq = old_ire->post.sq;
> -        new_ire->post.svt = old_ire->post.svt;
> -        new_ire->post.urg = old_ire->post.urg;
> -    }
> -
> -    new_ire->post.im = 1;
> -    new_ire->post.vector = gvec;
> -    new_ire->post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
> -    new_ire->post.pda_h = virt_to_maddr(pi_desc) >> 32;
> -}
> -
>  /*
>   * This function is used to update the IRTE for posted-interrupt
>   * when guest changes MSI/MSI-X information.
> @@ -946,17 +947,12 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>      const uint8_t gvec)
>  {
>      struct irq_desc *desc;
> -    const struct msi_desc *msi_desc;
> -    int remap_index;
> +    struct msi_desc *msi_desc;
>      int rc = 0;
>      const struct pci_dev *pci_dev;
>      const struct acpi_drhd_unit *drhd;
>      struct iommu *iommu;
> -    struct ir_ctrl *ir_ctrl;
> -    struct iremap_entry *iremap_entries = NULL, *p = NULL;
> -    struct iremap_entry new_ire, old_ire;
>      const struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> -    __uint128_t ret;
> 
>      desc = pirq_spin_lock_irq_desc(pirq, NULL);
>      if ( !desc )
> @@ -976,8 +972,6 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>          goto unlock_out;
>      }
> 
> -    remap_index = msi_desc->remap_index;
> -
>      spin_unlock_irq(&desc->lock);
> 
>      ASSERT(pcidevs_locked());
> @@ -992,35 +986,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>          return -ENODEV;
> 
>      iommu = drhd->iommu;
> -    ir_ctrl = iommu_ir_ctrl(iommu);
> -    if ( !ir_ctrl )
> +    if ( !iommu_ir_ctrl(iommu) )
>          return -ENODEV;
> 
> -    spin_lock_irq(&ir_ctrl->iremap_lock);
> -
> -    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
> -
> -    old_ire = *p;
> -
> -    /* Setup/Update interrupt remapping table entry. */
> -    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
> -    ret = cmpxchg16b(p, &old_ire, &new_ire);
> -
> -    /*
> -     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
> -     * and the hardware cannot update the IRTE behind us, so the return value
> -     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
> -     */
> -    ASSERT(ret == old_ire.val);
> -
> -    iommu_flush_cache_entry(p, sizeof(*p));
> -    iommu_flush_iec_index(iommu, 0, remap_index);
> -
> -    unmap_vtd_domain_page(iremap_entries);
> -
> -    spin_unlock_irq(&ir_ctrl->iremap_lock);
> -
> -    return 0;
> +    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
> +                                  pi_desc, gvec);
> 
>   unlock_out:
>      spin_unlock_irq(&desc->lock);
> --
> 2.1.0
Wu, Feng Nov. 22, 2016, 4:31 a.m. UTC | #2
> -----Original Message-----
> From: Tian, Kevin
> Sent: Friday, November 18, 2016 12:31 PM
> To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> Cc: jbeulich@suse.com; andrew.cooper3@citrix.com;
> george.dunlap@eu.citrix.com; dario.faggioli@citrix.com
> Subject: RE: [PATCH v8 4/7] VT-d: Use one function to update both remapped
> and posted IRTE
> 
> > From: Wu, Feng
> > Sent: Friday, November 18, 2016 9:57 AM
> >
> > Use one function to update both remapped IRTE and posted IRET.
> >
> > Signed-off-by: Feng Wu <feng.wu@intel.com>
> > ---
> > v8:
> > - Newly added
> >
> >  xen/drivers/passthrough/vtd/intremap.c | 162 ++++++++++++++-----------------
> --
> >  1 file changed, 66 insertions(+), 96 deletions(-)
> >
> > diff --git a/xen/drivers/passthrough/vtd/intremap.c
> > b/xen/drivers/passthrough/vtd/intremap.c
> > index bfd468b..fd2a49a 100644
> > --- a/xen/drivers/passthrough/vtd/intremap.c
> > +++ b/xen/drivers/passthrough/vtd/intremap.c
> > @@ -420,7 +420,7 @@ void io_apic_write_remap_rte(
> >          __ioapic_write_entry(apic, ioapic_pin, 1, old_rte);
> >  }
> >
> > -static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
> > +static void set_msi_source_id(const struct pci_dev *pdev, struct
> iremap_entry *ire)
> >  {
> >      u16 seg;
> >      u8 bus, devfn, secbus;
> > @@ -548,11 +548,12 @@ static int remap_entry_to_msi_msg(
> >  }
> >
> >  static int msi_msg_to_remap_entry(
> > -    struct iommu *iommu, struct pci_dev *pdev,
> > -    struct msi_desc *msi_desc, struct msi_msg *msg)
> > +    struct iommu *iommu, const struct pci_dev *pdev,
> > +    struct msi_desc *msi_desc, struct msi_msg *msg,
> > +    const struct pi_desc *pi_desc, const uint8_t gvec)
> >  {
> >      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
> > -    struct iremap_entry new_ire;
> > +    struct iremap_entry new_ire, old_ire;
> 
> old_ire is used only in later 'else' branch. move definition there.
> 
> >      struct msi_msg_remap_entry *remap_rte;
> >      unsigned int index, i, nr = 1;
> >      unsigned long flags;
> > @@ -597,31 +598,50 @@ static int msi_msg_to_remap_entry(
> >
> >      memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
> >
> > -    /* Set interrupt remapping table entry */
> > -    new_ire.remap.fpd = 0;
> > -    new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT)
> & 0x1;
> > -    new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> > -    new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) &
> 0x1;
> > -    /* Hardware require RH = 1 for LPR delivery mode */
> > -    new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> > -    new_ire.remap.avail = 0;
> > -    new_ire.remap.res_1 = 0;
> > -    new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> > -                            MSI_DATA_VECTOR_MASK;
> > -    new_ire.remap.res_2 = 0;
> > -    if ( x2apic_enabled )
> > -        new_ire.remap.dst = msg->dest32;
> > +    if ( !pi_desc )
> > +    {
> > +        /* Set interrupt remapping table entry */
> > +        new_ire.remap.fpd = 0;
> > +        new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT)
> &
> > 0x1;
> > +        new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> > +        new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT)
> &
> > 0x1;
> > +        /* Hardware require RH = 1 for LPR delivery mode */
> > +        new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> > +        new_ire.remap.avail = 0;
> > +        new_ire.remap.res_1 = 0;
> > +        new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> > +                                MSI_DATA_VECTOR_MASK;
> > +        new_ire.remap.res_2 = 0;
> > +        if ( x2apic_enabled )
> > +            new_ire.remap.dst = msg->dest32;
> > +        else
> > +            new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> > +                                 & 0xff) << 8;
> > +
> > +        new_ire.remap.res_3 = 0;
> > +        new_ire.remap.res_4 = 0;
> > +        new_ire.remap.p = 1;    /* finally, set present bit */
> > +    }
> >      else
> > -        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> > -                             & 0xff) << 8;
> > +    {
> > +        new_ire.post.fpd = 0;
> > +        new_ire.post.res_1 = 0;
> > +        new_ire.post.res_2 = 0;
> > +        new_ire.post.urg = 0;
> > +        new_ire.post.im = 1;
> > +        new_ire.post.vector = gvec;
> > +        new_ire.post.res_3 = 0;
> > +        new_ire.post.res_4 = 0;
> > +        new_ire.post.res_5 = 0;
> > +        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
> > +        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
> > +        new_ire.post.p = 1;    /* finally, set present bit */
> > +    }
> 
> It's a little confusing here:
> 
> - you first copy current remapping entry to new_ire and then do some
> initialization.
> Is it necessary? If there are some fields we'd like to inherit, better to describe
> them
> in comment so others can have a clear understanding of what exactly fields
> must
> be updated here;

This is needed, please refer to my replay to [5/7].

> 
> - in original code of setup_posted_irte, you actually do conditional assignment
> based on format of old_ire. Above you changed to non-conditional assignment,
> not relying on old_ire. Is it desired?

In setup_posted_irte() we need to inherit the common fields (fpd, sid, sq, svt)
from the old IRTE. But now this common fields are setup by set_msi_source_id()
in this function.

Thanks,
Feng
Jan Beulich Nov. 22, 2016, 9:58 a.m. UTC | #3
>>> On 18.11.16 at 02:57, <feng.wu@intel.com> wrote:
> @@ -597,31 +598,50 @@ static int msi_msg_to_remap_entry(

Considering you basically re-do most of the function, I think there's
some more adjustment necessary (or at least very desirable) here.

>  
>      memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
>  
> -    /* Set interrupt remapping table entry */
> -    new_ire.remap.fpd = 0;
> -    new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
> -    new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> -    new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
> -    /* Hardware require RH = 1 for LPR delivery mode */
> -    new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> -    new_ire.remap.avail = 0;
> -    new_ire.remap.res_1 = 0;
> -    new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> -                            MSI_DATA_VECTOR_MASK;
> -    new_ire.remap.res_2 = 0;
> -    if ( x2apic_enabled )
> -        new_ire.remap.dst = msg->dest32;
> +    if ( !pi_desc )
> +    {
> +        /* Set interrupt remapping table entry */
> +        new_ire.remap.fpd = 0;
> +        new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
> +        new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;

These two "& 0x1" seem unnecessary, as the fields are 1 bit only
anyway.

> +        new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;

This masking, however, has always been puzzling me: dlm is a 3 bit
field, and the MSI message field is a 3-bit one too. Hence the
masking should also be dropped here - if the message field is wrong
(i.e. holding an unsupported value), there's no good reason to try
to compensate for it here. If at all the function should refuse to do
the requested translation.

> +        /* Hardware require RH = 1 for LPR delivery mode */
> +        new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
> +        new_ire.remap.avail = 0;
> +        new_ire.remap.res_1 = 0;
> +        new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
> +                                MSI_DATA_VECTOR_MASK;
> +        new_ire.remap.res_2 = 0;
> +        if ( x2apic_enabled )
> +            new_ire.remap.dst = msg->dest32;
> +        else
> +            new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> +                                 & 0xff) << 8;
> +
> +        new_ire.remap.res_3 = 0;
> +        new_ire.remap.res_4 = 0;
> +        new_ire.remap.p = 1;    /* finally, set present bit */

I don't understand this comment. The order of operations does not
matter at all, and in fact you now set p before being done with all
other fields. Please drop such misleading comments, or make them
useful/correct.

Also, the only field you don't explicitly set here is .im. Why?

> +    }
>      else
> -        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
> -                             & 0xff) << 8;
> +    {
> +        new_ire.post.fpd = 0;
> +        new_ire.post.res_1 = 0;
> +        new_ire.post.res_2 = 0;
> +        new_ire.post.urg = 0;
> +        new_ire.post.im = 1;
> +        new_ire.post.vector = gvec;
> +        new_ire.post.res_3 = 0;
> +        new_ire.post.res_4 = 0;
> +        new_ire.post.res_5 = 0;
> +        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
> +        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
> +        new_ire.post.p = 1;    /* finally, set present bit */
> +    }

Along the lines of the comment above, you don't fill avail here. Why?

Taking both together, I don't see why - after adding said initialization -
new_ire needs to start out as a copy of the live IRTE. Instead you
could memset() the whole structure to zero, or simply give it an empty
initializer (saving you from initializing all the reserved fields, plus some
more).

And of course, along the lines of ...

>      if ( pdev )
>          set_msi_source_id(pdev, &new_ire);
>      else
>          set_hpet_source_id(msi_desc->hpet_id, &new_ire);

... this, I see little reason for common fields to be initialized separately
on each path. According to the code above that's only p (leaving
aside fields which get zeroed), but anyway. Perhaps there should
even be a common sub-structure of the union ...

> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>      remap_rte->address_hi = 0;
>      remap_rte->data = index - i;
>  
> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
> +    if ( !pi_desc )
> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
> +    else
> +    {
> +        __uint128_t ret;
> +
> +        old_ire = *iremap_entry;
> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
> +
> +        /*
> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
> +         * and the hardware cannot update the IRTE behind us, so the return value
> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
> +         */
> +        ASSERT(ret == old_ire.val);
> +    }

Could you remind me again please why posted format updates need
to use cmpxchg16b, but remapped format ones don't? (As a aside,
with the code structure as you have it you should move the old_irte
declaration here, or omit it altogether, as you could as well pass
*iremap_entry directly afaict.)

> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
>  
>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>                  : hpet_to_drhd(msi_desc->hpet_id);
> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
> +                                         msi_desc, msg, NULL, 0)

Is this unconditional passing of NULL here really correct?

> @@ -946,17 +947,12 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>      const uint8_t gvec)
>  {
>      struct irq_desc *desc;
> -    const struct msi_desc *msi_desc;
> -    int remap_index;
> +    struct msi_desc *msi_desc;
>      int rc = 0;
>      const struct pci_dev *pci_dev;
>      const struct acpi_drhd_unit *drhd;
>      struct iommu *iommu;
> -    struct ir_ctrl *ir_ctrl;
> -    struct iremap_entry *iremap_entries = NULL, *p = NULL;
> -    struct iremap_entry new_ire, old_ire;
>      const struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> -    __uint128_t ret;
>  
>      desc = pirq_spin_lock_irq_desc(pirq, NULL);
>      if ( !desc )
> @@ -976,8 +972,6 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>          goto unlock_out;
>      }
>  
> -    remap_index = msi_desc->remap_index;
> -
>      spin_unlock_irq(&desc->lock);
>  
>      ASSERT(pcidevs_locked());
> @@ -992,35 +986,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>          return -ENODEV;
>  
>      iommu = drhd->iommu;
> -    ir_ctrl = iommu_ir_ctrl(iommu);
> -    if ( !ir_ctrl )
> +    if ( !iommu_ir_ctrl(iommu) )
>          return -ENODEV;
>  
> -    spin_lock_irq(&ir_ctrl->iremap_lock);
> -
> -    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
> -
> -    old_ire = *p;
> -
> -    /* Setup/Update interrupt remapping table entry. */
> -    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
> -    ret = cmpxchg16b(p, &old_ire, &new_ire);
> -
> -    /*
> -     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
> -     * and the hardware cannot update the IRTE behind us, so the return value
> -     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
> -     */
> -    ASSERT(ret == old_ire.val);
> -
> -    iommu_flush_cache_entry(p, sizeof(*p));
> -    iommu_flush_iec_index(iommu, 0, remap_index);
> -
> -    unmap_vtd_domain_page(iremap_entries);
> -
> -    spin_unlock_irq(&ir_ctrl->iremap_lock);
> -
> -    return 0;
> +    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
> +                                  pi_desc, gvec);

There are few changes here which appear to have the potential of
affecting behavior: Previously you didn't alter msi_desc or the MSI
message contained therein (as documented by the pointer having
been const). Is this possible updating of message and remap index
really benign? In any event any such changes should be reasoned
about in the commit message.

Jan
Chao Gao Feb. 22, 2017, 1:53 a.m. UTC | #4
On Tue, Nov 22, 2016 at 05:58:56PM +0800, Jan Beulich wrote:
>>>> On 18.11.16 at 02:57, <feng.wu@intel.com> wrote:
>> @@ -597,31 +598,50 @@ static int msi_msg_to_remap_entry(
>
>Considering you basically re-do most of the function, I think there's
>some more adjustment necessary (or at least very desirable) here.
>
>>
>>      memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
>>
>> -    /* Set interrupt remapping table entry */
>> -    new_ire.remap.fpd = 0;
>> -    new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
>> -    new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
>> -    new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
>> -    /* Hardware require RH = 1 for LPR delivery mode */
>> -    new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
>> -    new_ire.remap.avail = 0;
>> -    new_ire.remap.res_1 = 0;
>> -    new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
>> -                            MSI_DATA_VECTOR_MASK;
>> -    new_ire.remap.res_2 = 0;
>> -    if ( x2apic_enabled )
>> -        new_ire.remap.dst = msg->dest32;
>> +    if ( !pi_desc )
>> +    {
>> +        /* Set interrupt remapping table entry */
>> +        new_ire.remap.fpd = 0;
>> +        new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
>> +        new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
>
>These two "& 0x1" seem unnecessary, as the fields are 1 bit only
>anyway.
>

Ok, will remove them.

>> +        new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
>
>This masking, however, has always been puzzling me: dlm is a 3 bit
>field, and the MSI message field is a 3-bit one too. Hence the
>masking should also be dropped here - if the message field is wrong
>(i.e. holding an unsupported value), there's no good reason to try
>to compensate for it here. If at all the function should refuse to do
>the requested translation.
>
>> +        /* Hardware require RH = 1 for LPR delivery mode */
>> +        new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
>> +        new_ire.remap.avail = 0;
>> +        new_ire.remap.res_1 = 0;
>> +        new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
>> +                                MSI_DATA_VECTOR_MASK;
>> +        new_ire.remap.res_2 = 0;
>> +        if ( x2apic_enabled )
>> +            new_ire.remap.dst = msg->dest32;
>> +        else
>> +            new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
>> +                                 & 0xff) << 8;
>> +
>> +        new_ire.remap.res_3 = 0;
>> +        new_ire.remap.res_4 = 0;
>> +        new_ire.remap.p = 1;    /* finally, set present bit */
>
>I don't understand this comment. The order of operations does not
>matter at all, and in fact you now set p before being done with all
>other fields. Please drop such misleading comments, or make them
>useful/correct.

Yes, The order does not matter. I will drop this comment.

>
>Also, the only field you don't explicitly set here is .im. Why?
>
>> +    }
>>      else
>> -        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
>> -                             & 0xff) << 8;
>> +    {
>> +        new_ire.post.fpd = 0;
>> +        new_ire.post.res_1 = 0;
>> +        new_ire.post.res_2 = 0;
>> +        new_ire.post.urg = 0;
>> +        new_ire.post.im = 1;
>> +        new_ire.post.vector = gvec;
>> +        new_ire.post.res_3 = 0;
>> +        new_ire.post.res_4 = 0;
>> +        new_ire.post.res_5 = 0;
>> +        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
>> +        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
>> +        new_ire.post.p = 1;    /* finally, set present bit */
>> +    }
>
>Along the lines of the comment above, you don't fill avail here. Why?
>
>Taking both together, I don't see why - after adding said initialization -
>new_ire needs to start out as a copy of the live IRTE. Instead you
>could memset() the whole structure to zero, or simply give it an empty
>initializer (saving you from initializing all the reserved fields, plus some
>more).
>

Yes, it is really confused. Maybe because this field is available to software in posting
format IRTE. We can reserve the avail field through introducing a flag to
distinguish initialization from update. We also can clear the avail field every
time if we don't use it right now, leaving the problem to others who want use
the avail field.  Do you think which is better?

>And of course, along the lines of ...
>
>>      if ( pdev )
>>          set_msi_source_id(pdev, &new_ire);
>>      else
>>          set_hpet_source_id(msi_desc->hpet_id, &new_ire);
>
>... this, I see little reason for common fields to be initialized separately
>on each path. According to the code above that's only p (leaving
>aside fields which get zeroed), but anyway. Perhaps there should
>even be a common sub-structure of the union ...

I can add a patch in this series to do this.

>
>> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>>      remap_rte->address_hi = 0;
>>      remap_rte->data = index - i;
>>
>> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>> +    if ( !pi_desc )
>> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>> +    else
>> +    {
>> +        __uint128_t ret;
>> +
>> +        old_ire = *iremap_entry;
>> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
>> +
>> +        /*
>> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>> +         * and the hardware cannot update the IRTE behind us, so the return value
>> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>> +         */
>> +        ASSERT(ret == old_ire.val);
>> +    }
>
>Could you remind me again please why posted format updates need
>to use cmpxchg16b, but remapped format ones don't? (As a aside,
>with the code structure as you have it you should move the old_irte
>declaration here, or omit it altogether, as you could as well pass
>*iremap_entry directly afaict.)

Before feng left, I have asked him about this question. He told me that
the PDA field of posted format IRTE comprises of two parts:
Posted Descritor Address High[127:96] and Low [63:38]. If we want to update
PDA field, do it atomically or disable-update-enable. He also said, it had
been confirmed that cmpxchg16b was supported on all intel platform with VT-d PI.
If we assume that updates to remapped format IRTE only is to update either
64 bit or high 64 bit (except initialition), two 64bit memory write operations
is enough. 

>
>> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
>>
>>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>>                  : hpet_to_drhd(msi_desc->hpet_id);
>> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
>> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
>> +                                         msi_desc, msg, NULL, 0)
>
>Is this unconditional passing of NULL here really correct?

Since two parameters are added to this function, we should think about what
the function does again. the last 2 parameters are optional.

If they are not present, just means a physical device driver changes its msi
message. So it notifies iommu to do some changes to IRTE accordingly (the driver doesn't
know the format of the live IRTE). This is the case above.

If they are present, it means the msi should be delivered to the vcpu with the
vector num. To achieve that, the function replaces the old IRTE with a new
posted format IRTE.

>
>> @@ -946,17 +947,12 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>>      const uint8_t gvec)
>>  {
>>      struct irq_desc *desc;
>> -    const struct msi_desc *msi_desc;
>> -    int remap_index;
>> +    struct msi_desc *msi_desc;
>>      int rc = 0;
>>      const struct pci_dev *pci_dev;
>>      const struct acpi_drhd_unit *drhd;
>>      struct iommu *iommu;
>> -    struct ir_ctrl *ir_ctrl;
>> -    struct iremap_entry *iremap_entries = NULL, *p = NULL;
>> -    struct iremap_entry new_ire, old_ire;
>>      const struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>> -    __uint128_t ret;
>>
>>      desc = pirq_spin_lock_irq_desc(pirq, NULL);
>>      if ( !desc )
>> @@ -976,8 +972,6 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>>          goto unlock_out;
>>      }
>>
>> -    remap_index = msi_desc->remap_index;
>> -
>>      spin_unlock_irq(&desc->lock);
>>
>>      ASSERT(pcidevs_locked());
>> @@ -992,35 +986,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>>          return -ENODEV;
>>
>>      iommu = drhd->iommu;
>> -    ir_ctrl = iommu_ir_ctrl(iommu);
>> -    if ( !ir_ctrl )
>> +    if ( !iommu_ir_ctrl(iommu) )
>>          return -ENODEV;
>>
>> -    spin_lock_irq(&ir_ctrl->iremap_lock);
>> -
>> -    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
>> -
>> -    old_ire = *p;
>> -
>> -    /* Setup/Update interrupt remapping table entry. */
>> -    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
>> -    ret = cmpxchg16b(p, &old_ire, &new_ire);
>> -
>> -    /*
>> -     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>> -     * and the hardware cannot update the IRTE behind us, so the return value
>> -     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>> -     */
>> -    ASSERT(ret == old_ire.val);
>> -
>> -    iommu_flush_cache_entry(p, sizeof(*p));
>> -    iommu_flush_iec_index(iommu, 0, remap_index);
>> -
>> -    unmap_vtd_domain_page(iremap_entries);
>> -
>> -    spin_unlock_irq(&ir_ctrl->iremap_lock);
>> -
>> -    return 0;
>> +    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
>> +                                  pi_desc, gvec);
>
>There are few changes here which appear to have the potential of
>affecting behavior: Previously you didn't alter msi_desc or the MSI
>message contained therein (as documented by the pointer having
>been const). Is this possible updating of message and remap index
>really benign? In any event any such changes should be reasoned
>about in the commit message.

I agree that we can't update message and remap index in this pi_update_irte.
but msi_msg_to_remap_entry will change msi_desc when msi_desc->remap_index < 0.
How about splitting part of msi_msg_to_remap_entry to a new function which consumes a const
msi_desc parameter and pi_update_irte will call the new function?

>
>Jan
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xen.org
>https://lists.xen.org/xen-devel
Chao Gao Feb. 22, 2017, 7:12 a.m. UTC | #5
On Wed, Feb 22, 2017 at 02:10:25AM -0700, Jan Beulich wrote:
>>>> On 22.02.17 at 02:53, <chao.gao@intel.com> wrote:
>> On Tue, Nov 22, 2016 at 05:58:56PM +0800, Jan Beulich wrote:
>>>> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>>>>      remap_rte->address_hi = 0;
>>>>      remap_rte->data = index - i;
>>>>
>>>> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>>> +    if ( !pi_desc )
>>>> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>>> +    else
>>>> +    {
>>>> +        __uint128_t ret;
>>>> +
>>>> +        old_ire = *iremap_entry;
>>>> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
>>>> +
>>>> +        /*
>>>> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>>>> +         * and the hardware cannot update the IRTE behind us, so the return value
>>>> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>>>> +         */
>>>> +        ASSERT(ret == old_ire.val);
>>>> +    }
>>>
>>>Could you remind me again please why posted format updates need
>>>to use cmpxchg16b, but remapped format ones don't? (As a aside,
>>>with the code structure as you have it you should move the old_irte
>>>declaration here, or omit it altogether, as you could as well pass
>>>*iremap_entry directly afaict.)
>> 
>> Before feng left, I have asked him about this question. He told me that
>> the PDA field of posted format IRTE comprises of two parts:
>> Posted Descritor Address High[127:96] and Low [63:38]. If we want to update
>> PDA field, do it atomically or disable-update-enable. He also said, it had
>> been confirmed that cmpxchg16b was supported on all intel platform with VT-d PI.
>> If we assume that updates to remapped format IRTE only is to update either
>> 64 bit or high 64 bit (except initialition), two 64bit memory write operations
>> is enough. 
>
>Two 64-bit memory write operations? Where do you see them? I
>only see memcpy(), which for the purposes of the code here is
>supposed to be a black box.

Ok. I made a mistake here. In ioapic case, before update IRTE, the according
IOAPIC RTE is masked. So, using a memcpy() is safe. In msi case, there is no
mask operation. I think only using a memcpy() is unsafe. Do you think so?

>
>>>> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
>>>>
>>>>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>>>>                  : hpet_to_drhd(msi_desc->hpet_id);
>>>> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
>>>> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
>>>> +                                         msi_desc, msg, NULL, 0)
>>>
>>>Is this unconditional passing of NULL here really correct?
>> 
>> Since two parameters are added to this function, we should think about what
>> the function does again. the last 2 parameters are optional.
>> 
>> If they are not present, just means a physical device driver changes its msi
>> message. So it notifies iommu to do some changes to IRTE accordingly (the driver doesn't
>> know the format of the live IRTE). This is the case above.
>> 
>> If they are present, it means the msi should be delivered to the vcpu with the
>> vector num. To achieve that, the function replaces the old IRTE with a new
>> posted format IRTE.
>
>I don't see how this answers my question. In fact it feels like you,
>just like Feng, are making assumptions on the conditions under
>which the function here is being called _at present_. You should,
>however, make the function work correctly for all possible uses,
>or add ASSERT()s to clearly expose issues with potential new,
>future callers.

Ok. Your suggestion is very good. 
Every caller tells the function to construct a centain format IRTE. Return to
your question, I think it is defintely wrong to pass NULL unconditionally.
We should  pass NULL before the msi is binded to a guest interrupt and pass the
destination vcpu and vector num after that. At this moment, I can't come up
with a way to check whether the msi is binded to a guest interrupt and get the
destination vcpu and vector num only through the struct pci_dev and struct
msi_desc. Could you give me some suggestion on that or recommend a structure,
you think, in which we can add some fields to record these information?  

Thanks,
Chao

>
>>>> @@ -992,35 +986,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>>>>          return -ENODEV;
>>>>
>>>>      iommu = drhd->iommu;
>>>> -    ir_ctrl = iommu_ir_ctrl(iommu);
>>>> -    if ( !ir_ctrl )
>>>> +    if ( !iommu_ir_ctrl(iommu) )
>>>>          return -ENODEV;
>>>>
>>>> -    spin_lock_irq(&ir_ctrl->iremap_lock);
>>>> -
>>>> -    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
>>>> -
>>>> -    old_ire = *p;
>>>> -
>>>> -    /* Setup/Update interrupt remapping table entry. */
>>>> -    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
>>>> -    ret = cmpxchg16b(p, &old_ire, &new_ire);
>>>> -
>>>> -    /*
>>>> -     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>>>> -     * and the hardware cannot update the IRTE behind us, so the return value
>>>> -     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>>>> -     */
>>>> -    ASSERT(ret == old_ire.val);
>>>> -
>>>> -    iommu_flush_cache_entry(p, sizeof(*p));
>>>> -    iommu_flush_iec_index(iommu, 0, remap_index);
>>>> -
>>>> -    unmap_vtd_domain_page(iremap_entries);
>>>> -
>>>> -    spin_unlock_irq(&ir_ctrl->iremap_lock);
>>>> -
>>>> -    return 0;
>>>> +    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
>>>> +                                  pi_desc, gvec);
>>>
>>>There are few changes here which appear to have the potential of
>>>affecting behavior: Previously you didn't alter msi_desc or the MSI
>>>message contained therein (as documented by the pointer having
>>>been const). Is this possible updating of message and remap index
>>>really benign? In any event any such changes should be reasoned
>>>about in the commit message.
>> 
>> I agree that we can't update message and remap index in this pi_update_irte.
>> but msi_msg_to_remap_entry will change msi_desc when msi_desc->remap_index < 0.
>> How about splitting part of msi_msg_to_remap_entry to a new function which consumes a const
>> msi_desc parameter and pi_update_irte will call the new function?
>
>Well, I can't easily say yes or no here without seeing what the
>result would be. Give it a try, and we'll look at the result in v9.
>
>Jan
Jan Beulich Feb. 22, 2017, 9:10 a.m. UTC | #6
>>> On 22.02.17 at 02:53, <chao.gao@intel.com> wrote:
> On Tue, Nov 22, 2016 at 05:58:56PM +0800, Jan Beulich wrote:
>>>>> On 18.11.16 at 02:57, <feng.wu@intel.com> wrote:
>>>      else
>>> -        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
>>> -                             & 0xff) << 8;
>>> +    {
>>> +        new_ire.post.fpd = 0;
>>> +        new_ire.post.res_1 = 0;
>>> +        new_ire.post.res_2 = 0;
>>> +        new_ire.post.urg = 0;
>>> +        new_ire.post.im = 1;
>>> +        new_ire.post.vector = gvec;
>>> +        new_ire.post.res_3 = 0;
>>> +        new_ire.post.res_4 = 0;
>>> +        new_ire.post.res_5 = 0;
>>> +        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
>>> +        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
>>> +        new_ire.post.p = 1;    /* finally, set present bit */
>>> +    }
>>
>>Along the lines of the comment above, you don't fill avail here. Why?
>>
>>Taking both together, I don't see why - after adding said initialization -
>>new_ire needs to start out as a copy of the live IRTE. Instead you
>>could memset() the whole structure to zero, or simply give it an empty
>>initializer (saving you from initializing all the reserved fields, plus some
>>more).
>>
> 
> Yes, it is really confused. Maybe because this field is available to software in posting
> format IRTE. We can reserve the avail field through introducing a flag to
> distinguish initialization from update. We also can clear the avail field every
> time if we don't use it right now, leaving the problem to others who want use
> the avail field.  Do you think which is better?

As its unused, clear it every time. We simply don't know how a
potential use would want it set during update.

>>> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>>>      remap_rte->address_hi = 0;
>>>      remap_rte->data = index - i;
>>>
>>> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>> +    if ( !pi_desc )
>>> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>> +    else
>>> +    {
>>> +        __uint128_t ret;
>>> +
>>> +        old_ire = *iremap_entry;
>>> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
>>> +
>>> +        /*
>>> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>>> +         * and the hardware cannot update the IRTE behind us, so the return value
>>> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>>> +         */
>>> +        ASSERT(ret == old_ire.val);
>>> +    }
>>
>>Could you remind me again please why posted format updates need
>>to use cmpxchg16b, but remapped format ones don't? (As a aside,
>>with the code structure as you have it you should move the old_irte
>>declaration here, or omit it altogether, as you could as well pass
>>*iremap_entry directly afaict.)
> 
> Before feng left, I have asked him about this question. He told me that
> the PDA field of posted format IRTE comprises of two parts:
> Posted Descritor Address High[127:96] and Low [63:38]. If we want to update
> PDA field, do it atomically or disable-update-enable. He also said, it had
> been confirmed that cmpxchg16b was supported on all intel platform with VT-d PI.
> If we assume that updates to remapped format IRTE only is to update either
> 64 bit or high 64 bit (except initialition), two 64bit memory write operations
> is enough. 

Two 64-bit memory write operations? Where do you see them? I
only see memcpy(), which for the purposes of the code here is
supposed to be a black box.

>>> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
>>>
>>>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>>>                  : hpet_to_drhd(msi_desc->hpet_id);
>>> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
>>> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
>>> +                                         msi_desc, msg, NULL, 0)
>>
>>Is this unconditional passing of NULL here really correct?
> 
> Since two parameters are added to this function, we should think about what
> the function does again. the last 2 parameters are optional.
> 
> If they are not present, just means a physical device driver changes its msi
> message. So it notifies iommu to do some changes to IRTE accordingly (the driver doesn't
> know the format of the live IRTE). This is the case above.
> 
> If they are present, it means the msi should be delivered to the vcpu with the
> vector num. To achieve that, the function replaces the old IRTE with a new
> posted format IRTE.

I don't see how this answers my question. In fact it feels like you,
just like Feng, are making assumptions on the conditions under
which the function here is being called _at present_. You should,
however, make the function work correctly for all possible uses,
or add ASSERT()s to clearly expose issues with potential new,
future callers.

>>> @@ -992,35 +986,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
>>>          return -ENODEV;
>>>
>>>      iommu = drhd->iommu;
>>> -    ir_ctrl = iommu_ir_ctrl(iommu);
>>> -    if ( !ir_ctrl )
>>> +    if ( !iommu_ir_ctrl(iommu) )
>>>          return -ENODEV;
>>>
>>> -    spin_lock_irq(&ir_ctrl->iremap_lock);
>>> -
>>> -    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
>>> -
>>> -    old_ire = *p;
>>> -
>>> -    /* Setup/Update interrupt remapping table entry. */
>>> -    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
>>> -    ret = cmpxchg16b(p, &old_ire, &new_ire);
>>> -
>>> -    /*
>>> -     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>>> -     * and the hardware cannot update the IRTE behind us, so the return value
>>> -     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>>> -     */
>>> -    ASSERT(ret == old_ire.val);
>>> -
>>> -    iommu_flush_cache_entry(p, sizeof(*p));
>>> -    iommu_flush_iec_index(iommu, 0, remap_index);
>>> -
>>> -    unmap_vtd_domain_page(iremap_entries);
>>> -
>>> -    spin_unlock_irq(&ir_ctrl->iremap_lock);
>>> -
>>> -    return 0;
>>> +    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
>>> +                                  pi_desc, gvec);
>>
>>There are few changes here which appear to have the potential of
>>affecting behavior: Previously you didn't alter msi_desc or the MSI
>>message contained therein (as documented by the pointer having
>>been const). Is this possible updating of message and remap index
>>really benign? In any event any such changes should be reasoned
>>about in the commit message.
> 
> I agree that we can't update message and remap index in this pi_update_irte.
> but msi_msg_to_remap_entry will change msi_desc when msi_desc->remap_index < 0.
> How about splitting part of msi_msg_to_remap_entry to a new function which consumes a const
> msi_desc parameter and pi_update_irte will call the new function?

Well, I can't easily say yes or no here without seeing what the
result would be. Give it a try, and we'll look at the result in v9.

Jan
Jan Beulich Feb. 22, 2017, 2:54 p.m. UTC | #7
>>> On 22.02.17 at 08:12, <chao.gao@intel.com> wrote:
> On Wed, Feb 22, 2017 at 02:10:25AM -0700, Jan Beulich wrote:
>>>>> On 22.02.17 at 02:53, <chao.gao@intel.com> wrote:
>>> On Tue, Nov 22, 2016 at 05:58:56PM +0800, Jan Beulich wrote:
>>>>> @@ -637,7 +657,23 @@ static int msi_msg_to_remap_entry(
>>>>>      remap_rte->address_hi = 0;
>>>>>      remap_rte->data = index - i;
>>>>>
>>>>> -    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>>>> +    if ( !pi_desc )
>>>>> +        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>>>>> +    else
>>>>> +    {
>>>>> +        __uint128_t ret;
>>>>> +
>>>>> +        old_ire = *iremap_entry;
>>>>> +        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
>>>>> +
>>>>> +        /*
>>>>> +         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
>>>>> +         * and the hardware cannot update the IRTE behind us, so the return value
>>>>> +         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
>>>>> +         */
>>>>> +        ASSERT(ret == old_ire.val);
>>>>> +    }
>>>>
>>>>Could you remind me again please why posted format updates need
>>>>to use cmpxchg16b, but remapped format ones don't? (As a aside,
>>>>with the code structure as you have it you should move the old_irte
>>>>declaration here, or omit it altogether, as you could as well pass
>>>>*iremap_entry directly afaict.)
>>> 
>>> Before feng left, I have asked him about this question. He told me that
>>> the PDA field of posted format IRTE comprises of two parts:
>>> Posted Descritor Address High[127:96] and Low [63:38]. If we want to update
>>> PDA field, do it atomically or disable-update-enable. He also said, it had
>>> been confirmed that cmpxchg16b was supported on all intel platform with VT-d PI.
>>> If we assume that updates to remapped format IRTE only is to update either
>>> 64 bit or high 64 bit (except initialition), two 64bit memory write operations
>>> is enough. 
>>
>>Two 64-bit memory write operations? Where do you see them? I
>>only see memcpy(), which for the purposes of the code here is
>>supposed to be a black box.
> 
> Ok. I made a mistake here. In ioapic case, before update IRTE, the according
> IOAPIC RTE is masked. So, using a memcpy() is safe. In msi case, there is no
> mask operation. I think only using a memcpy() is unsafe. Do you think so?

memcpy() is unsafe in any event. I was actually trying to
understand why it's not always cmpxchg16b that gets used
here.

>>>>> @@ -668,7 +704,8 @@ int msi_msg_write_remap_rte(
>>>>>
>>>>>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>>>>>                  : hpet_to_drhd(msi_desc->hpet_id);
>>>>> -    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
>>>>> +    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
>>>>> +                                         msi_desc, msg, NULL, 0)
>>>>
>>>>Is this unconditional passing of NULL here really correct?
>>> 
>>> Since two parameters are added to this function, we should think about what
>>> the function does again. the last 2 parameters are optional.
>>> 
>>> If they are not present, just means a physical device driver changes its msi
>>> message. So it notifies iommu to do some changes to IRTE accordingly (the driver doesn't
>>> know the format of the live IRTE). This is the case above.
>>> 
>>> If they are present, it means the msi should be delivered to the vcpu with the
>>> vector num. To achieve that, the function replaces the old IRTE with a new
>>> posted format IRTE.
>>
>>I don't see how this answers my question. In fact it feels like you,
>>just like Feng, are making assumptions on the conditions under
>>which the function here is being called _at present_. You should,
>>however, make the function work correctly for all possible uses,
>>or add ASSERT()s to clearly expose issues with potential new,
>>future callers.
> 
> Ok. Your suggestion is very good. 
> Every caller tells the function to construct a centain format IRTE. Return to
> your question, I think it is defintely wrong to pass NULL unconditionally.
> We should  pass NULL before the msi is binded to a guest interrupt and pass the
> destination vcpu and vector num after that. At this moment, I can't come up
> with a way to check whether the msi is binded to a guest interrupt and get the
> destination vcpu and vector num only through the struct pci_dev and struct
> msi_desc. Could you give me some suggestion on that or recommend a structure,
> you think, in which we can add some fields to record these information?  

I would hope all information is already available (i.e. stored
somewhere). And I have to admit that it is increasingly annoying
to have to repeatedly, after many weeks of silence, swap back
in all the context here. In particular, I don't have the original
patch in my mailbox anymore (it got purged automatically as it
seems), so recovering context is even more time consuming.

And then, pi_update_irte() is being called when the interrupt
gets bound. gvec, if it changed, would require a rebind, so the
request would again come that route. Perhaps the function
here simply needs to fail if undue changes are being requested
without all necessary data? As suggested before, even adding
ASSERT()s for places where you make assumptions on current
caller behavior would be okay. The only thing that's not okay in
my opinion is to make implicit/hidden assumptions.

Jan
diff mbox

Patch

diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c
index bfd468b..fd2a49a 100644
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -420,7 +420,7 @@  void io_apic_write_remap_rte(
         __ioapic_write_entry(apic, ioapic_pin, 1, old_rte);
 }
 
-static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire)
+static void set_msi_source_id(const struct pci_dev *pdev, struct iremap_entry *ire)
 {
     u16 seg;
     u8 bus, devfn, secbus;
@@ -548,11 +548,12 @@  static int remap_entry_to_msi_msg(
 }
 
 static int msi_msg_to_remap_entry(
-    struct iommu *iommu, struct pci_dev *pdev,
-    struct msi_desc *msi_desc, struct msi_msg *msg)
+    struct iommu *iommu, const struct pci_dev *pdev,
+    struct msi_desc *msi_desc, struct msi_msg *msg,
+    const struct pi_desc *pi_desc, const uint8_t gvec)
 {
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
-    struct iremap_entry new_ire;
+    struct iremap_entry new_ire, old_ire;
     struct msi_msg_remap_entry *remap_rte;
     unsigned int index, i, nr = 1;
     unsigned long flags;
@@ -597,31 +598,50 @@  static int msi_msg_to_remap_entry(
 
     memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
 
-    /* Set interrupt remapping table entry */
-    new_ire.remap.fpd = 0;
-    new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
-    new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
-    new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
-    /* Hardware require RH = 1 for LPR delivery mode */
-    new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
-    new_ire.remap.avail = 0;
-    new_ire.remap.res_1 = 0;
-    new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
-                            MSI_DATA_VECTOR_MASK;
-    new_ire.remap.res_2 = 0;
-    if ( x2apic_enabled )
-        new_ire.remap.dst = msg->dest32;
+    if ( !pi_desc )
+    {
+        /* Set interrupt remapping table entry */
+        new_ire.remap.fpd = 0;
+        new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
+        new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
+        new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1;
+        /* Hardware require RH = 1 for LPR delivery mode */
+        new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio);
+        new_ire.remap.avail = 0;
+        new_ire.remap.res_1 = 0;
+        new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) &
+                                MSI_DATA_VECTOR_MASK;
+        new_ire.remap.res_2 = 0;
+        if ( x2apic_enabled )
+            new_ire.remap.dst = msg->dest32;
+        else
+            new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
+                                 & 0xff) << 8;
+
+        new_ire.remap.res_3 = 0;
+        new_ire.remap.res_4 = 0;
+        new_ire.remap.p = 1;    /* finally, set present bit */
+    }
     else
-        new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT)
-                             & 0xff) << 8;
+    {
+        new_ire.post.fpd = 0;
+        new_ire.post.res_1 = 0;
+        new_ire.post.res_2 = 0;
+        new_ire.post.urg = 0;
+        new_ire.post.im = 1;
+        new_ire.post.vector = gvec;
+        new_ire.post.res_3 = 0;
+        new_ire.post.res_4 = 0;
+        new_ire.post.res_5 = 0;
+        new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
+        new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32;
+        new_ire.post.p = 1;    /* finally, set present bit */
+    }
 
     if ( pdev )
         set_msi_source_id(pdev, &new_ire);
     else
         set_hpet_source_id(msi_desc->hpet_id, &new_ire);
-    new_ire.remap.res_3 = 0;
-    new_ire.remap.res_4 = 0;
-    new_ire.remap.p = 1;    /* finally, set present bit */
 
     /* now construct new MSI/MSI-X rte entry */
     remap_rte = (struct msi_msg_remap_entry *)msg;
@@ -637,7 +657,23 @@  static int msi_msg_to_remap_entry(
     remap_rte->address_hi = 0;
     remap_rte->data = index - i;
 
-    memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    if ( !pi_desc )
+        memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    else
+    {
+        __uint128_t ret;
+
+        old_ire = *iremap_entry;
+        ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire);
+
+        /*
+         * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
+         * and the hardware cannot update the IRTE behind us, so the return value
+         * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
+         */
+        ASSERT(ret == old_ire.val);
+    }
+
     iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry));
     iommu_flush_iec_index(iommu, 0, index);
 
@@ -668,7 +704,8 @@  int msi_msg_write_remap_rte(
 
     drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
                 : hpet_to_drhd(msi_desc->hpet_id);
-    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg)
+    return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev,
+                                         msi_desc, msg, NULL, 0)
                 : -EINVAL;
 }
 
@@ -902,42 +939,6 @@  void iommu_disable_x2apic_IR(void)
         disable_qinval(drhd->iommu);
 }
 
-static void setup_posted_irte(
-    struct iremap_entry *new_ire, const struct iremap_entry *old_ire,
-    const struct pi_desc *pi_desc, const uint8_t gvec)
-{
-    memset(new_ire, 0, sizeof(*new_ire));
-
-    /*
-     * 'im' filed decides whether the irte is in posted format (with value 1)
-     * or remapped format (with value 0), if the old irte is in remapped format,
-     * we copy things from remapped part in 'struct iremap_entry', otherwise,
-     * we copy from posted part.
-     */
-    if ( !old_ire->remap.im )
-    {
-        new_ire->post.p = old_ire->remap.p;
-        new_ire->post.fpd = old_ire->remap.fpd;
-        new_ire->post.sid = old_ire->remap.sid;
-        new_ire->post.sq = old_ire->remap.sq;
-        new_ire->post.svt = old_ire->remap.svt;
-    }
-    else
-    {
-        new_ire->post.p = old_ire->post.p;
-        new_ire->post.fpd = old_ire->post.fpd;
-        new_ire->post.sid = old_ire->post.sid;
-        new_ire->post.sq = old_ire->post.sq;
-        new_ire->post.svt = old_ire->post.svt;
-        new_ire->post.urg = old_ire->post.urg;
-    }
-
-    new_ire->post.im = 1;
-    new_ire->post.vector = gvec;
-    new_ire->post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT);
-    new_ire->post.pda_h = virt_to_maddr(pi_desc) >> 32;
-}
-
 /*
  * This function is used to update the IRTE for posted-interrupt
  * when guest changes MSI/MSI-X information.
@@ -946,17 +947,12 @@  int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
     const uint8_t gvec)
 {
     struct irq_desc *desc;
-    const struct msi_desc *msi_desc;
-    int remap_index;
+    struct msi_desc *msi_desc;
     int rc = 0;
     const struct pci_dev *pci_dev;
     const struct acpi_drhd_unit *drhd;
     struct iommu *iommu;
-    struct ir_ctrl *ir_ctrl;
-    struct iremap_entry *iremap_entries = NULL, *p = NULL;
-    struct iremap_entry new_ire, old_ire;
     const struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
-    __uint128_t ret;
 
     desc = pirq_spin_lock_irq_desc(pirq, NULL);
     if ( !desc )
@@ -976,8 +972,6 @@  int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
         goto unlock_out;
     }
 
-    remap_index = msi_desc->remap_index;
-
     spin_unlock_irq(&desc->lock);
 
     ASSERT(pcidevs_locked());
@@ -992,35 +986,11 @@  int pi_update_irte(const struct vcpu *v, const struct pirq *pirq,
         return -ENODEV;
 
     iommu = drhd->iommu;
-    ir_ctrl = iommu_ir_ctrl(iommu);
-    if ( !ir_ctrl )
+    if ( !iommu_ir_ctrl(iommu) )
         return -ENODEV;
 
-    spin_lock_irq(&ir_ctrl->iremap_lock);
-
-    GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p);
-
-    old_ire = *p;
-
-    /* Setup/Update interrupt remapping table entry. */
-    setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec);
-    ret = cmpxchg16b(p, &old_ire, &new_ire);
-
-    /*
-     * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE,
-     * and the hardware cannot update the IRTE behind us, so the return value
-     * of cmpxchg16 should be the same as old_ire. This ASSERT validate it.
-     */
-    ASSERT(ret == old_ire.val);
-
-    iommu_flush_cache_entry(p, sizeof(*p));
-    iommu_flush_iec_index(iommu, 0, remap_index);
-
-    unmap_vtd_domain_page(iremap_entries);
-
-    spin_unlock_irq(&ir_ctrl->iremap_lock);
-
-    return 0;
+    return msi_msg_to_remap_entry(iommu, pci_dev, msi_desc, &msi_desc->msg,
+                                  pi_desc, gvec);
 
  unlock_out:
     spin_unlock_irq(&desc->lock);