[V3,28/29] x86/vvtd: Add queued invalidation (QI) support

Message ID 1506049330-11196-29-git-send-email-tianyu.lan@intel.com (mailing list archive)
State New, archived

Commit Message

Lan Tianyu Sept. 22, 2017, 3:02 a.m. UTC
From: Chao Gao <chao.gao@intel.com>

The Queued Invalidation (QI) interface is an expanded invalidation interface
with extended capabilities. Hardware implementations report support for the
queued invalidation interface through the Extended Capability Register. The
queued invalidation interface uses an Invalidation Queue (IQ), a circular
buffer in system memory; software submits commands by writing Invalidation
Descriptors to the IQ.

In this patch, a new function vvtd_process_iq() is used to emulate how
hardware handles invalidation requests through QI.
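
For reference, the queue bookkeeping being emulated amounts to the following
(a simplified sketch, not part of the patch; QINVAL_ENTRY_ORDER, DMA_IQA_QS()
and the IQH/IQT index encoding are as defined in the patch below, with iqa,
iqh and iqt being the raw register values):

    /*
     * IQH/IQT hold descriptor indexes shifted left by QINVAL_INDEX_SHIFT
     * (descriptors are 16 bytes wide); the queue holds
     * 2^(QINVAL_ENTRY_ORDER + QS) entries, with QS taken from IQA_REG.
     */
    uint64_t max_entry = 1ULL << (QINVAL_ENTRY_ORDER + DMA_IQA_QS(iqa));
    uint64_t head = MASK_EXTR(iqh, QINVAL_INDEX_MASK);
    uint64_t tail = MASK_EXTR(iqt, QINVAL_INDEX_MASK);

    while ( head != tail )              /* fetch and process descriptors */
        head = (head + 1) % max_entry;  /* wrapping at the end of the ring */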

Signed-off-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
 xen/drivers/passthrough/vtd/iommu.h |  19 ++-
 xen/drivers/passthrough/vtd/vvtd.c  | 232 ++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 1 deletion(-)

Comments

Roger Pau Monné Oct. 20, 2017, 11:20 a.m. UTC | #1
On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
> From: Chao Gao <chao.gao@intel.com>
> 
> The Queued Invalidation (QI) interface is an expanded invalidation interface
> with extended capabilities. Hardware implementations report support for the
> queued invalidation interface through the Extended Capability Register. The
> queued invalidation interface uses an Invalidation Queue (IQ), a circular
> buffer in system memory; software submits commands by writing Invalidation
> Descriptors to the IQ.
> 
> In this patch, a new function vvtd_process_iq() is used to emulate how
> hardware handles invalidation requests through QI.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
> ---
>  xen/drivers/passthrough/vtd/iommu.h |  19 ++-
>  xen/drivers/passthrough/vtd/vvtd.c  | 232 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 250 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
> index c69cd21..c2b83f1 100644
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -177,6 +177,21 @@
>  #define DMA_IRTA_S(val)         (val & 0xf)
>  #define DMA_IRTA_SIZE(val)      (1UL << (DMA_IRTA_S(val) + 1))
>  
> +/* IQA_REG */
> +#define DMA_IQA_ADDR(val)       (val & ~0xfffULL)
> +#define DMA_IQA_QS(val)         (val & 0x7)
> +#define DMA_IQA_RSVD            0xff8ULL
> +
> +/* IECTL_REG */
> +#define DMA_IECTL_IM_SHIFT 31
> +#define DMA_IECTL_IM            (1 << DMA_IECTL_IM_SHIFT)

Isn't this undefined behavior? It should be 1u.

You should consider using u for all the defines added.
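
E.g., with DMA_IECTL_IM_SHIFT being 31:

    (1 << 31)    /* undefined behavior: shifts a 1 into the sign bit of int */
    (1u << 31)   /* well defined: the shift is performed on unsigned int */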

> +#define DMA_IECTL_IP_SHIFT 30
> +#define DMA_IECTL_IP            (1 << DMA_IECTL_IP_SHIFT)
> +
> +/* ICS_REG */
> +#define DMA_ICS_IWC_SHIFT       0
> +#define DMA_ICS_IWC             (1 << DMA_ICS_IWC_SHIFT)
> +
>  /* PMEN_REG */
>  #define DMA_PMEN_EPM    (((u32)1) << 31)
>  #define DMA_PMEN_PRS    (((u32)1) << 0)
> @@ -211,7 +226,8 @@
>  #define DMA_FSTS_PPF (1U << DMA_FSTS_PPF_SHIFT)
>  #define DMA_FSTS_AFO (1U << 2)
>  #define DMA_FSTS_APF (1U << 3)
> -#define DMA_FSTS_IQE (1U << 4)
> +#define DMA_FSTS_IQE_SHIFT 4
> +#define DMA_FSTS_IQE (1U << DMA_FSTS_IQE_SHIFT)
>  #define DMA_FSTS_ICE (1U << 5)
>  #define DMA_FSTS_ITE (1U << 6)
>  #define DMA_FSTS_PRO_SHIFT 7
> @@ -562,6 +578,7 @@ struct qinval_entry {
>  
>  /* Queue invalidation head/tail shift */
>  #define QINVAL_INDEX_SHIFT 4
> +#define QINVAL_INDEX_MASK  0x7fff0ULL
>  
>  #define qinval_present(v) ((v).lo & 1)
>  #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
> index 55f7a46..668d0c9 100644
> --- a/xen/drivers/passthrough/vtd/vvtd.c
> +++ b/xen/drivers/passthrough/vtd/vvtd.c
> @@ -28,6 +28,7 @@
>  #include <asm/current.h>
>  #include <asm/event.h>
>  #include <asm/hvm/domain.h>
> +#include <asm/hvm/support.h>
>  #include <asm/io_apic.h>
>  #include <asm/page.h>
>  #include <asm/p2m.h>
> @@ -419,6 +420,177 @@ static int vvtd_record_fault(struct vvtd *vvtd,
>      return X86EMUL_OKAY;
>  }
>  
> +/*
> + * Process an invalidation descriptor. Currently, only two types of
> + * descriptors, the Interrupt Entry Cache Invalidation Descriptor and the
> + * Invalidation Wait Descriptor, are handled.
> + * @vvtd: the virtual vtd instance
> + * @i: the index of the invalidation descriptor to be processed
> + *
> + * Returns 0 on success, non-zero on failure.
> + */
> +static int process_iqe(struct vvtd *vvtd, int i)

unsigned int.

> +{
> +    uint64_t iqa;
> +    struct qinval_entry *qinval_page;
> +    int ret = 0;
> +
> +    iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +    qinval_page = map_guest_page(vvtd->domain, DMA_IQA_ADDR(iqa)>>PAGE_SHIFT);

PFN_DOWN instead of open coding the shift. Both can be initialized
at declaration. Also AFAICT iqa is only used once, so the local
variable is not needed.
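
I.e. something along the lines of:

    struct qinval_entry *qinval_page =
        map_guest_page(vvtd->domain,
                       PFN_DOWN(DMA_IQA_ADDR(vvtd_get_reg_quad(vvtd,
                                                               DMAR_IQA_REG))));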

> +    if ( IS_ERR(qinval_page) )
> +    {
> +        gdprintk(XENLOG_ERR, "Can't map guest IRT (rc %ld)",
> +                 PTR_ERR(qinval_page));
> +        return PTR_ERR(qinval_page);
> +    }
> +
> +    switch ( qinval_page[i].q.inv_wait_dsc.lo.type )
> +    {
> +    case TYPE_INVAL_WAIT:
> +        if ( qinval_page[i].q.inv_wait_dsc.lo.sw )
> +        {
> +            uint32_t data = qinval_page[i].q.inv_wait_dsc.lo.sdata;
> +            uint64_t addr = (qinval_page[i].q.inv_wait_dsc.hi.saddr << 2);

Unneeded parentheses.

> +
> +            ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
> +            if ( ret )
> +                vvtd_info("Failed to write status address");

Don't you need to return or do something here? (like raise some kind
of error?)

> +        }
> +
> +        /*
> +         * The following code generates an invalidation completion event
> +         * indicating the invalidation wait descriptor completion. Note that
> +         * the following code fragment is not tested properly.
> +         */
> +        if ( qinval_page[i].q.inv_wait_dsc.lo.iflag )
> +        {
> +            uint32_t ie_data, ie_addr;

Missing newline, but...

> +            if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
> +            {
> +                vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +                if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
> +                {
> +                    ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
> +                    ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
> +                    vvtd_generate_interrupt(vvtd, ie_addr, ie_data);

...you don't seem to need the two local variables; they are each used only
once.

> +                    vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +                }
> +            }
> +        }
> +        break;
> +
> +    case TYPE_INVAL_IEC:
> +        /*
> +         * Currently, no cache is preserved in the hypervisor. Only need to
> +         * update the pIRTEs which are modified in the binding process.
> +         */
> +        break;
> +
> +    default:
> +        goto error;

There's no reason to use a label that's only used by the default case.
Simply place the code from the error label here.
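
I.e.:

    default:
        unmap_guest_page((void *)qinval_page);
        gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
        domain_crash(vvtd->domain);
        return ret;
    }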

> +    }
> +
> +    unmap_guest_page((void*)qinval_page);
> +    return ret;
> +
> + error:
> +    unmap_guest_page((void*)qinval_page);
> +    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
> +    domain_crash(vvtd->domain);

Do you really need to crash the domain in such case?

> +    return ret;
> +}
> +
> +/*
> + * Process all the descriptors in the Invalidation Queue.
> + */
> +static void vvtd_process_iq(struct vvtd *vvtd)
> +{
> +    uint64_t iqh, iqt, iqa, max_entry, i;
> +    int err = 0;
> +
> +    /*
> +     * No new descriptor is fetched from the Invalidation Queue until
> +     * software clears the IQE field in the Fault Status Register
> +     */
> +    if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
> +        return;
> +
> +    iqh = vvtd_get_reg_quad(vvtd, DMAR_IQH_REG);
> +    iqt = vvtd_get_reg_quad(vvtd, DMAR_IQT_REG);
> +    iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +
> +    max_entry = 1 << (QINVAL_ENTRY_ORDER + DMA_IQA_QS(iqa));
> +    iqh = MASK_EXTR(iqh, QINVAL_INDEX_MASK);
> +    iqt = MASK_EXTR(iqt, QINVAL_INDEX_MASK);

This should be done above, when they are initialized.
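
I.e.:

    uint64_t iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
    uint64_t iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG),
                             QINVAL_INDEX_MASK);
    uint64_t iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG),
                             QINVAL_INDEX_MASK);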

> +
> +    ASSERT(iqt < max_entry);
> +    if ( iqh == iqt )
> +        return;
> +
> +    for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
> +    {
> +        err = process_iqe(vvtd, i);

process_iqe takes an int parameter, and here you are feeding it a
uint64_t.

> +        if ( err )
> +            break;
> +    }
> +    vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
> +
> +    /*
> +     * When IQE is set, IQH references the descriptor associated with the error.
> +     */
> +    if ( err )
> +        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
> +}
> +
> +static int vvtd_write_iqt(struct vvtd *vvtd, unsigned long val)

Not sure there's much point in making this function return int, when
all the return values are X86EMUL_OKAY.

Since val here is a register value AFAICT, I would rather prefer that you
use an explicit type size (uint64_t or uint32_t, as fits).
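
I.e.:

    static void vvtd_write_iqt(struct vvtd *vvtd, uint64_t val)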

> +{
> +    uint64_t max_entry, iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +
> +    if ( val & ~QINVAL_INDEX_MASK )
> +    {
> +        vvtd_info("Attempt to set reserved bits in Invalidation Queue Tail");
> +        return X86EMUL_OKAY;
> +    }
> +
> +    max_entry = 1 << (QINVAL_ENTRY_ORDER + DMA_IQA_QS(iqa));

1ull please.

> +    if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
> +    {
> +        vvtd_info("IQT: Value %lx exceeded supported max index.", val);
> +        return X86EMUL_OKAY;
> +    }
> +
> +    vvtd_set_reg_quad(vvtd, DMAR_IQT_REG, val);
> +    vvtd_process_iq(vvtd);
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static int vvtd_write_iqa(struct vvtd *vvtd, unsigned long val)

Same here, it seems like this function should return void, because the
current return value is meaningless, and same comment about 'val'
being uintXX_t.

> +{
> +    uint32_t cap = vvtd_get_reg(vvtd, DMAR_CAP_REG);
> +    unsigned int guest_max_addr_width = cap_mgaw(cap);
> +
> +    if ( val & (~((1ULL << guest_max_addr_width) - 1) | DMA_IQA_RSVD) )
> +    {
> +        vvtd_info("Attempt to set reserved bits in Invalidation Queue Address");
> +        return X86EMUL_OKAY;
> +    }
> +
> +    vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, val);
> +    return X86EMUL_OKAY;
> +}
> +
> +static int vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
> +{
> +    if ( val & DMA_ICS_IWC )
> +    {
> +        vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
> +        /*When IWC field is cleared, the IP field needs to be cleared */
             ^ missing space.

> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +    }
> +    return X86EMUL_OKAY;

This function wants to be void.

> +}
> +
>  static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>  {
>      /* Writing a 1 means clear fault */
> @@ -430,6 +602,30 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>      return X86EMUL_OKAY;
>  }
>  
> +static int vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)

void please.

> +{
> +    /*
> +     * Only DMA_IECTL_IM bit is writable. Generate pending event when unmask.
> +     */

Single line comments use /* ... */

> +    if ( !(val & DMA_IECTL_IM) )
> +    {
> +        /* Clear IM and clear IP */
> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> +        if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
> +        {
> +            uint32_t ie_data, ie_addr;
> +
> +            ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
> +            ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
> +            vvtd_generate_interrupt(vvtd, ie_addr, ie_data);

No need for ie_data and ie_addr.
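
I.e. simply:

    vvtd_generate_interrupt(vvtd, vvtd_get_reg(vvtd, DMAR_IEADDR_REG),
                            vvtd_get_reg(vvtd, DMAR_IEDATA_REG));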

> +        }
> +    }
> +    else
> +        vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> +
> +    return X86EMUL_OKAY;
> +}
> +
>  static int vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
>  {
>      /*
> @@ -476,6 +672,10 @@ static int vvtd_write_fsts(struct vvtd *vvtd, uint32_t val)
>      if ( !((vvtd_get_reg(vvtd, DMAR_FSTS_REG) & DMA_FSTS_FAULTS)) )
>          vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
>  
> +    /* Continue to deal with invalidation when IQE is clear */
> +    if ( !vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
> +        vvtd_process_iq(vvtd);
> +
>      return X86EMUL_OKAY;
>  }
>  
> @@ -611,6 +811,32 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
>          case DMAR_FECTL_REG:
>              return vvtd_write_fectl(vvtd, val);
>  
> +        case DMAR_IECTL_REG:
> +            return vvtd_write_iectl(vvtd, val);
> +
> +        case DMAR_ICS_REG:
> +            return vvtd_write_ics(vvtd, val);
> +
> +        case DMAR_IQT_REG:
> +            return vvtd_write_iqt(vvtd, (uint32_t)val);
> +
> +        case DMAR_IQA_REG:
> +        {
> +            uint32_t iqa_hi;
> +
> +            iqa_hi = vvtd_get_reg(vvtd, DMAR_IQA_REG_HI);

Initialization at declaration time, but since it's used only once, I
would rather prefer that you don't use a local variable at all.

> +            return vvtd_write_iqa(vvtd,
> +                                 (uint32_t)val | ((uint64_t)iqa_hi << 32));
> +        }
> +
> +        case DMAR_IQA_REG_HI:
> +        {
> +            uint32_t iqa_lo;
> +
> +            iqa_lo = vvtd_get_reg(vvtd, DMAR_IQA_REG);
> +            return vvtd_write_iqa(vvtd, (val << 32) | iqa_lo);

Same comment as above regarding iqa_lo.

Thanks, Roger.
Chao Gao Oct. 23, 2017, 7:50 a.m. UTC | #2
On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao.gao@intel.com>
>> 
>> The Queued Invalidation (QI) interface is an expanded invalidation interface
>> with extended capabilities. Hardware implementations report support for the
>> queued invalidation interface through the Extended Capability Register. The
>> queued invalidation interface uses an Invalidation Queue (IQ), a circular
>> buffer in system memory; software submits commands by writing Invalidation
>> Descriptors to the IQ.
>> 
>> In this patch, a new function vvtd_process_iq() is used to emulate how
>> hardware handles invalidation requests through QI.
>> 
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
>> ---
>> +static int process_iqe(struct vvtd *vvtd, int i)
>
>unsigned int.
>
>> +{
>> +    uint64_t iqa;
>> +    struct qinval_entry *qinval_page;
>> +    int ret = 0;
>> +
>> +    iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +    qinval_page = map_guest_page(vvtd->domain, DMA_IQA_ADDR(iqa)>>PAGE_SHIFT);
>
>PFN_DOWN instead of open coding the shift. Both can be initialized
>at declaration. Also AFAICT iqa is only used once, so the local
>variable is not needed.
>
>> +    if ( IS_ERR(qinval_page) )
>> +    {
>> +        gdprintk(XENLOG_ERR, "Can't map guest IRT (rc %ld)",
>> +                 PTR_ERR(qinval_page));
>> +        return PTR_ERR(qinval_page);
>> +    }
>> +
>> +    switch ( qinval_page[i].q.inv_wait_dsc.lo.type )
>> +    {
>> +    case TYPE_INVAL_WAIT:
>> +        if ( qinval_page[i].q.inv_wait_dsc.lo.sw )
>> +        {
>> +            uint32_t data = qinval_page[i].q.inv_wait_dsc.lo.sdata;
>> +            uint64_t addr = (qinval_page[i].q.inv_wait_dsc.hi.saddr << 2);
>
>Unneeded parentheses.
>
>> +
>> +            ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
>> +            if ( ret )
>> +                vvtd_info("Failed to write status address");
>
>Don't you need to return or do something here? (like raise some kind
>of error?)

The 'addr' is programmed by the guest. Here vvtd cannot finish this write
for some reason (e.g. the 'addr' may not be in the guest physical memory
space). According to VT-d spec 6.5.2.8 Invalidation Wait Descriptor,
"Hardware behavior is undefined if the Status Address specified is not an
address route-able to memory (such as peer address, interrupt address range
of 0xFEEX_XXXX etc.)". I think Xen can just ignore it. I should use
vvtd_debug() since this is guest-triggerable.

>> +                if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
>> +                {
>> +                    ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
>> +                    ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
>> +                    vvtd_generate_interrupt(vvtd, ie_addr, ie_data);
>
>...you don't seem to need the two local variables; they are each used only
>once.
>
>> +                    vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> +                }
>> +            }
>> +        }
>> +        break;
>> +
>> +    case TYPE_INVAL_IEC:
>> +        /*
>> +         * Currently, no cache is preserved in the hypervisor. Only need to
>> +         * update the pIRTEs which are modified in the binding process.
>> +         */
>> +        break;
>> +
>> +    default:
>> +        goto error;
>
>There's no reason to use a label that's only used for the default
>case. Simply place the code in the error label here.
>
>> +    }
>> +
>> +    unmap_guest_page((void*)qinval_page);
>> +    return ret;
>> +
>> + error:
>> +    unmap_guest_page((void*)qinval_page);
>> +    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
>> +    domain_crash(vvtd->domain);
>
>Do you really need to crash the domain in such case?

We reach here when the guest requests some operation vvtd doesn't claim to
support or emulate. I am afraid it can also be triggered by the guest.
How about ignoring the invalidation request?

I will change the error message, since it isn't an internal error.

Thanks
Chao
Chao Gao Oct. 23, 2017, 8:52 a.m. UTC | #3
On Mon, Oct 23, 2017 at 09:57:16AM +0100, Roger Pau Monné wrote:
>On Mon, Oct 23, 2017 at 03:50:24PM +0800, Chao Gao wrote:
>> On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
>> >On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
>> >> From: Chao Gao <chao.gao@intel.com>
>> >> +    }
>> >> +
>> >> +    unmap_guest_page((void*)qinval_page);
>> >> +    return ret;
>> >> +
>> >> + error:
>> >> +    unmap_guest_page((void*)qinval_page);
>> >> +    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
>> >> +    domain_crash(vvtd->domain);
>> >
>> >Do you really need to crash the domain in such case?
>> 
>> We reach here when the guest requests some operation vvtd doesn't claim to
>> support or emulate. I am afraid it can also be triggered by the guest.
>> How about ignoring the invalidation request?
>
>What would real hardware do in such case?

After reading the spec again, I think hardware may generate a fault
event; see VT-d spec 10.4.9 Fault Status Register:
"Hardware detected an error associated with the invalidation queue. This
could be due to either a hardware error while fetching a descriptor from
the invalidation queue, or hardware detecting an erroneous or invalid
descriptor in the invalidation queue. At this time, a fault event may be
generated based on the programming of the Fault Event Control register."
Thanks
Chao
Roger Pau Monné Oct. 23, 2017, 8:57 a.m. UTC | #4
On Mon, Oct 23, 2017 at 03:50:24PM +0800, Chao Gao wrote:
> On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
> >On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
> >> From: Chao Gao <chao.gao@intel.com>
> >> +    }
> >> +
> >> +    unmap_guest_page((void*)qinval_page);
> >> +    return ret;
> >> +
> >> + error:
> >> +    unmap_guest_page((void*)qinval_page);
> >> +    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
> >> +    domain_crash(vvtd->domain);
> >
> >Do you really need to crash the domain in such case?
> 
> We reach here when the guest requests some operation vvtd doesn't claim to
> support or emulate. I am afraid it can also be triggered by the guest.
> How about ignoring the invalidation request?

What would real hardware do in such case?

Roger.
Tian, Kevin Oct. 23, 2017, 11:26 p.m. UTC | #5
> From: Gao, Chao
> Sent: Monday, October 23, 2017 4:52 PM
> 
> On Mon, Oct 23, 2017 at 09:57:16AM +0100, Roger Pau Monné wrote:
> >On Mon, Oct 23, 2017 at 03:50:24PM +0800, Chao Gao wrote:
> >> On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
> >> >On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
> >> >> From: Chao Gao <chao.gao@intel.com>
> >> >> +    }
> >> >> +
> >> >> +    unmap_guest_page((void*)qinval_page);
> >> >> +    return ret;
> >> >> +
> >> >> + error:
> >> >> +    unmap_guest_page((void*)qinval_page);
> >> >> +    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
> >> >> +    domain_crash(vvtd->domain);
> >> >
> >> >Do you really need to crash the domain in such case?
> >>
> >> We reach here when the guest requests some operation vvtd doesn't claim
> >> to support or emulate. I am afraid it can also be triggered by the guest.
> >> How about ignoring the invalidation request?
> >
> >What would real hardware do in such case?
> 
> After reading the spec again, I think hardware may generate a fault
> event; see VT-d spec 10.4.9 Fault Status Register:
> "Hardware detected an error associated with the invalidation queue. This
> could be due to either a hardware error while fetching a descriptor from
> the invalidation queue, or hardware detecting an erroneous or invalid
> descriptor in the invalidation queue. At this time, a fault event may be
> generated based on the programming of the Fault Event Control register."
> 

Please do proper emulation according to hardware behavior.
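
E.g. something like the below in process_iqe() (just a sketch; the exact
message and error value are up to you), so that the existing error handling
in vvtd_process_iq() reports a non-recoverable IQE fault instead of crashing
the domain:

    default:
        /*
         * Erroneous/invalid descriptor: mimic hardware by failing the
         * request. The caller, vvtd_process_iq(), then sets FSTS.IQE via
         * vvtd_report_non_recoverable_fault(), with IQH left referencing
         * the offending descriptor.
         */
        vvtd_info("QI: unsupported descriptor type %d",
                  (int)qinval_page[i].q.inv_wait_dsc.lo.type);
        ret = -EINVAL;
        break;
    }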

Patch

diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index c69cd21..c2b83f1 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -177,6 +177,21 @@ 
 #define DMA_IRTA_S(val)         (val & 0xf)
 #define DMA_IRTA_SIZE(val)      (1UL << (DMA_IRTA_S(val) + 1))
 
+/* IQA_REG */
+#define DMA_IQA_ADDR(val)       (val & ~0xfffULL)
+#define DMA_IQA_QS(val)         (val & 0x7)
+#define DMA_IQA_RSVD            0xff8ULL
+
+/* IECTL_REG */
+#define DMA_IECTL_IM_SHIFT 31
+#define DMA_IECTL_IM            (1 << DMA_IECTL_IM_SHIFT)
+#define DMA_IECTL_IP_SHIFT 30
+#define DMA_IECTL_IP            (1 << DMA_IECTL_IP_SHIFT)
+
+/* ICS_REG */
+#define DMA_ICS_IWC_SHIFT       0
+#define DMA_ICS_IWC             (1 << DMA_ICS_IWC_SHIFT)
+
 /* PMEN_REG */
 #define DMA_PMEN_EPM    (((u32)1) << 31)
 #define DMA_PMEN_PRS    (((u32)1) << 0)
@@ -211,7 +226,8 @@ 
 #define DMA_FSTS_PPF (1U << DMA_FSTS_PPF_SHIFT)
 #define DMA_FSTS_AFO (1U << 2)
 #define DMA_FSTS_APF (1U << 3)
-#define DMA_FSTS_IQE (1U << 4)
+#define DMA_FSTS_IQE_SHIFT 4
+#define DMA_FSTS_IQE (1U << DMA_FSTS_IQE_SHIFT)
 #define DMA_FSTS_ICE (1U << 5)
 #define DMA_FSTS_ITE (1U << 6)
 #define DMA_FSTS_PRO_SHIFT 7
@@ -562,6 +578,7 @@  struct qinval_entry {
 
 /* Queue invalidation head/tail shift */
 #define QINVAL_INDEX_SHIFT 4
+#define QINVAL_INDEX_MASK  0x7fff0ULL
 
 #define qinval_present(v) ((v).lo & 1)
 #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index 55f7a46..668d0c9 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -28,6 +28,7 @@ 
 #include <asm/current.h>
 #include <asm/event.h>
 #include <asm/hvm/domain.h>
+#include <asm/hvm/support.h>
 #include <asm/io_apic.h>
 #include <asm/page.h>
 #include <asm/p2m.h>
@@ -419,6 +420,177 @@  static int vvtd_record_fault(struct vvtd *vvtd,
     return X86EMUL_OKAY;
 }
 
+/*
+ * Process an invalidation descriptor. Currently, only two types of
+ * descriptors, the Interrupt Entry Cache Invalidation Descriptor and the
+ * Invalidation Wait Descriptor, are handled.
+ * @vvtd: the virtual vtd instance
+ * @i: the index of the invalidation descriptor to be processed
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int process_iqe(struct vvtd *vvtd, int i)
+{
+    uint64_t iqa;
+    struct qinval_entry *qinval_page;
+    int ret = 0;
+
+    iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
+    qinval_page = map_guest_page(vvtd->domain, DMA_IQA_ADDR(iqa)>>PAGE_SHIFT);
+    if ( IS_ERR(qinval_page) )
+    {
+        gdprintk(XENLOG_ERR, "Can't map guest IRT (rc %ld)",
+                 PTR_ERR(qinval_page));
+        return PTR_ERR(qinval_page);
+    }
+
+    switch ( qinval_page[i].q.inv_wait_dsc.lo.type )
+    {
+    case TYPE_INVAL_WAIT:
+        if ( qinval_page[i].q.inv_wait_dsc.lo.sw )
+        {
+            uint32_t data = qinval_page[i].q.inv_wait_dsc.lo.sdata;
+            uint64_t addr = (qinval_page[i].q.inv_wait_dsc.hi.saddr << 2);
+
+            ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
+            if ( ret )
+                vvtd_info("Failed to write status address");
+        }
+
+        /*
+         * The following code generates an invalidation completion event
+         * indicating the invalidation wait descriptor completion. Note that
+         * the following code fragment is not tested properly.
+         */
+        if ( qinval_page[i].q.inv_wait_dsc.lo.iflag )
+        {
+            uint32_t ie_data, ie_addr;
+            if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
+            {
+                vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
+                if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
+                {
+                    ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
+                    ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
+                    vvtd_generate_interrupt(vvtd, ie_addr, ie_data);
+                    vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
+                }
+            }
+        }
+        break;
+
+    case TYPE_INVAL_IEC:
+        /*
+         * Currently, no cache is preserved in the hypervisor. Only need to
+         * update the pIRTEs which are modified in the binding process.
+         */
+        break;
+
+    default:
+        goto error;
+    }
+
+    unmap_guest_page((void*)qinval_page);
+    return ret;
+
+ error:
+    unmap_guest_page((void*)qinval_page);
+    gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
+    domain_crash(vvtd->domain);
+    return ret;
+}
+
+/*
+ * Process all the descriptors in the Invalidation Queue.
+ */
+static void vvtd_process_iq(struct vvtd *vvtd)
+{
+    uint64_t iqh, iqt, iqa, max_entry, i;
+    int err = 0;
+
+    /*
+     * No new descriptor is fetched from the Invalidation Queue until
+     * software clears the IQE field in the Fault Status Register
+     */
+    if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
+        return;
+
+    iqh = vvtd_get_reg_quad(vvtd, DMAR_IQH_REG);
+    iqt = vvtd_get_reg_quad(vvtd, DMAR_IQT_REG);
+    iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
+
+    max_entry = 1 << (QINVAL_ENTRY_ORDER + DMA_IQA_QS(iqa));
+    iqh = MASK_EXTR(iqh, QINVAL_INDEX_MASK);
+    iqt = MASK_EXTR(iqt, QINVAL_INDEX_MASK);
+
+    ASSERT(iqt < max_entry);
+    if ( iqh == iqt )
+        return;
+
+    for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
+    {
+        err = process_iqe(vvtd, i);
+        if ( err )
+            break;
+    }
+    vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
+
+    /*
+     * When IQE is set, IQH references the descriptor associated with the error.
+     */
+    if ( err )
+        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
+}
+
+static int vvtd_write_iqt(struct vvtd *vvtd, unsigned long val)
+{
+    uint64_t max_entry, iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
+
+    if ( val & ~QINVAL_INDEX_MASK )
+    {
+        vvtd_info("Attempt to set reserved bits in Invalidation Queue Tail");
+        return X86EMUL_OKAY;
+    }
+
+    max_entry = 1 << (QINVAL_ENTRY_ORDER + DMA_IQA_QS(iqa));
+    if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
+    {
+        vvtd_info("IQT: Value %lx exceeded supported max index.", val);
+        return X86EMUL_OKAY;
+    }
+
+    vvtd_set_reg_quad(vvtd, DMAR_IQT_REG, val);
+    vvtd_process_iq(vvtd);
+
+    return X86EMUL_OKAY;
+}
+
+static int vvtd_write_iqa(struct vvtd *vvtd, unsigned long val)
+{
+    uint32_t cap = vvtd_get_reg(vvtd, DMAR_CAP_REG);
+    unsigned int guest_max_addr_width = cap_mgaw(cap);
+
+    if ( val & (~((1ULL << guest_max_addr_width) - 1) | DMA_IQA_RSVD) )
+    {
+        vvtd_info("Attempt to set reserved bits in Invalidation Queue Address");
+        return X86EMUL_OKAY;
+    }
+
+    vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, val);
+    return X86EMUL_OKAY;
+}
+
+static int vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
+{
+    if ( val & DMA_ICS_IWC )
+    {
+        vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
+        /*When IWC field is cleared, the IP field needs to be cleared */
+        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
+    }
+    return X86EMUL_OKAY;
+}
+
 static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
 {
     /* Writing a 1 means clear fault */
@@ -430,6 +602,30 @@  static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
     return X86EMUL_OKAY;
 }
 
+static int vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
+{
+    /*
+     * Only DMA_IECTL_IM bit is writable. Generate pending event when unmask.
+     */
+    if ( !(val & DMA_IECTL_IM) )
+    {
+        /* Clear IM and clear IP */
+        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
+        if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
+        {
+            uint32_t ie_data, ie_addr;
+
+            ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
+            ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
+            vvtd_generate_interrupt(vvtd, ie_addr, ie_data);
+        }
+    }
+    else
+        vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
+
+    return X86EMUL_OKAY;
+}
+
 static int vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
 {
     /*
@@ -476,6 +672,10 @@  static int vvtd_write_fsts(struct vvtd *vvtd, uint32_t val)
     if ( !((vvtd_get_reg(vvtd, DMAR_FSTS_REG) & DMA_FSTS_FAULTS)) )
         vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
 
+    /* Continue to deal with invalidation when IQE is clear */
+    if ( !vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
+        vvtd_process_iq(vvtd);
+
     return X86EMUL_OKAY;
 }
 
@@ -611,6 +811,32 @@  static int vvtd_write(struct vcpu *v, unsigned long addr,
         case DMAR_FECTL_REG:
             return vvtd_write_fectl(vvtd, val);
 
+        case DMAR_IECTL_REG:
+            return vvtd_write_iectl(vvtd, val);
+
+        case DMAR_ICS_REG:
+            return vvtd_write_ics(vvtd, val);
+
+        case DMAR_IQT_REG:
+            return vvtd_write_iqt(vvtd, (uint32_t)val);
+
+        case DMAR_IQA_REG:
+        {
+            uint32_t iqa_hi;
+
+            iqa_hi = vvtd_get_reg(vvtd, DMAR_IQA_REG_HI);
+            return vvtd_write_iqa(vvtd,
+                                 (uint32_t)val | ((uint64_t)iqa_hi << 32));
+        }
+
+        case DMAR_IQA_REG_HI:
+        {
+            uint32_t iqa_lo;
+
+            iqa_lo = vvtd_get_reg(vvtd, DMAR_IQA_REG);
+            return vvtd_write_iqa(vvtd, (val << 32) | iqa_lo);
+        }
+
         case DMAR_IEDATA_REG:
         case DMAR_IEADDR_REG:
         case DMAR_IEUADDR_REG:
@@ -637,6 +863,12 @@  static int vvtd_write(struct vcpu *v, unsigned long addr,
             vvtd_set_reg_quad(vvtd, DMAR_IRTA_REG, val);
             break;
 
+        case DMAR_IQT_REG:
+            return vvtd_write_iqt(vvtd, val);
+
+        case DMAR_IQA_REG:
+            return vvtd_write_iqa(vvtd, val);
+
         default:
             if ( offset == fault_offset + DMA_FRCD2_OFFSET )
                 return vvtd_write_frcd3(vvtd, val >> 32);