diff mbox

[V3,15/29] x86/vvtd: Process interrupt remapping request

Message ID 1506049330-11196-16-git-send-email-tianyu.lan@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

lan,Tianyu Sept. 22, 2017, 3:01 a.m. UTC
From: Chao Gao <chao.gao@intel.com>

When a remapping interrupt request arrives, remapping hardware computes the
interrupt_index per the algorithm described in VTD spec
"Interrupt Remapping Table", interprets the IRTE and generates a remapped
interrupt request.

This patch introduces viommu_handle_irq_request() to emulate the process
by which remapping hardware handles a remapping interrupt request.

Signed-off-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>

---
v3:
 - Encode map_guest_page()'s error into void* to avoid using another parameter
---
 xen/drivers/passthrough/vtd/iommu.h |  21 +++
 xen/drivers/passthrough/vtd/vvtd.c  | 264 +++++++++++++++++++++++++++++++++++-
 2 files changed, 284 insertions(+), 1 deletion(-)

Comments

Roger Pau Monné Oct. 19, 2017, 2:26 p.m. UTC | #1
On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
> From: Chao Gao <chao.gao@intel.com>
> 
> When a remapping interrupt request arrives, remapping hardware computes the
> interrupt_index per the algorithm described in VTD spec
> "Interrupt Remapping Table", interprets the IRTE and generates a remapped
> interrupt request.
> 
> This patch introduces viommu_handle_irq_request() to emulate the process how
> remapping hardware handles a remapping interrupt request.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
> 
> ---
> v3:
>  - Encode map_guest_page()'s error into void* to avoid using another parameter
> ---
>  xen/drivers/passthrough/vtd/iommu.h |  21 +++
>  xen/drivers/passthrough/vtd/vvtd.c  | 264 +++++++++++++++++++++++++++++++++++-
>  2 files changed, 284 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
> index 703726f..790384f 100644
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -218,6 +218,21 @@
>  #define dma_frcd_source_id(c) (c & 0xffff)
>  #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
>  
> +enum VTD_FAULT_TYPE
> +{
> +    /* Interrupt remapping transition faults */
> +    VTD_FR_IR_REQ_RSVD      = 0x20, /* One or more IR request reserved
> +                                     * fields set */
> +    VTD_FR_IR_INDEX_OVER    = 0x21, /* Index value greater than max */
> +    VTD_FR_IR_ENTRY_P       = 0x22, /* Present (P) not set in IRTE */
> +    VTD_FR_IR_ROOT_INVAL    = 0x23, /* IR Root table invalid */
> +    VTD_FR_IR_IRTE_RSVD     = 0x24, /* IRTE Rsvd field non-zero with
> +                                     * Present flag set */
> +    VTD_FR_IR_REQ_COMPAT    = 0x25, /* Encountered compatible IR
> +                                     * request while disabled */
> +    VTD_FR_IR_SID_ERR       = 0x26, /* Invalid Source-ID */
> +};

Why does this need to be an enum? Plus enum type names should not be
all in uppercase.

In any case, I would just use defines, like it's done for all other
values in the file.

> +
>  /*
>   * 0: Present
>   * 1-11: Reserved
> @@ -358,6 +373,12 @@ struct iremap_entry {
>  };
>  
>  /*
> + * When VT-d doesn't enable Extended Interrupt Mode. Hardware only interprets
> + * only 8-bits ([15:8]) of Destination-ID field in the IRTEs.
> + */
> +#define IRTE_xAPIC_DEST_MASK 0xff00
> +
> +/*
>   * Posted-interrupt descriptor address is 64 bits with 64-byte aligned, only
>   * the upper 26 bits of lest significiant 32 bits is available.
>   */
> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
> index a0f63e9..90c00f5 100644
> --- a/xen/drivers/passthrough/vtd/vvtd.c
> +++ b/xen/drivers/passthrough/vtd/vvtd.c
> @@ -23,11 +23,17 @@
>  #include <xen/types.h>
>  #include <xen/viommu.h>
>  #include <xen/xmalloc.h>
> +#include <asm/apic.h>
>  #include <asm/current.h>
> +#include <asm/event.h>
>  #include <asm/hvm/domain.h>
> +#include <asm/io_apic.h>
>  #include <asm/page.h>
> +#include <asm/p2m.h>
> +#include <asm/viommu.h>
>  
>  #include "iommu.h"
> +#include "vtd.h"
>  
>  /* Supported capabilities by vvtd */
>  unsigned int vvtd_caps = VIOMMU_CAP_IRQ_REMAPPING;
> @@ -111,6 +117,132 @@ static inline uint64_t vvtd_get_reg_quad(struct vvtd *vtd, uint32_t reg)
>      return vtd->regs->data64[reg/sizeof(uint64_t)];
>  }
>  
> +static void* map_guest_page(struct domain *d, uint64_t gfn)

gfn_t seems like a better type then uint64_t.

> +{
> +    struct page_info *p;
> +    void *ret;
> +
> +    p = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);

You can do the initialization of p at declaration.

> +    if ( !p )
> +        return ERR_PTR(-EINVAL);
> +
> +    if ( !get_page_type(p, PGT_writable_page) )
> +    {
> +        put_page(p);
> +        return ERR_PTR(-EINVAL);
> +    }
> +
> +    ret = __map_domain_page_global(p);
> +    if ( !ret )
> +    {
> +        put_page_and_type(p);
> +        return ERR_PTR(-ENOMEM);
> +    }
> +
> +    return ret;
> +}
> +
> +static void unmap_guest_page(void *virt)
> +{
> +    struct page_info *page;
> +
> +    ASSERT((unsigned long)virt & PAGE_MASK);

I'm not sure I get the point of the check above.

> +    page = mfn_to_page(domain_page_map_to_mfn(virt));
> +
> +    unmap_domain_page_global(virt);
> +    put_page_and_type(page);
> +}
> +
> +static void vvtd_inj_irq(struct vlapic *target, uint8_t vector,
> +                         uint8_t trig_mode, uint8_t delivery_mode)
> +{
> +    vvtd_debug("dest=v%d, delivery_mode=%x vector=%d trig_mode=%d\n",
> +               vlapic_vcpu(target)->vcpu_id, delivery_mode, vector, trig_mode);
> +
> +    ASSERT((delivery_mode == dest_Fixed) ||
> +           (delivery_mode == dest_LowestPrio));
> +
> +    vlapic_set_irq(target, vector, trig_mode);
> +}
> +
> +static int vvtd_delivery(struct domain *d, uint8_t vector,
> +                         uint32_t dest, uint8_t dest_mode,
> +                         uint8_t delivery_mode, uint8_t trig_mode)
> +{
> +    struct vlapic *target;
> +    struct vcpu *v;
> +
> +    switch ( delivery_mode )
> +    {
> +    case dest_LowestPrio:
> +        target = vlapic_lowest_prio(d, NULL, 0, dest, dest_mode);
> +        if ( target != NULL )
> +        {
> +            vvtd_inj_irq(target, vector, trig_mode, delivery_mode);
> +            break;
> +        }
> +        vvtd_debug("null round robin: vector=%02x\n", vector);
> +        break;
> +
> +    case dest_Fixed:
> +        for_each_vcpu ( d, v )
> +            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
> +                vvtd_inj_irq(vcpu_vlapic(v), vector, trig_mode, delivery_mode);
> +        break;
> +
> +    case dest_NMI:
> +        for_each_vcpu ( d, v )
> +            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) &&
> +                 !test_and_set_bool(v->nmi_pending) )
> +                vcpu_kick(v);
> +        break;
> +
> +    default:
> +        gdprintk(XENLOG_WARNING, "Unsupported VTD delivery mode %d\n",
> +                 delivery_mode);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static uint32_t irq_remapping_request_index(
> +    const struct arch_irq_remapping_request *irq)
> +{
> +    if ( irq->type == VIOMMU_REQUEST_IRQ_MSI )
> +    {
> +        uint32_t index;
> +        struct msi_msg_remap_entry msi_msg =
> +        {
> +            .address_lo = { .val = irq->msg.msi.addr },
> +            .data = irq->msg.msi.data,
> +        };
> +
> +        index = (msi_msg.address_lo.index_15 << 15) +
> +                msi_msg.address_lo.index_0_14;
> +        if ( msi_msg.address_lo.SHV )
> +            index += (uint16_t)msi_msg.data;
> +
> +        return index;
> +    }
> +    else if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
> +    {
> +        struct IO_APIC_route_remap_entry remap_rte = { .val = irq->msg.rte };
> +
> +        return (remap_rte.index_15 << 15) + remap_rte.index_0_14;
> +    }

IMHO a switch with a single return would be better here:

uint32_t index = 0;

switch ( irq->type )
{
case ...:
    index = ...;
    break;
}

return index;

> +    ASSERT_UNREACHABLE();
> +
> +    return 0;
> +}
> +
> +static inline uint32_t irte_dest(struct vvtd *vvtd, uint32_t dest)
> +{
> +    /* In xAPIC mode, only 8-bits([15:8]) are valid */
> +    return vvtd->status.eim_enabled ? dest
                                       : MASK_EXTR(dest, IRTE_xAPIC_DEST_MASK);

It's easier to read style wise.

> +}
> +
>  static void vvtd_handle_gcmd_ire(struct vvtd *vvtd, uint32_t val)
>  {
>      vvtd_info("%sable Interrupt Remapping",
> @@ -255,6 +387,135 @@ static const struct hvm_mmio_ops vvtd_mmio_ops = {
>      .write = vvtd_write
>  };
>  
> +static void vvtd_handle_fault(struct vvtd *vvtd,
> +                              struct arch_irq_remapping_request *irq,
> +                              struct iremap_entry *irte,
> +                              unsigned int fault,
> +                              bool record_fault)
> +{
> +   if ( !record_fault )
> +        return;
> +
> +    switch ( fault )
> +    {
> +    case VTD_FR_IR_SID_ERR:
> +    case VTD_FR_IR_IRTE_RSVD:
> +    case VTD_FR_IR_ENTRY_P:
> +        if ( qinval_fault_disable(*irte) )
> +            break;
> +    /* fall through */
> +    case VTD_FR_IR_INDEX_OVER:
> +    case VTD_FR_IR_ROOT_INVAL:
> +        /* TODO: handle fault (e.g. record and report this fault to VM */
> +        break;
> +
> +    default:
> +        gdprintk(XENLOG_INFO, "Can't handle VT-d fault %x\n", fault);

You already defined some vvtd specific debug helpers, why are those
not used here? gdprintk (as the 'd' denotes) is only for debug
purposes.

> +    }
> +    return;
> +}
> +
> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
> +                                          struct arch_irq_remapping_request *irq)
> +{
> +    if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
> +    {
> +        struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
> +
> +        ASSERT(rte.format);

Is it fine to ASSERT here? Can't the guest set rte.format to whatever
it wants?

> +        return !!rte.reserved;
> +    }
> +    else if ( irq->type == VIOMMU_REQUEST_IRQ_MSI )
> +    {
> +        struct msi_msg_remap_entry msi_msg =
> +        { .address_lo = { .val = irq->msg.msi.addr } };
> +
> +        ASSERT(msi_msg.address_lo.format);
> +        return 0;
> +    }
> +    ASSERT_UNREACHABLE();

Again I think a switch would be better here.

> +
> +    return 0;
> +}
> +
> +/*
> + * 'record_fault' is a flag to indicate whether we need recording a fault
> + * and notifying guest when a fault happens during fetching vIRTE.
> + */
> +static int vvtd_get_entry(struct vvtd *vvtd,
> +                          struct arch_irq_remapping_request *irq,
> +                          struct iremap_entry *dest,
> +                          bool record_fault)
> +{
> +    uint32_t entry = irq_remapping_request_index(irq);
> +    struct iremap_entry  *irte, *irt_page;
> +
> +    vvtd_debug("interpret a request with index %x\n", entry);
> +
> +    if ( vvtd_irq_request_sanity_check(vvtd, irq) )
> +    {
> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_REQ_RSVD, record_fault);
> +        return -EINVAL;
> +    }
> +
> +    if ( entry > vvtd->status.irt_max_entry )
> +    {
> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_INDEX_OVER, record_fault);
> +        return -EACCES;
> +    }
> +
> +    irt_page = map_guest_page(vvtd->domain,
> +                              vvtd->status.irt + (entry >> IREMAP_ENTRY_ORDER));

Since AFAICT you have to read this page(s) every time an interrupt
needs to be delivered, wouldn't it make sense for performance reasons
to have the page permanently mapped?

What's the maximum number of pages that can be used here?

> +    if ( IS_ERR(irt_page) )
> +    {
> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_ROOT_INVAL, record_fault);
> +        return PTR_ERR(irt_page);
> +    }
> +
> +    irte = irt_page + (entry % (1 << IREMAP_ENTRY_ORDER));
> +    dest->val = irte->val;

Not that it matters much, but for coherency reasons I would only set
dest->val after all the checks have been performed.

> +    if ( !qinval_present(*irte) )
> +    {
> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_ENTRY_P, record_fault);
> +        unmap_guest_page(irt_page);
> +        return -ENOENT;
> +    }
> +
> +    /* Check reserved bits */
> +    if ( (irte->remap.res_1 || irte->remap.res_2 || irte->remap.res_3 ||
> +          irte->remap.res_4) )
> +    {
> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_IRTE_RSVD, record_fault);
> +        unmap_guest_page(irt_page);
> +        return -EINVAL;
> +    }
> +
> +    /* FIXME: We don't check against the source ID */
> +    unmap_guest_page(irt_page);
> +
> +    return 0;
> +}
> +
> +static int vvtd_handle_irq_request(struct domain *d,
> +                                   struct arch_irq_remapping_request *irq)
> +{
> +    struct iremap_entry irte;
> +    int ret;
> +    struct vvtd *vvtd = domain_vvtd(d);
> +
> +    if ( !vvtd || !vvtd->status.intremap_enabled )
> +        return -ENODEV;
> +
> +    ret = vvtd_get_entry(vvtd, irq, &irte, true);
> +    if ( ret )
> +        return ret;
> +
> +    return vvtd_delivery(vvtd->domain, irte.remap.vector,
> +                         irte_dest(vvtd, irte.remap.dst),
> +                         irte.remap.dm, irte.remap.dlm,
> +                         irte.remap.tm);
> +}
> +
>  static void vvtd_reset(struct vvtd *vvtd, uint64_t capability)
>  {
>      uint64_t cap = cap_set_num_fault_regs(1ULL) |
> @@ -324,7 +585,8 @@ static int vvtd_destroy(struct viommu *viommu)
>  
>  struct viommu_ops vvtd_hvm_vmx_ops = {
>      .create = vvtd_create,
> -    .destroy = vvtd_destroy
> +    .destroy = vvtd_destroy,
> +    .handle_irq_request = vvtd_handle_irq_request

You can add a ',' at the end, so that further additions don't need to
change two lines (and the same should be done with vvtd_destroy).

Thanks, Roger.
Chao Gao Oct. 20, 2017, 5:16 a.m. UTC | #2
On Thu, Oct 19, 2017 at 03:26:30PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao.gao@intel.com>
>> 
>> When a remapping interrupt request arrives, remapping hardware computes the
>> interrupt_index per the algorithm described in VTD spec
>> "Interrupt Remapping Table", interprets the IRTE and generates a remapped
>> interrupt request.
>> 
>> This patch introduces viommu_handle_irq_request() to emulate the process how
>> remapping hardware handles a remapping interrupt request.
>> 
>> Signed-off-by: Chao Gao <chao.gao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
>> 
>> ---
>>  
>> +enum VTD_FAULT_TYPE
>> +{
>> +    /* Interrupt remapping transition faults */
>> +    VTD_FR_IR_REQ_RSVD      = 0x20, /* One or more IR request reserved
>> +                                     * fields set */
>> +    VTD_FR_IR_INDEX_OVER    = 0x21, /* Index value greater than max */
>> +    VTD_FR_IR_ENTRY_P       = 0x22, /* Present (P) not set in IRTE */
>> +    VTD_FR_IR_ROOT_INVAL    = 0x23, /* IR Root table invalid */
>> +    VTD_FR_IR_IRTE_RSVD     = 0x24, /* IRTE Rsvd field non-zero with
>> +                                     * Present flag set */
>> +    VTD_FR_IR_REQ_COMPAT    = 0x25, /* Encountered compatible IR
>> +                                     * request while disabled */
>> +    VTD_FR_IR_SID_ERR       = 0x26, /* Invalid Source-ID */
>> +};
>
>Why does this need to be an enum? Plus enum type names should not be
>all in uppercase.
>
>In any case, I would just use defines, like it's done for all other
>values in the file.

Sure. Will follow your suggestion.

>> +static void unmap_guest_page(void *virt)
>> +{
>> +    struct page_info *page;
>> +
>> +    ASSERT((unsigned long)virt & PAGE_MASK);
>
>I'm not sure I get the point of the check above.

I intended to check that the address is 4K-page aligned. It should be

ASSERT(!((unsigned long)virt & (PAGE_SIZE - 1)))

>> +}
>> +
>> +static inline uint32_t irte_dest(struct vvtd *vvtd, uint32_t dest)
>> +{
>> +    /* In xAPIC mode, only 8-bits([15:8]) are valid */
>> +    return vvtd->status.eim_enabled ? dest
>                                       : MASK_EXTR(dest, IRTE_xAPIC_DEST_MASK);
>
>It's easier to read style wise.

sure.

>
>> +}
>> +
>>  static void vvtd_handle_gcmd_ire(struct vvtd *vvtd, uint32_t val)
>>  {
>>      vvtd_info("%sable Interrupt Remapping",
>> @@ -255,6 +387,135 @@ static const struct hvm_mmio_ops vvtd_mmio_ops = {
>>      .write = vvtd_write
>>  };
>>  
>> +static void vvtd_handle_fault(struct vvtd *vvtd,
>> +                              struct arch_irq_remapping_request *irq,
>> +                              struct iremap_entry *irte,
>> +                              unsigned int fault,
>> +                              bool record_fault)
>> +{
>> +   if ( !record_fault )
>> +        return;
>> +
>> +    switch ( fault )
>> +    {
>> +    case VTD_FR_IR_SID_ERR:
>> +    case VTD_FR_IR_IRTE_RSVD:
>> +    case VTD_FR_IR_ENTRY_P:
>> +        if ( qinval_fault_disable(*irte) )
>> +            break;
>> +    /* fall through */
>> +    case VTD_FR_IR_INDEX_OVER:
>> +    case VTD_FR_IR_ROOT_INVAL:
>> +        /* TODO: handle fault (e.g. record and report this fault to VM */
>> +        break;
>> +
>> +    default:
>> +        gdprintk(XENLOG_INFO, "Can't handle VT-d fault %x\n", fault);
>
>You already defined some vvtd specific debug helpers, why are those
>not used here? gdprintk (as the 'd' denotes) is only for debug
>purposes.

The default case means we have encountered a bug in our code. I want to output
this kind of message even in non-debug builds. I should use gprintk.

>
>> +    }
>> +    return;
>> +}
>> +
>> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
>> +                                          struct arch_irq_remapping_request *irq)
>> +{
>> +    if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
>> +    {
>> +        struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
>> +
>> +        ASSERT(rte.format);
>
>Is it fine to ASSERT here? Can't the guest set rte.format to whatever
>it wants?

The guest can use legacy-format interrupts (i.e. rte.format = 0). However,
we only reach here when the callback 'check_irq_remapping' returns true, and
for vvtd, 'check_irq_remapping' just returns the format bit of the irq request.
If rte.format isn't true here, there must be a bug in our code.

>> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_REQ_RSVD, record_fault);
>> +        return -EINVAL;
>> +    }
>> +
>> +    if ( entry > vvtd->status.irt_max_entry )
>> +    {
>> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_INDEX_OVER, record_fault);
>> +        return -EACCES;
>> +    }
>> +
>> +    irt_page = map_guest_page(vvtd->domain,
>> +                              vvtd->status.irt + (entry >> IREMAP_ENTRY_ORDER));
>
>Since AFAICT you have to read this page(s) every time an interrupt
>needs to be delivered, wouldn't it make sense for performance reasons
>to have the page permanently mapped?

Yes. It is. Actually, we have a draft patch to do this. But to justify
the necessity, I should run some benchmark at first. Mapping a guest
page is slow on x86, right?

>
>What's the maximum number of pages that can be used here?

VT-d currently supports 2^16 entries at most. The size of each entry is 128
bytes. Thus, we need 2^11 pages at most.

>
>> +    if ( IS_ERR(irt_page) )
>> +    {
>> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_ROOT_INVAL, record_fault);
>> +        return PTR_ERR(irt_page);
>> +    }
>> +
>> +    irte = irt_page + (entry % (1 << IREMAP_ENTRY_ORDER));
>> +    dest->val = irte->val;
>
>Not that it matters much, but for coherency reasons I would only set
>dest->val after all the checks have been performed.

agree.

Thanks
chao
Roger Pau Monné Oct. 20, 2017, 10:01 a.m. UTC | #3
On Fri, Oct 20, 2017 at 01:16:37PM +0800, Chao Gao wrote:
> On Thu, Oct 19, 2017 at 03:26:30PM +0100, Roger Pau Monné wrote:
> >On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
> >> +static void unmap_guest_page(void *virt)
> >> +{
> >> +    struct page_info *page;
> >> +
> >> +    ASSERT((unsigned long)virt & PAGE_MASK);
> >
> >I'm not sure I get the point of the check above.
> 
> I intended to check the address is 4K-page aligned. It should be
> 
> ASSERT(!((unsigned long)virt & (PAGE_SIZE - 1)))

Please use the IS_ALIGNED macro.

> >
> >> +    }
> >> +    return;
> >> +}
> >> +
> >> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
> >> +                                          struct arch_irq_remapping_request *irq)
> >> +{
> >> +    if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
> >> +    {
> >> +        struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
> >> +
> >> +        ASSERT(rte.format);
> >
> >Is it fine to ASSERT here? Can't the guest set rte.format to whatever
> >it wants?
> 
> Guest can use legacy format interrupt (i.e. rte.format = 0). However,
> we only reach here when callback 'check_irq_remapping' return true and
> for vvtd, 'check_irq_remapping' just returns the format bit of irq request.
> If here ret.format isn't true, there must be a bug in our code.

Are you sure the correct locks are hold here to prevent the guest
from changing rte while all this processing is happening?

> >> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_REQ_RSVD, record_fault);
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    if ( entry > vvtd->status.irt_max_entry )
> >> +    {
> >> +        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_INDEX_OVER, record_fault);
> >> +        return -EACCES;
> >> +    }
> >> +
> >> +    irt_page = map_guest_page(vvtd->domain,
> >> +                              vvtd->status.irt + (entry >> IREMAP_ENTRY_ORDER));
> >
> >Since AFAICT you have to read this page(s) every time an interrupt
> >needs to be delivered, wouldn't it make sense for performance reasons
> >to have the page permanently mapped?
> 
> Yes. It is. Actually, we have a draft patch to do this. But to justify
> the necessity, I should run some benchmark at first. Mapping a guest
> page is slow on x86, right?

The issue is the TLB flush, not the actual modification of the page
tables.

> >
> >What's the maximum number of pages that can be used here?
> 
> VT-d current support 2^16 entries at most. The size of each entry is 128
> byte. Thus, we need 2^11 pages at most.

Those are guest pages at the end, so it shouldn't be a problem.

Roger.
Chao Gao Oct. 23, 2017, 6:44 a.m. UTC | #4
On Fri, Oct 20, 2017 at 11:01:03AM +0100, Roger Pau Monné wrote:
>On Fri, Oct 20, 2017 at 01:16:37PM +0800, Chao Gao wrote:
>> On Thu, Oct 19, 2017 at 03:26:30PM +0100, Roger Pau Monné wrote:
>> >On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
>> >> +static void unmap_guest_page(void *virt)
>> >> +{
>> >> +    struct page_info *page;
>> >> +
>> >> +    ASSERT((unsigned long)virt & PAGE_MASK);
>> >
>> >I'm not sure I get the point of the check above.
>> 
>> I intended to check the address is 4K-page aligned. It should be
>> 
>> ASSERT(!((unsigned long)virt & (PAGE_SIZE - 1)))
>
>Please use the IS_ALIGNED macro.

Ok.

>
>> >
>> >> +    }
>> >> +    return;
>> >> +}
>> >> +
>> >> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
>> >> +                                          struct arch_irq_remapping_request *irq)
>> >> +{
>> >> +    if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
>> >> +    {
>> >> +        struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
>> >> +
>> >> +        ASSERT(rte.format);
>> >
>> >Is it fine to ASSERT here? Can't the guest set rte.format to whatever
>> >it wants?
>> 
>> Guest can use legacy format interrupt (i.e. rte.format = 0). However,
>> we only reach here when callback 'check_irq_remapping' return true and
>> for vvtd, 'check_irq_remapping' just returns the format bit of irq request.
>> If here ret.format isn't true, there must be a bug in our code.
>
>Are you sure the correct locks are hold here to prevent the guest
>from changing rte while all this processing is happening?

The rte here isn't the IOAPIC registers. It is only (or part of) the
interrupt request (an abstraction of the IOAPIC RTE and MSI message). Every time
an interrupt is to be delivered, the interrupt request is composed on the
stack according to the IOAPIC RTE or MSI message. Then we determine the
format of the interrupt, i.e. whether it is in remapping format or
non-remapping format. Only for the remapping format is this function called. A
non-remapping-format interrupt is delivered by the ioapic directly and
needn't come here to be translated by the vIOMMU.

Thanks
Chao
diff mbox

Patch

diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index 703726f..790384f 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -218,6 +218,21 @@ 
 #define dma_frcd_source_id(c) (c & 0xffff)
 #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
 
+enum VTD_FAULT_TYPE
+{
+    /* Interrupt remapping transition faults */
+    VTD_FR_IR_REQ_RSVD      = 0x20, /* One or more IR request reserved
+                                     * fields set */
+    VTD_FR_IR_INDEX_OVER    = 0x21, /* Index value greater than max */
+    VTD_FR_IR_ENTRY_P       = 0x22, /* Present (P) not set in IRTE */
+    VTD_FR_IR_ROOT_INVAL    = 0x23, /* IR Root table invalid */
+    VTD_FR_IR_IRTE_RSVD     = 0x24, /* IRTE Rsvd field non-zero with
+                                     * Present flag set */
+    VTD_FR_IR_REQ_COMPAT    = 0x25, /* Encountered compatible IR
+                                     * request while disabled */
+    VTD_FR_IR_SID_ERR       = 0x26, /* Invalid Source-ID */
+};
+
 /*
  * 0: Present
  * 1-11: Reserved
@@ -358,6 +373,12 @@  struct iremap_entry {
 };
 
 /*
+ * When VT-d doesn't enable Extended Interrupt Mode. Hardware only interprets
+ * only 8-bits ([15:8]) of Destination-ID field in the IRTEs.
+ */
+#define IRTE_xAPIC_DEST_MASK 0xff00
+
+/*
  * Posted-interrupt descriptor address is 64 bits with 64-byte aligned, only
  * the upper 26 bits of lest significiant 32 bits is available.
  */
diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index a0f63e9..90c00f5 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -23,11 +23,17 @@ 
 #include <xen/types.h>
 #include <xen/viommu.h>
 #include <xen/xmalloc.h>
+#include <asm/apic.h>
 #include <asm/current.h>
+#include <asm/event.h>
 #include <asm/hvm/domain.h>
+#include <asm/io_apic.h>
 #include <asm/page.h>
+#include <asm/p2m.h>
+#include <asm/viommu.h>
 
 #include "iommu.h"
+#include "vtd.h"
 
 /* Supported capabilities by vvtd */
 unsigned int vvtd_caps = VIOMMU_CAP_IRQ_REMAPPING;
@@ -111,6 +117,132 @@  static inline uint64_t vvtd_get_reg_quad(struct vvtd *vtd, uint32_t reg)
     return vtd->regs->data64[reg/sizeof(uint64_t)];
 }
 
+static void* map_guest_page(struct domain *d, uint64_t gfn)
+{
+    struct page_info *p;
+    void *ret;
+
+    p = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
+    if ( !p )
+        return ERR_PTR(-EINVAL);
+
+    if ( !get_page_type(p, PGT_writable_page) )
+    {
+        put_page(p);
+        return ERR_PTR(-EINVAL);
+    }
+
+    ret = __map_domain_page_global(p);
+    if ( !ret )
+    {
+        put_page_and_type(p);
+        return ERR_PTR(-ENOMEM);
+    }
+
+    return ret;
+}
+
+static void unmap_guest_page(void *virt)
+{
+    struct page_info *page;
+
+    ASSERT((unsigned long)virt & PAGE_MASK);
+    page = mfn_to_page(domain_page_map_to_mfn(virt));
+
+    unmap_domain_page_global(virt);
+    put_page_and_type(page);
+}
+
+static void vvtd_inj_irq(struct vlapic *target, uint8_t vector,
+                         uint8_t trig_mode, uint8_t delivery_mode)
+{
+    vvtd_debug("dest=v%d, delivery_mode=%x vector=%d trig_mode=%d\n",
+               vlapic_vcpu(target)->vcpu_id, delivery_mode, vector, trig_mode);
+
+    ASSERT((delivery_mode == dest_Fixed) ||
+           (delivery_mode == dest_LowestPrio));
+
+    vlapic_set_irq(target, vector, trig_mode);
+}
+
+static int vvtd_delivery(struct domain *d, uint8_t vector,
+                         uint32_t dest, uint8_t dest_mode,
+                         uint8_t delivery_mode, uint8_t trig_mode)
+{
+    struct vlapic *target;
+    struct vcpu *v;
+
+    switch ( delivery_mode )
+    {
+    case dest_LowestPrio:
+        target = vlapic_lowest_prio(d, NULL, 0, dest, dest_mode);
+        if ( target != NULL )
+        {
+            vvtd_inj_irq(target, vector, trig_mode, delivery_mode);
+            break;
+        }
+        vvtd_debug("null round robin: vector=%02x\n", vector);
+        break;
+
+    case dest_Fixed:
+        for_each_vcpu ( d, v )
+            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
+                vvtd_inj_irq(vcpu_vlapic(v), vector, trig_mode, delivery_mode);
+        break;
+
+    case dest_NMI:
+        for_each_vcpu ( d, v )
+            if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) &&
+                 !test_and_set_bool(v->nmi_pending) )
+                vcpu_kick(v);
+        break;
+
+    default:
+        gdprintk(XENLOG_WARNING, "Unsupported VTD delivery mode %d\n",
+                 delivery_mode);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static uint32_t irq_remapping_request_index(
+    const struct arch_irq_remapping_request *irq)
+{
+    if ( irq->type == VIOMMU_REQUEST_IRQ_MSI )
+    {
+        uint32_t index;
+        struct msi_msg_remap_entry msi_msg =
+        {
+            .address_lo = { .val = irq->msg.msi.addr },
+            .data = irq->msg.msi.data,
+        };
+
+        index = (msi_msg.address_lo.index_15 << 15) +
+                msi_msg.address_lo.index_0_14;
+        if ( msi_msg.address_lo.SHV )
+            index += (uint16_t)msi_msg.data;
+
+        return index;
+    }
+    else if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
+    {
+        struct IO_APIC_route_remap_entry remap_rte = { .val = irq->msg.rte };
+
+        return (remap_rte.index_15 << 15) + remap_rte.index_0_14;
+    }
+    ASSERT_UNREACHABLE();
+
+    return 0;
+}
+
+static inline uint32_t irte_dest(struct vvtd *vvtd, uint32_t dest)
+{
+    /* In xAPIC mode, only 8-bits([15:8]) are valid */
+    return vvtd->status.eim_enabled ? dest :
+           MASK_EXTR(dest, IRTE_xAPIC_DEST_MASK);
+}
+
 static void vvtd_handle_gcmd_ire(struct vvtd *vvtd, uint32_t val)
 {
     vvtd_info("%sable Interrupt Remapping",
@@ -255,6 +387,135 @@  static const struct hvm_mmio_ops vvtd_mmio_ops = {
     .write = vvtd_write
 };
 
+static void vvtd_handle_fault(struct vvtd *vvtd,
+                              struct arch_irq_remapping_request *irq,
+                              struct iremap_entry *irte,
+                              unsigned int fault,
+                              bool record_fault)
+{
+   if ( !record_fault )
+        return;
+
+    switch ( fault )
+    {
+    case VTD_FR_IR_SID_ERR:
+    case VTD_FR_IR_IRTE_RSVD:
+    case VTD_FR_IR_ENTRY_P:
+        if ( qinval_fault_disable(*irte) )
+            break;
+    /* fall through */
+    case VTD_FR_IR_INDEX_OVER:
+    case VTD_FR_IR_ROOT_INVAL:
+        /* TODO: handle fault (e.g. record and report this fault to VM */
+        break;
+
+    default:
+        gdprintk(XENLOG_INFO, "Can't handle VT-d fault %x\n", fault);
+    }
+    return;
+}
+
+static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
+                                          struct arch_irq_remapping_request *irq)
+{
+    if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
+    {
+        struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
+
+        ASSERT(rte.format);
+        return !!rte.reserved;
+    }
+    else if ( irq->type == VIOMMU_REQUEST_IRQ_MSI )
+    {
+        struct msi_msg_remap_entry msi_msg =
+        { .address_lo = { .val = irq->msg.msi.addr } };
+
+        ASSERT(msi_msg.address_lo.format);
+        return 0;
+    }
+    ASSERT_UNREACHABLE();
+
+    return 0;
+}
+
+/*
+ * 'record_fault' is a flag to indicate whether we need recording a fault
+ * and notifying guest when a fault happens during fetching vIRTE.
+ */
+static int vvtd_get_entry(struct vvtd *vvtd,
+                          struct arch_irq_remapping_request *irq,
+                          struct iremap_entry *dest,
+                          bool record_fault)
+{
+    uint32_t entry = irq_remapping_request_index(irq);
+    struct iremap_entry  *irte, *irt_page;
+
+    vvtd_debug("interpret a request with index %x\n", entry);
+
+    if ( vvtd_irq_request_sanity_check(vvtd, irq) )
+    {
+        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_REQ_RSVD, record_fault);
+        return -EINVAL;
+    }
+
+    if ( entry > vvtd->status.irt_max_entry )
+    {
+        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_INDEX_OVER, record_fault);
+        return -EACCES;
+    }
+
+    irt_page = map_guest_page(vvtd->domain,
+                              vvtd->status.irt + (entry >> IREMAP_ENTRY_ORDER));
+    if ( IS_ERR(irt_page) )
+    {
+        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_ROOT_INVAL, record_fault);
+        return PTR_ERR(irt_page);
+    }
+
+    irte = irt_page + (entry % (1 << IREMAP_ENTRY_ORDER));
+    dest->val = irte->val;
+    if ( !qinval_present(*irte) )
+    {
+        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_ENTRY_P, record_fault);
+        unmap_guest_page(irt_page);
+        return -ENOENT;
+    }
+
+    /* Check reserved bits */
+    if ( (irte->remap.res_1 || irte->remap.res_2 || irte->remap.res_3 ||
+          irte->remap.res_4) )
+    {
+        vvtd_handle_fault(vvtd, irq, NULL, VTD_FR_IR_IRTE_RSVD, record_fault);
+        unmap_guest_page(irt_page);
+        return -EINVAL;
+    }
+
+    /* FIXME: We don't check against the source ID */
+    unmap_guest_page(irt_page);
+
+    return 0;
+}
+
+static int vvtd_handle_irq_request(struct domain *d,
+                                   struct arch_irq_remapping_request *irq)
+{
+    struct iremap_entry irte;
+    int ret;
+    struct vvtd *vvtd = domain_vvtd(d);
+
+    if ( !vvtd || !vvtd->status.intremap_enabled )
+        return -ENODEV;
+
+    ret = vvtd_get_entry(vvtd, irq, &irte, true);
+    if ( ret )
+        return ret;
+
+    return vvtd_delivery(vvtd->domain, irte.remap.vector,
+                         irte_dest(vvtd, irte.remap.dst),
+                         irte.remap.dm, irte.remap.dlm,
+                         irte.remap.tm);
+}
+
 static void vvtd_reset(struct vvtd *vvtd, uint64_t capability)
 {
     uint64_t cap = cap_set_num_fault_regs(1ULL) |
@@ -324,7 +585,8 @@  static int vvtd_destroy(struct viommu *viommu)
 
 struct viommu_ops vvtd_hvm_vmx_ops = {
     .create = vvtd_create,
-    .destroy = vvtd_destroy
+    .destroy = vvtd_destroy,
+    .handle_irq_request = vvtd_handle_irq_request
 };
 
 static int vvtd_register(void)