diff mbox series

[RFC,v2,10/22] intel_iommu: add virtual command capability support

Message ID 1571920483-3382-11-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series intel_iommu: expose Shared Virtual Addressing to VM | expand

Commit Message

Yi Liu Oct. 24, 2019, 12:34 p.m. UTC
This patch adds virtual command support to Intel vIOMMU per
Intel VT-d 3.1 spec. And adds two virtual commands: alloc_pasid
and free_pasid.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
---
 hw/i386/intel_iommu.c          | 162 ++++++++++++++++++++++++++++++++++++++++-
 hw/i386/intel_iommu_internal.h |  38 ++++++++++
 hw/i386/trace-events           |   1 +
 include/hw/i386/intel_iommu.h  |   6 +-
 4 files changed, 205 insertions(+), 2 deletions(-)

Comments

Peter Xu Nov. 1, 2019, 6:05 p.m. UTC | #1
On Thu, Oct 24, 2019 at 08:34:31AM -0400, Liu Yi L wrote:
> This patch adds virtual command support to Intel vIOMMU per
> Intel VT-d 3.1 spec. And adds two virtual commands: alloc_pasid
> and free_pasid.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> ---
>  hw/i386/intel_iommu.c          | 162 ++++++++++++++++++++++++++++++++++++++++-
>  hw/i386/intel_iommu_internal.h |  38 ++++++++++
>  hw/i386/trace-events           |   1 +
>  include/hw/i386/intel_iommu.h  |   6 +-
>  4 files changed, 205 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index e9f8692..88b843f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -944,6 +944,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
>                  return vtd_bus;
>              }
>          }
> +        vtd_bus = NULL;

I feel like I've commented on this..

Should this be a standalone patch?

>      }
>      return vtd_bus;
>  }
> @@ -2590,6 +2591,140 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
>      }
>  }
>  
> +static int vtd_request_pasid_alloc(IntelIOMMUState *s)
> +{
> +    VTDBus *vtd_bus;
> +    int bus_n, devfn;
> +    IOMMUCTXEventData event_data;
> +    IOMMUCTXPASIDReqDesc req;
> +    VTDIOMMUContext *vtd_ic;
> +
> +    event_data.event = IOMMU_CTX_EVENT_PASID_ALLOC;
> +    event_data.data = &req;
> +    req.min_pasid = VTD_MIN_HPASID;
> +    req.max_pasid = VTD_MAX_HPASID;
> +    req.alloc_result = 0;
> +    event_data.length = sizeof(req);

As mentioned in the other thread, do you think we can drop this length
field?

> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> +        if (!vtd_bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            vtd_ic = vtd_bus->dev_ic[devfn];
> +            if (!vtd_ic) {
> +                continue;
> +            }
> +            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);

Considering that we'll fill in the result into event_data, it could be
a bit misleading to still call it "notify" here because normally it
should only get data from the notifier caller rather than returning a
meaningful value..  Things like SUCCESS/FAIL would be fine, but here
we're returning a pasid from the notifier which seems a bit odd.

Maybe rename it to iommu_ctx_event_deliver()?  Then we just rename all
the references of "notify" thingys into "hook" or something clearer?

> +            if (req.alloc_result > 0) {

I'd suggest we comment on this:

    We'll return the first valid result we got.  It's a bit hackish in
    that we don't have a good global interface yet to talk to modules
    like vfio to deliver this allocation request, so we're leveraging
    this per-device context to do the same thing just to make sure the
    allocation happens only once.

Same to the pasid_free() below, though you can reference the comment
here from there to be simple.

> +                return req.alloc_result;
> +            }
> +        }
> +    }
> +    return -1;
> +}
> +
> +static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
> +{
> +    VTDBus *vtd_bus;
> +    int bus_n, devfn;
> +    IOMMUCTXEventData event_data;
> +    IOMMUCTXPASIDReqDesc req;
> +    VTDIOMMUContext *vtd_ic;
> +
> +    event_data.event = IOMMU_CTX_EVENT_PASID_FREE;
> +    event_data.data = &req;
> +    req.pasid = pasid;
> +    req.free_result = 0;
> +    event_data.length = sizeof(req);
> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> +        if (!vtd_bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            vtd_ic = vtd_bus->dev_ic[devfn];
> +            if (!vtd_ic) {
> +                continue;
> +            }
> +            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);
> +            if (req.free_result == 0) {
> +                return 0;
> +            }
> +        }
> +    }
> +    return -1;
> +}
> +
> +/*
> + * If IP is not set, set it and return 0
> + * If IP is already set, return -1
> + */
> +static int vtd_vcmd_rsp_ip_check(IntelIOMMUState *s)
> +{
> +    if (!(s->vccap & VTD_VCCAP_PAS) ||
> +         (s->vcrsp & 1)) {
> +        return -1;
> +    }

VTD_VCCAP_PAS is not a IP check, so maybe simply move these chunk out
to vtd_handle_vcmd_write?  Then we can rename this function to
"void vtd_vcmd_ip_set(...)".

> +    s->vcrsp = 1;
> +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                     ((uint64_t) s->vcrsp));
> +    return 0;
> +}
> +
> +static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
> +{
> +    s->vcrsp &= (~((uint64_t)(0x1)));
> +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                     ((uint64_t) s->vcrsp));
> +}
> +
> +/* Handle write to Virtual Command Register */
> +static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
> +{
> +    uint32_t pasid;
> +    int ret = -1;
> +
> +    trace_vtd_reg_write_vcmd(s->vcrsp, val);
> +
> +    /*
> +     * Since vCPU should be blocked when the guest VMCD
> +     * write was trapped to here. Should be no other vCPUs
> +     * try to access VCMD if guest software is well written.
> +     * However, we still emulate the IP bit here in case of
> +     * bad guest software. Also align with the spec.
> +     */
> +    ret = vtd_vcmd_rsp_ip_check(s);
> +    if (ret) {
> +        return ret;
> +    }
> +    switch (val & VTD_VCMD_CMD_MASK) {
> +    case VTD_VCMD_ALLOC_PASID:
> +        ret = vtd_request_pasid_alloc(s);
> +        if (ret < 0) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
> +        } else {
> +            s->vcrsp |= VTD_VCRSP_RSLT(ret);
> +        }
> +        break;
> +
> +    case VTD_VCMD_FREE_PASID:
> +        pasid = VTD_VCMD_PASID_VALUE(val);
> +        ret = vtd_request_pasid_free(s, pasid);
> +        if (ret < 0) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
> +        }
> +        break;
> +
> +    default:
> +        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
> +        printf("Virtual Command: unsupported command!!!\n");

Perhaps error_report_once()?

> +        break;
> +    }
> +    vtd_vcmd_clear_ip(s);
> +    return 0;
> +}
> +
>  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
>  {
>      IntelIOMMUState *s = opaque;
> @@ -2879,6 +3014,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>          vtd_set_long(s, addr, val);
>          break;
>  
> +    case DMAR_VCMD_REG:
> +        if (!vtd_handle_vcmd_write(s, val)) {
> +            if (size == 4) {
> +                vtd_set_long(s, addr, val);
> +            } else {
> +                vtd_set_quad(s, addr, val);
> +            }
> +        }
> +        break;
> +
> +    case DMAR_VCMD_REG_HI:
> +        assert(size == 4);

This assert() seems scary, but of course not a problem of this patch
because plenty of that are there in vtd_mem_write..  So we can fix
that later.

Do you know what should happen on bare-metal from spec-wise that when
the guest e.g. writes 2 bytes to these mmio regions?

> +        if (!vtd_handle_vcmd_write(s, val)) {
> +            vtd_set_long(s, addr, val);
> +        }
> +        break;
> +
>      default:
>          if (size == 4) {
>              vtd_set_long(s, addr, val);
> @@ -3617,7 +3769,8 @@ static void vtd_init(IntelIOMMUState *s)
>              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
>          } else if (!strcmp(s->scalable_mode, "modern")) {
>              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> -                       | VTD_ECAP_FLTS | VTD_ECAP_PSS;
> +                       | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
> +            s->vccap |= VTD_VCCAP_PAS;
>          }
>      }
>  

[...]

> +#define VTD_VCMD_CMD_MASK           0xffUL
> +#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
> +
> +#define VTD_VCRSP_RSLT(val)         ((val) << 8)
> +#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
> +
> +#define VTD_VCMD_UNDEFINED_CMD         1ULL
> +#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL

According to 10.4.44 - should this be 1?
Yi Liu Nov. 6, 2019, 12:40 p.m. UTC | #2
> From: Peter Xu
> Sent: Saturday, November 2, 2019 2:06 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 10/22] intel_iommu: add virtual command capability support
> 
> On Thu, Oct 24, 2019 at 08:34:31AM -0400, Liu Yi L wrote:
> > This patch adds virtual command support to Intel vIOMMU per
> > Intel VT-d 3.1 spec. And adds two virtual commands: alloc_pasid
> > and free_pasid.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > ---
> >  hw/i386/intel_iommu.c          | 162
> ++++++++++++++++++++++++++++++++++++++++-
> >  hw/i386/intel_iommu_internal.h |  38 ++++++++++
> >  hw/i386/trace-events           |   1 +
> >  include/hw/i386/intel_iommu.h  |   6 +-
> >  4 files changed, 205 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index e9f8692..88b843f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -944,6 +944,7 @@ static VTDBus
> *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> >                  return vtd_bus;
> >              }
> >          }
> > +        vtd_bus = NULL;
> 
> I feel like I've commented on this..
> 
> Should this be a standalone patch?

Oops, I should have made it in a separate patch. will do it in next version.

> >      }
> >      return vtd_bus;
> >  }
> > @@ -2590,6 +2591,140 @@ static void vtd_handle_iectl_write(IntelIOMMUState
> *s)
> >      }
> >  }
> >
> > +static int vtd_request_pasid_alloc(IntelIOMMUState *s)
> > +{
> > +    VTDBus *vtd_bus;
> > +    int bus_n, devfn;
> > +    IOMMUCTXEventData event_data;
> > +    IOMMUCTXPASIDReqDesc req;
> > +    VTDIOMMUContext *vtd_ic;
> > +
> > +    event_data.event = IOMMU_CTX_EVENT_PASID_ALLOC;
> > +    event_data.data = &req;
> > +    req.min_pasid = VTD_MIN_HPASID;
> > +    req.max_pasid = VTD_MAX_HPASID;
> > +    req.alloc_result = 0;
> > +    event_data.length = sizeof(req);
> 
> As mentioned in the other thread, do you think we can drop this length
> field?

yep, will do it.

> 
> > +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> > +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> > +        if (!vtd_bus) {
> > +            continue;
> > +        }
> > +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> > +            vtd_ic = vtd_bus->dev_ic[devfn];
> > +            if (!vtd_ic) {
> > +                continue;
> > +            }
> > +            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);
> 
> Considering that we'll fill in the result into event_data, it could be
> a bit misleading to still call it "notify" here because normally it
> should only get data from the notifier caller rather than returning a
> meaningful value..  Things like SUCCESS/FAIL would be fine, but here
> we're returning a pasid from the notifier which seems a bit odd.
> 
> Maybe rename it to iommu_ctx_event_deliver()?  Then we just rename all
> the references of "notify" thingys into "hook" or something clearer?

got it. Will do it when we got agreement on the comments regards to
[RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid alloc/free
of this series.

> > +            if (req.alloc_result > 0) {
> 
> I'd suggest we comment on this:
> 
>     We'll return the first valid result we got.  It's a bit hackish in
>     that we don't have a good global interface yet to talk to modules
>     like vfio to deliver this allocation request, so we're leveraging
>     this per-device context to do the same thing just to make sure the
>     allocation happens only once.
> 
> Same to the pasid_free() below, though you can reference the comment
> here from there to be simple.

Got it. Will add it in both place.

> 
> > +                return req.alloc_result;
> > +            }
> > +        }
> > +    }
> > +    return -1;
> > +}
> > +
> > +static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
> > +{
> > +    VTDBus *vtd_bus;
> > +    int bus_n, devfn;
> > +    IOMMUCTXEventData event_data;
> > +    IOMMUCTXPASIDReqDesc req;
> > +    VTDIOMMUContext *vtd_ic;
> > +
> > +    event_data.event = IOMMU_CTX_EVENT_PASID_FREE;
> > +    event_data.data = &req;
> > +    req.pasid = pasid;
> > +    req.free_result = 0;
> > +    event_data.length = sizeof(req);
> > +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> > +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> > +        if (!vtd_bus) {
> > +            continue;
> > +        }
> > +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> > +            vtd_ic = vtd_bus->dev_ic[devfn];
> > +            if (!vtd_ic) {
> > +                continue;
> > +            }
> > +            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);
> > +            if (req.free_result == 0) {
> > +                return 0;
> > +            }
> > +        }
> > +    }
> > +    return -1;
> > +}
> > +
> > +/*
> > + * If IP is not set, set it and return 0
> > + * If IP is already set, return -1
> > + */
> > +static int vtd_vcmd_rsp_ip_check(IntelIOMMUState *s)
> > +{
> > +    if (!(s->vccap & VTD_VCCAP_PAS) ||
> > +         (s->vcrsp & 1)) {
> > +        return -1;
> > +    }
> 
> VTD_VCCAP_PAS is not a IP check, so maybe simply move these chunk out
> to vtd_handle_vcmd_write?  Then we can rename this function to
> "void vtd_vcmd_ip_set(...)".

yes, it is. will do it in next version.

> 
> > +    s->vcrsp = 1;
> > +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                     ((uint64_t) s->vcrsp));
> > +    return 0;
> > +}
> > +
> > +static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
> > +{
> > +    s->vcrsp &= (~((uint64_t)(0x1)));
> > +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                     ((uint64_t) s->vcrsp));
> > +}
> > +
> > +/* Handle write to Virtual Command Register */
> > +static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
> > +{
> > +    uint32_t pasid;
> > +    int ret = -1;
> > +
> > +    trace_vtd_reg_write_vcmd(s->vcrsp, val);
> > +
> > +    /*
> > +     * Since vCPU should be blocked when the guest VMCD
> > +     * write was trapped to here. Should be no other vCPUs
> > +     * try to access VCMD if guest software is well written.
> > +     * However, we still emulate the IP bit here in case of
> > +     * bad guest software. Also align with the spec.
> > +     */
> > +    ret = vtd_vcmd_rsp_ip_check(s);
> > +    if (ret) {
> > +        return ret;
> > +    }
> > +    switch (val & VTD_VCMD_CMD_MASK) {
> > +    case VTD_VCMD_ALLOC_PASID:
> > +        ret = vtd_request_pasid_alloc(s);
> > +        if (ret < 0) {
> > +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
> > +        } else {
> > +            s->vcrsp |= VTD_VCRSP_RSLT(ret);
> > +        }
> > +        break;
> > +
> > +    case VTD_VCMD_FREE_PASID:
> > +        pasid = VTD_VCMD_PASID_VALUE(val);
> > +        ret = vtd_request_pasid_free(s, pasid);
> > +        if (ret < 0) {
> > +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
> > +        }
> > +        break;
> > +
> > +    default:
> > +        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
> > +        printf("Virtual Command: unsupported command!!!\n");
> 
> Perhaps error_report_once()?

will fix it in next version. thx~

> > +        break;
> > +    }
> > +    vtd_vcmd_clear_ip(s);
> > +    return 0;
> > +}
> > +
> >  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
> >  {
> >      IntelIOMMUState *s = opaque;
> > @@ -2879,6 +3014,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
> >          vtd_set_long(s, addr, val);
> >          break;
> >
> > +    case DMAR_VCMD_REG:
> > +        if (!vtd_handle_vcmd_write(s, val)) {
> > +            if (size == 4) {
> > +                vtd_set_long(s, addr, val);
> > +            } else {
> > +                vtd_set_quad(s, addr, val);
> > +            }
> > +        }
> > +        break;
> > +
> > +    case DMAR_VCMD_REG_HI:
> > +        assert(size == 4);
> 
> This assert() seems scary, but of course not a problem of this patch
> because plenty of that are there in vtd_mem_write..  So we can fix
> that later.

got it.

> 
> Do you know what should happen on bare-metal from spec-wise that when
> the guest e.g. writes 2 bytes to these mmio regions?

I've no idea to your question. It is not a bare-metal capability. Personally, I
prefer to have a toggle bit to mark the full written of a cmd to VMCD_REG.
Reason is that we have no control on guest software. It may write new cmd
to VCMD_REG in a bad manner. e.g. write high 32 bits first and then write the
low 32 bits. Then it will have two traps. Apparently, for the first trap, it fills
in the VCMD_REG and no need to handle it since it is not a full written. I'm
checking it and evaluating it. How do you think on it?

> 
> > +        if (!vtd_handle_vcmd_write(s, val)) {
> > +            vtd_set_long(s, addr, val);
> > +        }
> > +        break;
> > +
> >      default:
> >          if (size == 4) {
> >              vtd_set_long(s, addr, val);
> > @@ -3617,7 +3769,8 @@ static void vtd_init(IntelIOMMUState *s)
> >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> >          } else if (!strcmp(s->scalable_mode, "modern")) {
> >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> > -                       | VTD_ECAP_FLTS | VTD_ECAP_PSS;
> > +                       | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
> > +            s->vccap |= VTD_VCCAP_PAS;
> >          }
> >      }
> >
> 
> [...]
> 
> > +#define VTD_VCMD_CMD_MASK           0xffUL
> > +#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
> > +
> > +#define VTD_VCRSP_RSLT(val)         ((val) << 8)
> > +#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
> > +
> > +#define VTD_VCMD_UNDEFINED_CMD         1ULL
> > +#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
> 
> According to 10.4.44 - should this be 1?

It's 2 now per VT-d spec 3.1 (2019 June). I should have mentioned it in the cover
letter...

Regards,
Yi Liu
Peter Xu Nov. 6, 2019, 2 p.m. UTC | #3
On Wed, Nov 06, 2019 at 12:40:41PM +0000, Liu, Yi L wrote:
> > 
> > Do you know what should happen on bare-metal from spec-wise that when
> > the guest e.g. writes 2 bytes to these mmio regions?
> 
> I've no idea to your question. It is not a bare-metal capability. Personally, I
> prefer to have a toggle bit to mark the full written of a cmd to VMCD_REG.
> Reason is that we have no control on guest software. It may write new cmd
> to VCMD_REG in a bad manner. e.g. write high 32 bits first and then write the
> low 32 bits. Then it will have two traps. Apparently, for the first trap, it fills
> in the VCMD_REG and no need to handle it since it is not a full written. I'm
> checking it and evaluating it. How do you think on it?

Oh I just noticed that vtd_mem_ops.min_access_size==4 now so writting
2B should never happen at least.  Then we'll bail out at
memory_region_access_valid().  Seems fine.

> 
> > 
> > > +        if (!vtd_handle_vcmd_write(s, val)) {
> > > +            vtd_set_long(s, addr, val);
> > > +        }
> > > +        break;
> > > +
> > >      default:
> > >          if (size == 4) {
> > >              vtd_set_long(s, addr, val);
> > > @@ -3617,7 +3769,8 @@ static void vtd_init(IntelIOMMUState *s)
> > >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> > >          } else if (!strcmp(s->scalable_mode, "modern")) {
> > >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> > > -                       | VTD_ECAP_FLTS | VTD_ECAP_PSS;
> > > +                       | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
> > > +            s->vccap |= VTD_VCCAP_PAS;
> > >          }
> > >      }
> > >
> > 
> > [...]
> > 
> > > +#define VTD_VCMD_CMD_MASK           0xffUL
> > > +#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
> > > +
> > > +#define VTD_VCRSP_RSLT(val)         ((val) << 8)
> > > +#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
> > > +
> > > +#define VTD_VCMD_UNDEFINED_CMD         1ULL
> > > +#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
> > 
> > According to 10.4.44 - should this be 1?
> 
> It's 2 now per VT-d spec 3.1 (2019 June). I should have mentioned it in the cover
> letter...

Well you're right... I hope there won't be other "major" things get
changed otherwise it'll be really a pain of working on all of these
before things settle...

Thanks,
Yi Liu Nov. 12, 2019, 6:27 a.m. UTC | #4
> From: Peter Xu <peterx@redhat.com>
> Sent: Wednesday, November 6, 2019 10:01 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 10/22] intel_iommu: add virtual command capability support
> 
> On Wed, Nov 06, 2019 at 12:40:41PM +0000, Liu, Yi L wrote:
> > >
> > > Do you know what should happen on bare-metal from spec-wise that when
> > > the guest e.g. writes 2 bytes to these mmio regions?
> >
> > I've no idea to your question. It is not a bare-metal capability. Personally, I
> > prefer to have a toggle bit to mark the full written of a cmd to VMCD_REG.
> > Reason is that we have no control on guest software. It may write new cmd
> > to VCMD_REG in a bad manner. e.g. write high 32 bits first and then write the
> > low 32 bits. Then it will have two traps. Apparently, for the first trap, it fills
> > in the VCMD_REG and no need to handle it since it is not a full written. I'm
> > checking it and evaluating it. How do you think on it?
> 
> Oh I just noticed that vtd_mem_ops.min_access_size==4 now so writting
> 2B should never happen at least.  Then we'll bail out at
> memory_region_access_valid().  Seems fine.

got it.

> 
> >
> > >
> > > > +        if (!vtd_handle_vcmd_write(s, val)) {
> > > > +            vtd_set_long(s, addr, val);
> > > > +        }
> > > > +        break;
> > > > +
> > > >      default:
> > > >          if (size == 4) {
> > > >              vtd_set_long(s, addr, val);
> > > > @@ -3617,7 +3769,8 @@ static void vtd_init(IntelIOMMUState *s)
> > > >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> > > >          } else if (!strcmp(s->scalable_mode, "modern")) {
> > > >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> > > > -                       | VTD_ECAP_FLTS | VTD_ECAP_PSS;
> > > > +                       | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
> > > > +            s->vccap |= VTD_VCCAP_PAS;
> > > >          }
> > > >      }
> > > >
> > >
> > > [...]
> > >
> > > > +#define VTD_VCMD_CMD_MASK           0xffUL
> > > > +#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
> > > > +
> > > > +#define VTD_VCRSP_RSLT(val)         ((val) << 8)
> > > > +#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
> > > > +
> > > > +#define VTD_VCMD_UNDEFINED_CMD         1ULL
> > > > +#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
> > >
> > > According to 10.4.44 - should this be 1?
> >
> > It's 2 now per VT-d spec 3.1 (2019 June). I should have mentioned it in the cover
> > letter...
> 
> Well you're right... I hope there won't be other "major" things get
> changed otherwise it'll be really a pain of working on all of these
> before things settle...

As far as I know, only this part has significant change. Other parts are consistent.
I'll mention spec version next time in the cover letter.

Thanks,
Yi Liu
diff mbox series

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e9f8692..88b843f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -944,6 +944,7 @@  static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
                 return vtd_bus;
             }
         }
+        vtd_bus = NULL;
     }
     return vtd_bus;
 }
@@ -2590,6 +2591,140 @@  static void vtd_handle_iectl_write(IntelIOMMUState *s)
     }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s)
+{
+    VTDBus *vtd_bus;
+    int bus_n, devfn;
+    IOMMUCTXEventData event_data;
+    IOMMUCTXPASIDReqDesc req;
+    VTDIOMMUContext *vtd_ic;
+
+    event_data.event = IOMMU_CTX_EVENT_PASID_ALLOC;
+    event_data.data = &req;
+    req.min_pasid = VTD_MIN_HPASID;
+    req.max_pasid = VTD_MAX_HPASID;
+    req.alloc_result = 0;
+    event_data.length = sizeof(req);
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
+        if (!vtd_bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            vtd_ic = vtd_bus->dev_ic[devfn];
+            if (!vtd_ic) {
+                continue;
+            }
+            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);
+            if (req.alloc_result > 0) {
+                return req.alloc_result;
+            }
+        }
+    }
+    return -1;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+    VTDBus *vtd_bus;
+    int bus_n, devfn;
+    IOMMUCTXEventData event_data;
+    IOMMUCTXPASIDReqDesc req;
+    VTDIOMMUContext *vtd_ic;
+
+    event_data.event = IOMMU_CTX_EVENT_PASID_FREE;
+    event_data.data = &req;
+    req.pasid = pasid;
+    req.free_result = 0;
+    event_data.length = sizeof(req);
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
+        if (!vtd_bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            vtd_ic = vtd_bus->dev_ic[devfn];
+            if (!vtd_ic) {
+                continue;
+            }
+            iommu_ctx_event_notify(&vtd_ic->iommu_context, &event_data);
+            if (req.free_result == 0) {
+                return 0;
+            }
+        }
+    }
+    return -1;
+}
+
+/*
+ * If IP is not set, set it and return 0
+ * If IP is already set, return -1
+ */
+static int vtd_vcmd_rsp_ip_check(IntelIOMMUState *s)
+{
+    if (!(s->vccap & VTD_VCCAP_PAS) ||
+         (s->vcrsp & 1)) {
+        return -1;
+    }
+    s->vcrsp = 1;
+    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                     ((uint64_t) s->vcrsp));
+    return 0;
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+    s->vcrsp &= (~((uint64_t)(0x1)));
+    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                     ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+    uint32_t pasid;
+    int ret = -1;
+
+    trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+    /*
+     * Since vCPU should be blocked when the guest VMCD
+     * write was trapped to here. Should be no other vCPUs
+     * try to access VCMD if guest software is well written.
+     * However, we still emulate the IP bit here in case of
+     * bad guest software. Also align with the spec.
+     */
+    ret = vtd_vcmd_rsp_ip_check(s);
+    if (ret) {
+        return ret;
+    }
+    switch (val & VTD_VCMD_CMD_MASK) {
+    case VTD_VCMD_ALLOC_PASID:
+        ret = vtd_request_pasid_alloc(s);
+        if (ret < 0) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+        } else {
+            s->vcrsp |= VTD_VCRSP_RSLT(ret);
+        }
+        break;
+
+    case VTD_VCMD_FREE_PASID:
+        pasid = VTD_VCMD_PASID_VALUE(val);
+        ret = vtd_request_pasid_free(s, pasid);
+        if (ret < 0) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+        }
+        break;
+
+    default:
+        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+        printf("Virtual Command: unsupported command!!!\n");
+        break;
+    }
+    vtd_vcmd_clear_ip(s);
+    return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
     IntelIOMMUState *s = opaque;
@@ -2879,6 +3014,23 @@  static void vtd_mem_write(void *opaque, hwaddr addr,
         vtd_set_long(s, addr, val);
         break;
 
+    case DMAR_VCMD_REG:
+        if (!vtd_handle_vcmd_write(s, val)) {
+            if (size == 4) {
+                vtd_set_long(s, addr, val);
+            } else {
+                vtd_set_quad(s, addr, val);
+            }
+        }
+        break;
+
+    case DMAR_VCMD_REG_HI:
+        assert(size == 4);
+        if (!vtd_handle_vcmd_write(s, val)) {
+            vtd_set_long(s, addr, val);
+        }
+        break;
+
     default:
         if (size == 4) {
             vtd_set_long(s, addr, val);
@@ -3617,7 +3769,8 @@  static void vtd_init(IntelIOMMUState *s)
             s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
         } else if (!strcmp(s->scalable_mode, "modern")) {
             s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
-                       | VTD_ECAP_FLTS | VTD_ECAP_PSS;
+                       | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
+            s->vccap |= VTD_VCCAP_PAS;
         }
     }
 
@@ -3674,6 +3827,13 @@  static void vtd_init(IntelIOMMUState *s)
      * Interrupt remapping registers.
      */
     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+    /*
+     * Virtual Command Definitions
+     */
+    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
+    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
+    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
 }
 
 /* Should not reset address_spaces when reset because devices will still use
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index be7b30a..8668771 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -85,6 +85,12 @@ 
 #define DMAR_MTRRCAP_REG_HI     0x104
 #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
 #define DMAR_MTRRDEF_REG_HI     0x10c
+#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
+#define DMAR_VCCAP_REG_HI       0xE04
+#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
+#define DMAR_VCMD_REG_HI        0xE14
+#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Reponse Register */
+#define DMAR_VCRSP_REG_HI       0xE24
 
 /* IOTLB registers */
 #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
@@ -193,6 +199,7 @@ 
 #define VTD_ECAP_PSS                (19ULL << 35)
 #define VTD_ECAP_PASID              (1ULL << 40)
 #define VTD_ECAP_SMTS               (1ULL << 43)
+#define VTD_ECAP_VCS                (1ULL << 44)
 #define VTD_ECAP_SLTS               (1ULL << 46)
 #define VTD_ECAP_FLTS               (1ULL << 47)
 
@@ -315,6 +322,37 @@  typedef enum VTDFaultReason {
 
 #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
 
+/* VCCAP_REG */
+#define VTD_VCCAP_PAS               (1UL << 0)
+
+/*
+ * The basic idea is to let hypervisor to set a range for available
+ * PASIDs for VMs. One of the reasons is PASID #0 is reserved by
+ * RID_PASID usage. We have no idea how many reserved PASIDs in future,
+ * so here just an evaluated value. Honestly, set it as "1" is enough
+ * at current stage.
+ */
+#define VTD_MIN_HPASID              1
+#define VTD_MAX_HPASID              0xFFFFF
+
+/* Virtual Command Register */
+enum {
+     VTD_VCMD_NULL_CMD = 0,
+     VTD_VCMD_ALLOC_PASID = 1,
+     VTD_VCMD_FREE_PASID = 2,
+     VTD_VCMD_CMD_NUM,
+};
+
+#define VTD_VCMD_CMD_MASK           0xffUL
+#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
+
+#define VTD_VCRSP_RSLT(val)         ((val) << 8)
+#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
+
+#define VTD_VCMD_UNDEFINED_CMD         1ULL
+#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
+#define VTD_VCMD_FREE_INVALID_PASID    2ULL
+
 /* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */
 struct VTDInvDescIEC {
     uint32_t type:4;            /* Should always be 0x4 */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index c8bc464..43c0314 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -51,6 +51,7 @@  vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"
 vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_ics_clear_ip(void) ""
+vtd_reg_write_vcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32
 vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64
 vtd_dmar_enable(bool en) "enable %d"
 vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d"
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 1c580c1..0d49480 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -46,7 +46,7 @@ 
 #define VTD_SID_TO_BUS(sid)         (((sid) >> 8) & 0xff)
 #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)
 
-#define DMAR_REG_SIZE               0x230
+#define DMAR_REG_SIZE               0xF00
 #define VTD_HOST_AW_39BIT           39
 #define VTD_HOST_AW_48BIT           48
 #define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
@@ -282,6 +282,10 @@  struct IntelIOMMUState {
     uint8_t aw_bits;                /* Host/IOVA address width (in bits) */
     bool dma_drain;                 /* Whether DMA r/w draining enabled */
 
+    /* Virtual Command Register */
+    uint64_t vccap;                 /* The value of vcmd capability reg */
+    uint64_t vcrsp;                 /* Current value of VCMD RSP REG */
+
     /*
      * Protects IOMMU states in general.  Currently it protects the
      * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.