diff mbox series

[RFC,v1,06/18] intel_iommu: support virtual command emulation and pasid request

Message ID 1562324511-2910-7-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series intel_iommu: expose Shared Virtual Addressing to VM | expand

Commit Message

Yi Liu July 5, 2019, 11:01 a.m. UTC
This patch adds virtual command support to Intel vIOMMU per Intel VT-d 3.1
spec. This patch adds two virtual commands: alloc_pasid and free_pasid.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
---
 hw/i386/intel_iommu.c          | 139 ++++++++++++++++++++++++++++++++++++++++-
 hw/i386/intel_iommu_internal.h |  30 +++++++++
 hw/i386/trace-events           |   1 +
 include/hw/i386/intel_iommu.h  |   6 +-
 4 files changed, 174 insertions(+), 2 deletions(-)

Comments

Peter Xu July 9, 2019, 3:19 a.m. UTC | #1
On Fri, Jul 05, 2019 at 07:01:39PM +0800, Liu Yi L wrote:
> This patch adds virtual command support to Intel vIOMMU per Intel VT-d 3.1
> spec. This patch adds two virtual commands: alloc_pasid and free_pasid.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Yi Sun <yi.y.sun@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> ---
>  hw/i386/intel_iommu.c          | 139 ++++++++++++++++++++++++++++++++++++++++-
>  hw/i386/intel_iommu_internal.h |  30 +++++++++
>  hw/i386/trace-events           |   1 +
>  include/hw/i386/intel_iommu.h  |   6 +-
>  4 files changed, 174 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 3160a05..3cf250d 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -932,11 +932,19 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
>                  s->vtd_as_by_bus_num[bus_num] = vtd_bus;
>                  return vtd_bus;
>              }
> +            vtd_bus = NULL;

Can move to ...
>          }

... here?

>      }
>      return vtd_bus;
>  }
>  
> +static PCIBus *vtd_find_pci_bus_from_bus_num(IntelIOMMUState *s,
> +                                             uint8_t bus_num)
> +{
> +    VTDBus *vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
> +    return vtd_bus ? vtd_bus->bus : NULL;
> +}
> +
>  /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
>   * of the translation, can be used for deciding the size of large page.
>   */
> @@ -2579,6 +2587,103 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
>      }
>  }
>  
> +static int vtd_request_pasid_alloc(IntelIOMMUState *s)
> +{
> +    PCIBus *bus;
> +    int bus_n, devfn;
> +
> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
> +        if (!bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            if (pci_device_is_ops_set(bus, devfn)) {
> +                return pci_device_request_pasid_alloc(bus, devfn,
> +                                                      VTD_MIN_HPASID,
> +                                                      VTD_MAX_HPASID);

Ah so here I see why pci_device_is_ops_set() is necessary... you
wanted to find a device that is vfio-pci and supports PASID.  This is
a bit awkward but indeed I don't know what's a better option to make
it a clearer interface if we can't let IOMMU to talk directly to vfio.

THe thing is that VFIO_IOMMU_PASID_REQUEST seems to be defined per
VFIO container, while VT-d spec is of course defining PASID allocation
as globally.  More context on how the pasid address space will be
defined and considerations behind (not only for this series, but for
the big picture of SVA work) would be greatly welcomed.

> +            }
> +        }
> +    }
> +    return -1;
> +}
> +
> +static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
> +{
> +    PCIBus *bus;
> +    int bus_n, devfn;
> +
> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
> +        if (!bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            if (pci_device_is_ops_set(bus, devfn)) {
> +                return pci_device_request_pasid_free(bus, devfn, pasid);
> +            }
> +        }
> +    }
> +    return -1;
> +}
> +
> +/* Handle write to Virtual Command Register */
> +static void vtd_handle_vcmd_write(IntelIOMMUState *s)
> +{
> +    uint32_t status = vtd_get_long_raw(s, DMAR_VCRSP_REG);
> +    uint32_t val = vtd_get_long_raw(s, DMAR_VCMD_REG);
> +    uint32_t pasid;
> +    int ret = -1;
> +
> +    trace_vtd_reg_write_vcmd(status, val);

Could we use s->vcrsp directly instead of using DMAR_VCRSP_REG?

> +
> +    switch (val & VTD_VCMD_CMD_MASK) {
> +    case VTD_VCMD_ALLOC_PASID:
> +        if (!(s->vccap & VTD_VCCAP_PAS) ||
> +             (s->vcrsp & 1)) {

Nit: we can consider to offer some helpers for them.

Also, I think we should check vcrsp&1 at the entry for all vcmds. [1]

> +            break;
> +        }
> +        s->vcrsp = 1;
> +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                         ((uint64_t) s->vcrsp));

Do we really need to emulate the "In Progress" like this?  The vcpu is
blocked here after all, and AFAICT all the rest of vcpus should not
access these registers because obviously these registers cannot be
accessed concurrently...

I think the IP bit is useful when some new vcmd would take plenty of
time so that we can do the long vcmds in async way.  However here it
seems not the case?

> +        ret = vtd_request_pasid_alloc(s);
> +        if (ret < 0) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
> +        } else {
> +            s->vcrsp |= VTD_VCRSP_RSLT(ret);
> +        }
> +        s->vcrsp &= (~((uint64_t)(0x1)));
> +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                         ((uint64_t) s->vcrsp));
> +        break;
> +
> +    case VTD_VCMD_FREE_PASID:
> +        if (!(s->vccap & VTD_VCCAP_PAS) ||
> +             (s->vcrsp & 1)) {
> +            break;
> +        }
> +        s->vcrsp &= 1;
> +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                         ((uint64_t) s->vcrsp));

Same here on IP bit emulation.  IMHO we can drop these and this
function can be greatly simplified.  Your call. :)

> +        pasid = VTD_VCMD_PASID_VALUE(val);
> +        ret = vtd_request_pasid_free(s, pasid);
> +        if (ret < 0) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
> +        }
> +        s->vcrsp &= (~((uint64_t)(0x1)));
> +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                         ((uint64_t) s->vcrsp));
> +        break;
> +
> +    default:
> +        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);

(IMHO you can simply do s/|=/=/ here if you handle IP well at the
 entry of the function)

> +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                         ((uint64_t) s->vcrsp));
> +        printf("Virtual Command: unsupported command!!!\n");
> +        break;
> +    }
> +}
> +
>  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
>  {
>      IntelIOMMUState *s = opaque;
> @@ -2620,6 +2725,15 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
>          val = s->iq >> 32;
>          break;
>  
> +    case DMAR_VCRSP_REG:
> +        val = s->vcrsp;
> +        break;
> +
> +    case DMAR_VCRSP_REG_HI:
> +        assert(size == 4);
> +        val = s->vcrsp >> 32;
> +        break;

If you're always with vtd_set_quad_raw()s then IMHO you can drop these
lines?  vtd_mem_read() has a default to handle all these.

> +
>      default:
>          if (size == 4) {
>              val = vtd_get_long(s, addr);
> @@ -2868,6 +2982,21 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>          vtd_set_long(s, addr, val);
>          break;
>  
> +    case DMAR_VCMD_REG:
> +        if (size == 4) {
> +            vtd_set_long(s, addr, val);
> +        } else {
> +            vtd_set_quad(s, addr, val);
> +        }
> +        vtd_handle_vcmd_write(s);

IMHO you should do vtd_handle_vcmd_write() first and let it return a
value, when returning true you update the regisers using vtd_set_*()
otherwise you should skip (e.g., when IP is set in vcmd result reg).

> +        break;
> +
> +    case DMAR_VCMD_REG_HI:
> +        assert(size == 4);
> +        vtd_set_long(s, addr, val);
> +        vtd_handle_vcmd_write(s);

Same here?

> +        break;
> +
>      default:
>          if (size == 4) {
>              vtd_set_long(s, addr, val);
> @@ -3579,7 +3708,8 @@ static void vtd_init(IntelIOMMUState *s)
>              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
>          } else if (!strcmp(s->sm_model, "scalable")) {
>              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> -                       | VTD_ECAP_FLTS;
> +                       | VTD_ECAP_FLTS | VTD_ECAP_VCS;
> +            s->vccap |= VTD_VCCAP_PAS;
>          } else {
>              printf("\n!!!!! Invalid sm_model config !!!!!\n"
>                  "Please config sm_model=[\"legacy\"|\"scalable\"]\n"
> @@ -3641,6 +3771,13 @@ static void vtd_init(IntelIOMMUState *s)
>       * Interrupt remapping registers.
>       */
>      vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
> +
> +    /*
> +     * Virtual Command Definitions
> +     */
> +    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
> +    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
> +    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
>  }
>  
>  /* Should not reset address_spaces when reset because devices will still use
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index adae198..f5a2f0d 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -85,6 +85,12 @@
>  #define DMAR_MTRRCAP_REG_HI     0x104
>  #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
>  #define DMAR_MTRRDEF_REG_HI     0x10c
> +#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
> +#define DMAR_VCCAP_REG_HI       0xE04
> +#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
> +#define DMAR_VCMD_REG_HI        0xE14
> +#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Reponse Register */
> +#define DMAR_VCRSP_REG_HI       0xE24
>  
>  /* IOTLB registers */
>  #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
> @@ -192,6 +198,7 @@
>  #define VTD_ECAP_SRS                (1ULL << 31)
>  #define VTD_ECAP_PASID              (1ULL << 40)
>  #define VTD_ECAP_SMTS               (1ULL << 43)
> +#define VTD_ECAP_VCS                (1ULL << 44)
>  #define VTD_ECAP_SLTS               (1ULL << 46)
>  #define VTD_ECAP_FLTS               (1ULL << 47)
>  
> @@ -314,6 +321,29 @@ typedef enum VTDFaultReason {
>  
>  #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
>  
> +/* VCCAP_REG */
> +#define VTD_VCCAP_PAS               (1UL << 0)
> +#define VTD_MIN_HPASID              200

Comment this value a bit?

> +#define VTD_MAX_HPASID              0xFFFFF
> +
> +/* Virtual Command Register */
> +enum {
> +     VTD_VCMD_NULL_CMD = 0,
> +     VTD_VCMD_ALLOC_PASID,

Shall we spell " = 1" explicitly if defined in spec?

> +     VTD_VCMD_FREE_PASID,

Same here.

Regards,
Yi Liu July 10, 2019, 11:51 a.m. UTC | #2
> From: Peter Xu [mailto:zhexu@redhat.com]
> Sent: Tuesday, July 9, 2019 11:19 AM
> Subject: Re: [RFC v1 06/18] intel_iommu: support virtual command emulation and
> pasid request
> 
> On Fri, Jul 05, 2019 at 07:01:39PM +0800, Liu Yi L wrote:
> > This patch adds virtual command support to Intel vIOMMU per Intel VT-d 3.1
> > spec. This patch adds two virtual commands: alloc_pasid and free_pasid.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > ---
> >  hw/i386/intel_iommu.c          | 139
> ++++++++++++++++++++++++++++++++++++++++-
> >  hw/i386/intel_iommu_internal.h |  30 +++++++++
> >  hw/i386/trace-events           |   1 +
> >  include/hw/i386/intel_iommu.h  |   6 +-
> >  4 files changed, 174 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 3160a05..3cf250d 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -932,11 +932,19 @@ static VTDBus
> *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> >                  s->vtd_as_by_bus_num[bus_num] = vtd_bus;
> >                  return vtd_bus;
> >              }
> > +            vtd_bus = NULL;
> 
> Can move to ...
> >          }
> 
> ... here?

yes, it is. could save variable assignments.

> >      }
> >      return vtd_bus;
> >  }
> >
> > +static PCIBus *vtd_find_pci_bus_from_bus_num(IntelIOMMUState *s,
> > +                                             uint8_t bus_num)
> > +{
> > +    VTDBus *vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
> > +    return vtd_bus ? vtd_bus->bus : NULL;
> > +}
> > +
> >  /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
> >   * of the translation, can be used for deciding the size of large page.
> >   */
> > @@ -2579,6 +2587,103 @@ static void vtd_handle_iectl_write(IntelIOMMUState
> *s)
> >      }
> >  }
> >
> > +static int vtd_request_pasid_alloc(IntelIOMMUState *s)
> > +{
> > +    PCIBus *bus;
> > +    int bus_n, devfn;
> > +
> > +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> > +        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
> > +        if (!bus) {
> > +            continue;
> > +        }
> > +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> > +            if (pci_device_is_ops_set(bus, devfn)) {
> > +                return pci_device_request_pasid_alloc(bus, devfn,
> > +                                                      VTD_MIN_HPASID,
> > +                                                      VTD_MAX_HPASID);
> 
> Ah so here I see why pci_device_is_ops_set() is necessary... you
> wanted to find a device that is vfio-pci and supports PASID.  This is
> a bit awkward but indeed I don't know what's a better option to make
> it a clearer interface if we can't let IOMMU to talk directly to vfio.

yes, it is.

> THe thing is that VFIO_IOMMU_PASID_REQUEST seems to be defined per
> VFIO container, while VT-d spec is of course defining PASID allocation
> as globally. 

right.

> More context on how the pasid address space will be
> defined and considerations behind (not only for this series, but for
> the big picture of SVA work) would be greatly welcomed.

already noticed in other replies. let's align one by one.

> > +            }
> > +        }
> > +    }
> > +    return -1;
> > +}
> > +
> > +static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
> > +{
> > +    PCIBus *bus;
> > +    int bus_n, devfn;
> > +
> > +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> > +        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
> > +        if (!bus) {
> > +            continue;
> > +        }
> > +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> > +            if (pci_device_is_ops_set(bus, devfn)) {
> > +                return pci_device_request_pasid_free(bus, devfn, pasid);
> > +            }
> > +        }
> > +    }
> > +    return -1;
> > +}
> > +
> > +/* Handle write to Virtual Command Register */
> > +static void vtd_handle_vcmd_write(IntelIOMMUState *s)
> > +{
> > +    uint32_t status = vtd_get_long_raw(s, DMAR_VCRSP_REG);
> > +    uint32_t val = vtd_get_long_raw(s, DMAR_VCMD_REG);
> > +    uint32_t pasid;
> > +    int ret = -1;
> > +
> > +    trace_vtd_reg_write_vcmd(status, val);
> 
> Could we use s->vcrsp directly instead of using DMAR_VCRSP_REG?

yes, I think so.

> > +
> > +    switch (val & VTD_VCMD_CMD_MASK) {
> > +    case VTD_VCMD_ALLOC_PASID:
> > +        if (!(s->vccap & VTD_VCCAP_PAS) ||
> > +             (s->vcrsp & 1)) {
> 
> Nit: we can consider to offer some helpers for them.

will have helper for them.

> Also, I think we should check vcrsp&1 at the entry for all vcmds. [1]

Agreed.

> > +            break;
> > +        }
> > +        s->vcrsp = 1;
> > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                         ((uint64_t) s->vcrsp));
> 
> Do we really need to emulate the "In Progress" like this?  The vcpu is
> blocked here after all, and AFAICT all the rest of vcpus should not
> access these registers because obviously these registers cannot be
> accessed concurrently...

Other vcpus should poll the IP bit before submitting vcmds. As IP bit
is set, other vcpus will not access these bits. but if not, they may submit
new vcmds, while we only have 1 response register, that is not we
support. That's why we need to set IP bit.

> 
> I think the IP bit is useful when some new vcmd would take plenty of
> time so that we can do the long vcmds in async way.  However here it
> seems not the case?

no, so far, it is synchronize way. As mentioned above, IP bit is to ensure
only one vcmd is handled for a time. Other vcpus won't be able to submit
vcmds before IP is cleared.

> > +        ret = vtd_request_pasid_alloc(s);
> > +        if (ret < 0) {
> > +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
> > +        } else {
> > +            s->vcrsp |= VTD_VCRSP_RSLT(ret);
> > +        }
> > +        s->vcrsp &= (~((uint64_t)(0x1)));
> > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                         ((uint64_t) s->vcrsp));
> > +        break;
> > +
> > +    case VTD_VCMD_FREE_PASID:
> > +        if (!(s->vccap & VTD_VCCAP_PAS) ||
> > +             (s->vcrsp & 1)) {
> > +            break;
> > +        }
> > +        s->vcrsp &= 1;
> > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                         ((uint64_t) s->vcrsp));
> 
> Same here on IP bit emulation.  IMHO we can drop these and this
> function can be greatly simplified.  Your call. :)
> 
> > +        pasid = VTD_VCMD_PASID_VALUE(val);
> > +        ret = vtd_request_pasid_free(s, pasid);
> > +        if (ret < 0) {
> > +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
> > +        }
> > +        s->vcrsp &= (~((uint64_t)(0x1)));
> > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                         ((uint64_t) s->vcrsp));
> > +        break;
> > +
> > +    default:
> > +        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
> 
> (IMHO you can simply do s/|=/=/ here if you handle IP well at the
>  entry of the function)
> 
> > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > +                         ((uint64_t) s->vcrsp));
> > +        printf("Virtual Command: unsupported command!!!\n");
> > +        break;
> > +    }
> > +}
> > +
> >  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
> >  {
> >      IntelIOMMUState *s = opaque;
> > @@ -2620,6 +2725,15 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr
> addr, unsigned size)
> >          val = s->iq >> 32;
> >          break;
> >
> > +    case DMAR_VCRSP_REG:
> > +        val = s->vcrsp;
> > +        break;
> > +
> > +    case DMAR_VCRSP_REG_HI:
> > +        assert(size == 4);
> > +        val = s->vcrsp >> 32;
> > +        break;
> 
> If you're always with vtd_set_quad_raw()s then IMHO you can drop these
> lines?  vtd_mem_read() has a default to handle all these.

aha, yes. nice suggestion.

> > +
> >      default:
> >          if (size == 4) {
> >              val = vtd_get_long(s, addr);
> > @@ -2868,6 +2982,21 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
> >          vtd_set_long(s, addr, val);
> >          break;
> >
> > +    case DMAR_VCMD_REG:
> > +        if (size == 4) {
> > +            vtd_set_long(s, addr, val);
> > +        } else {
> > +            vtd_set_quad(s, addr, val);
> > +        }
> > +        vtd_handle_vcmd_write(s);
> 
> IMHO you should do vtd_handle_vcmd_write() first and let it return a
> value, when returning true you update the regisers using vtd_set_*()
> otherwise you should skip (e.g., when IP is set in vcmd result reg).

Good. Let me refine the logic here.

> > +        break;
> > +
> > +    case DMAR_VCMD_REG_HI:
> > +        assert(size == 4);
> > +        vtd_set_long(s, addr, val);
> > +        vtd_handle_vcmd_write(s);
> 
> Same here?

Accepted.

> > +        break;
> > +
> >      default:
> >          if (size == 4) {
> >              vtd_set_long(s, addr, val);
> > @@ -3579,7 +3708,8 @@ static void vtd_init(IntelIOMMUState *s)
> >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
> >          } else if (!strcmp(s->sm_model, "scalable")) {
> >              s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> > -                       | VTD_ECAP_FLTS;
> > +                       | VTD_ECAP_FLTS | VTD_ECAP_VCS;
> > +            s->vccap |= VTD_VCCAP_PAS;
> >          } else {
> >              printf("\n!!!!! Invalid sm_model config !!!!!\n"
> >                  "Please config sm_model=[\"legacy\"|\"scalable\"]\n"
> > @@ -3641,6 +3771,13 @@ static void vtd_init(IntelIOMMUState *s)
> >       * Interrupt remapping registers.
> >       */
> >      vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
> > +
> > +    /*
> > +     * Virtual Command Definitions
> > +     */
> > +    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
> > +    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
> > +    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
> >  }
> >
> >  /* Should not reset address_spaces when reset because devices will still use
> > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > index adae198..f5a2f0d 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -85,6 +85,12 @@
> >  #define DMAR_MTRRCAP_REG_HI     0x104
> >  #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
> >  #define DMAR_MTRRDEF_REG_HI     0x10c
> > +#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
> > +#define DMAR_VCCAP_REG_HI       0xE04
> > +#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
> > +#define DMAR_VCMD_REG_HI        0xE14
> > +#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Reponse Register */
> > +#define DMAR_VCRSP_REG_HI       0xE24
> >
> >  /* IOTLB registers */
> >  #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
> > @@ -192,6 +198,7 @@
> >  #define VTD_ECAP_SRS                (1ULL << 31)
> >  #define VTD_ECAP_PASID              (1ULL << 40)
> >  #define VTD_ECAP_SMTS               (1ULL << 43)
> > +#define VTD_ECAP_VCS                (1ULL << 44)
> >  #define VTD_ECAP_SLTS               (1ULL << 46)
> >  #define VTD_ECAP_FLTS               (1ULL << 47)
> >
> > @@ -314,6 +321,29 @@ typedef enum VTDFaultReason {
> >
> >  #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
> >
> > +/* VCCAP_REG */
> > +#define VTD_VCCAP_PAS               (1UL << 0)
> > +#define VTD_MIN_HPASID              200
> 
> Comment this value a bit?

The basic idea is to let hypervisor to set a range for available PASIDs for
VMs. One of the reasons is PASID #0 is reserved by RID_PASID usage.
We have no idea how many reserved PASIDs in future, so here just a
evaluated value. Honestly, set it as "1" is enough at current stage.

> > +#define VTD_MAX_HPASID              0xFFFFF
> > +
> > +/* Virtual Command Register */
> > +enum {
> > +     VTD_VCMD_NULL_CMD = 0,
> > +     VTD_VCMD_ALLOC_PASID,
> 
> Shall we spell " = 1" explicitly if defined in spec?

yes, it is.

> > +     VTD_VCMD_FREE_PASID,
> 
> Same here.

Accepted.

> 
> Regards,
> 
> --
> Peter Xu

Thanks,
Yi Liu
Peter Xu July 11, 2019, 1:13 a.m. UTC | #3
On Wed, Jul 10, 2019 at 11:51:17AM +0000, Liu, Yi L wrote:

[...]

> > > +        s->vcrsp = 1;
> > > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > > +                         ((uint64_t) s->vcrsp));
> > 
> > Do we really need to emulate the "In Progress" like this?  The vcpu is
> > blocked here after all, and AFAICT all the rest of vcpus should not
> > access these registers because obviously these registers cannot be
> > accessed concurrently...
> 
> Other vcpus should poll the IP bit before submitting vcmds. As IP bit
> is set, other vcpus will not access these bits. but if not, they may submit
> new vcmds, while we only have 1 response register, that is not we
> support. That's why we need to set IP bit.

I still don't think another CPU can use this register even if it
polled with IP==0...  The reason is simply as you described - we only
have one pair of VCMD/VRSPD registers so IMHO the guest IOMMU driver
must have a lock (probably a mutex) to guarantee sequential access of
these registers otherwise race can happen.

> 
> > 
> > I think the IP bit is useful when some new vcmd would take plenty of
> > time so that we can do the long vcmds in async way.  However here it
> > seems not the case?
> 
> no, so far, it is synchronize way. As mentioned above, IP bit is to ensure
> only one vcmd is handled for a time. Other vcpus won't be able to submit
> vcmds before IP is cleared.

[...]

> > > @@ -192,6 +198,7 @@
> > >  #define VTD_ECAP_SRS                (1ULL << 31)
> > >  #define VTD_ECAP_PASID              (1ULL << 40)
> > >  #define VTD_ECAP_SMTS               (1ULL << 43)
> > > +#define VTD_ECAP_VCS                (1ULL << 44)
> > >  #define VTD_ECAP_SLTS               (1ULL << 46)
> > >  #define VTD_ECAP_FLTS               (1ULL << 47)
> > >
> > > @@ -314,6 +321,29 @@ typedef enum VTDFaultReason {
> > >
> > >  #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
> > >
> > > +/* VCCAP_REG */
> > > +#define VTD_VCCAP_PAS               (1UL << 0)
> > > +#define VTD_MIN_HPASID              200
> > 
> > Comment this value a bit?
> 
> The basic idea is to let hypervisor to set a range for available PASIDs for
> VMs. One of the reasons is PASID #0 is reserved by RID_PASID usage.
> We have no idea how many reserved PASIDs in future, so here just a
> evaluated value. Honestly, set it as "1" is enough at current stage.

That'll be a very nice initial comment for that (I mean, put it into
the patch, of course :).

Regards,
Yi Liu July 11, 2019, 6:59 a.m. UTC | #4
> From: Peter Xu [mailto:zhexu@redhat.com]
> Sent: Thursday, July 11, 2019 9:13 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v1 06/18] intel_iommu: support virtual command emulation and
> pasid request
> 
> On Wed, Jul 10, 2019 at 11:51:17AM +0000, Liu, Yi L wrote:
> 
> [...]
> 
> > > > +        s->vcrsp = 1;
> > > > +        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> > > > +                         ((uint64_t) s->vcrsp));
> > >
> > > Do we really need to emulate the "In Progress" like this?  The vcpu is
> > > blocked here after all, and AFAICT all the rest of vcpus should not
> > > access these registers because obviously these registers cannot be
> > > accessed concurrently...
> >
> > Other vcpus should poll the IP bit before submitting vcmds. As IP bit
> > is set, other vcpus will not access these bits. but if not, they may submit
> > new vcmds, while we only have 1 response register, that is not we
> > support. That's why we need to set IP bit.
> 
> I still don't think another CPU can use this register even if it
> polled with IP==0...  The reason is simply as you described - we only
> have one pair of VCMD/VRSPD registers so IMHO the guest IOMMU driver
> must have a lock (probably a mutex) to guarantee sequential access of
> these registers otherwise race can happen.

Got it. So the case here is: other vcpus will not be able to access the VCMD/
VRSP due the lock in guest iommu driver. So IP bit is only used to block any
further VCMDs from the same vcpu which gained the lock. But we are emulating
VCMD/VRSP in a synchronize manner, so vcpu has no way to submit new VCMDs
before a prior VMCD is completed.

> >
> > >
> > > I think the IP bit is useful when some new vcmd would take plenty of
> > > time so that we can do the long vcmds in async way.  However here it
> > > seems not the case?
> >
> > no, so far, it is synchronize way. As mentioned above, IP bit is to ensure
> > only one vcmd is handled for a time. Other vcpus won't be able to submit
> > vcmds before IP is cleared.
> 
> [...]
> 
> > > > @@ -192,6 +198,7 @@
> > > >  #define VTD_ECAP_SRS                (1ULL << 31)
> > > >  #define VTD_ECAP_PASID              (1ULL << 40)
> > > >  #define VTD_ECAP_SMTS               (1ULL << 43)
> > > > +#define VTD_ECAP_VCS                (1ULL << 44)
> > > >  #define VTD_ECAP_SLTS               (1ULL << 46)
> > > >  #define VTD_ECAP_FLTS               (1ULL << 47)
> > > >
> > > > @@ -314,6 +321,29 @@ typedef enum VTDFaultReason {
> > > >
> > > >  #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
> > > >
> > > > +/* VCCAP_REG */
> > > > +#define VTD_VCCAP_PAS               (1UL << 0)
> > > > +#define VTD_MIN_HPASID              200
> > >
> > > Comment this value a bit?
> >
> > The basic idea is to let hypervisor to set a range for available PASIDs for
> > VMs. One of the reasons is PASID #0 is reserved by RID_PASID usage.
> > We have no idea how many reserved PASIDs in future, so here just a
> > evaluated value. Honestly, set it as "1" is enough at current stage.
> 
> That'll be a very nice initial comment for that (I mean, put it into
> the patch, of course :).

Got it. will add it in next version.

Thanks,
Yi Liu
diff mbox series

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 3160a05..3cf250d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -932,11 +932,19 @@  static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
                 s->vtd_as_by_bus_num[bus_num] = vtd_bus;
                 return vtd_bus;
             }
+            vtd_bus = NULL;
         }
     }
     return vtd_bus;
 }
 
+static PCIBus *vtd_find_pci_bus_from_bus_num(IntelIOMMUState *s,
+                                             uint8_t bus_num)
+{
+    VTDBus *vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
+    return vtd_bus ? vtd_bus->bus : NULL;
+}
+
 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
@@ -2579,6 +2587,103 @@  static void vtd_handle_iectl_write(IntelIOMMUState *s)
     }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s)
+{
+    PCIBus *bus;
+    int bus_n, devfn;
+
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
+        if (!bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            if (pci_device_is_ops_set(bus, devfn)) {
+                return pci_device_request_pasid_alloc(bus, devfn,
+                                                      VTD_MIN_HPASID,
+                                                      VTD_MAX_HPASID);
+            }
+        }
+    }
+    return -1;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+    PCIBus *bus;
+    int bus_n, devfn;
+
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        bus = vtd_find_pci_bus_from_bus_num(s, bus_n);
+        if (!bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            if (pci_device_is_ops_set(bus, devfn)) {
+                return pci_device_request_pasid_free(bus, devfn, pasid);
+            }
+        }
+    }
+    return -1;
+}
+
+/* Handle write to Virtual Command Register */
+static void vtd_handle_vcmd_write(IntelIOMMUState *s)
+{
+    uint32_t status = vtd_get_long_raw(s, DMAR_VCRSP_REG);
+    uint32_t val = vtd_get_long_raw(s, DMAR_VCMD_REG);
+    uint32_t pasid;
+    int ret = -1;
+
+    trace_vtd_reg_write_vcmd(status, val);
+
+    switch (val & VTD_VCMD_CMD_MASK) {
+    case VTD_VCMD_ALLOC_PASID:
+        if (!(s->vccap & VTD_VCCAP_PAS) ||
+             (s->vcrsp & 1)) {
+            break;
+        }
+        s->vcrsp = 1;
+        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                         ((uint64_t) s->vcrsp));
+        ret = vtd_request_pasid_alloc(s);
+        if (ret < 0) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+        } else {
+            s->vcrsp |= VTD_VCRSP_RSLT(ret);
+        }
+        s->vcrsp &= (~((uint64_t)(0x1)));
+        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                         ((uint64_t) s->vcrsp));
+        break;
+
+    case VTD_VCMD_FREE_PASID:
+        if (!(s->vccap & VTD_VCCAP_PAS) ||
+             (s->vcrsp & 1)) {
+            break;
+        }
+        s->vcrsp &= 1;
+        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                         ((uint64_t) s->vcrsp));
+        pasid = VTD_VCMD_PASID_VALUE(val);
+        ret = vtd_request_pasid_free(s, pasid);
+        if (ret < 0) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+        }
+        s->vcrsp &= (~((uint64_t)(0x1)));
+        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                         ((uint64_t) s->vcrsp));
+        break;
+
+    default:
+        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+        vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                         ((uint64_t) s->vcrsp));
+        printf("Virtual Command: unsupported command!!!\n");
+        break;
+    }
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
     IntelIOMMUState *s = opaque;
@@ -2620,6 +2725,15 @@  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
         val = s->iq >> 32;
         break;
 
+    case DMAR_VCRSP_REG:
+        val = s->vcrsp;
+        break;
+
+    case DMAR_VCRSP_REG_HI:
+        assert(size == 4);
+        val = s->vcrsp >> 32;
+        break;
+
     default:
         if (size == 4) {
             val = vtd_get_long(s, addr);
@@ -2868,6 +2982,21 @@  static void vtd_mem_write(void *opaque, hwaddr addr,
         vtd_set_long(s, addr, val);
         break;
 
+    case DMAR_VCMD_REG:
+        if (size == 4) {
+            vtd_set_long(s, addr, val);
+        } else {
+            vtd_set_quad(s, addr, val);
+        }
+        vtd_handle_vcmd_write(s);
+        break;
+
+    case DMAR_VCMD_REG_HI:
+        assert(size == 4);
+        vtd_set_long(s, addr, val);
+        vtd_handle_vcmd_write(s);
+        break;
+
     default:
         if (size == 4) {
             vtd_set_long(s, addr, val);
@@ -3579,7 +3708,8 @@  static void vtd_init(IntelIOMMUState *s)
             s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
         } else if (!strcmp(s->sm_model, "scalable")) {
             s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
-                       | VTD_ECAP_FLTS;
+                       | VTD_ECAP_FLTS | VTD_ECAP_VCS;
+            s->vccap |= VTD_VCCAP_PAS;
         } else {
             printf("\n!!!!! Invalid sm_model config !!!!!\n"
                 "Please config sm_model=[\"legacy\"|\"scalable\"]\n"
@@ -3641,6 +3771,13 @@  static void vtd_init(IntelIOMMUState *s)
      * Interrupt remapping registers.
      */
     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+    /*
+     * Virtual Command Definitions
+     */
+    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
+    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
+    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
 }
 
 /* Should not reset address_spaces when reset because devices will still use
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index adae198..f5a2f0d 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -85,6 +85,12 @@ 
 #define DMAR_MTRRCAP_REG_HI     0x104
 #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
 #define DMAR_MTRRDEF_REG_HI     0x10c
+#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
+#define DMAR_VCCAP_REG_HI       0xE04
+#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
+#define DMAR_VCMD_REG_HI        0xE14
+#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Reponse Register */
+#define DMAR_VCRSP_REG_HI       0xE24
 
 /* IOTLB registers */
 #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
@@ -192,6 +198,7 @@ 
 #define VTD_ECAP_SRS                (1ULL << 31)
 #define VTD_ECAP_PASID              (1ULL << 40)
 #define VTD_ECAP_SMTS               (1ULL << 43)
+#define VTD_ECAP_VCS                (1ULL << 44)
 #define VTD_ECAP_SLTS               (1ULL << 46)
 #define VTD_ECAP_FLTS               (1ULL << 47)
 
@@ -314,6 +321,29 @@  typedef enum VTDFaultReason {
 
 #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
 
+/* VCCAP_REG */
+#define VTD_VCCAP_PAS               (1UL << 0)
+#define VTD_MIN_HPASID              200
+#define VTD_MAX_HPASID              0xFFFFF
+
+/* Virtual Command Register */
+enum {
+     VTD_VCMD_NULL_CMD = 0,
+     VTD_VCMD_ALLOC_PASID,
+     VTD_VCMD_FREE_PASID,
+     VTD_VCMD_CMD_NUM,
+};
+
+#define VTD_VCMD_CMD_MASK           0xffUL
+#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
+
+#define VTD_VCRSP_RSLT(val)         ((val) << 8)
+#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
+
+#define VTD_VCMD_UNDEFINED_CMD         1ULL
+#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
+#define VTD_VCMD_FREE_INVALID_PASID    2ULL
+
 /* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */
 struct VTDInvDescIEC {
     uint32_t type:4;            /* Should always be 0x4 */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index c8bc464..43c0314 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -51,6 +51,7 @@  vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"
 vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_ics_clear_ip(void) ""
+vtd_reg_write_vcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32
 vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64
 vtd_dmar_enable(bool en) "enable %d"
 vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d"
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b51cc9f..4b74f3d 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -46,7 +46,7 @@ 
 #define VTD_SID_TO_BUS(sid)         (((sid) >> 8) & 0xff)
 #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)
 
-#define DMAR_REG_SIZE               0x230
+#define DMAR_REG_SIZE               0xF00
 #define VTD_HOST_AW_39BIT           39
 #define VTD_HOST_AW_48BIT           48
 #define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
@@ -272,6 +272,10 @@  struct IntelIOMMUState {
     bool dma_drain;                 /* Whether DMA r/w draining enabled */
     char *sm_model;          /* identify actual scalable mode iommu model*/
 
+    /* Virtual Command Register */
+    uint64_t vccap;                 /* The value of vcmd capability reg */
+    uint64_t vcrsp;                 /* Current value of VCMD RSP REG */
+
     /*
      * Protects IOMMU states in general.  Currently it protects the
      * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.