diff mbox series

[v5,03/18] pci: isolated address space for PCI bus

Message ID 2971c1bec04acaac4eb3c1f2b104cbeabad01e22.1642626515.git.jag.raman@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user server in QEMU | expand

Commit Message

Jag Raman Jan. 19, 2022, 9:41 p.m. UTC
Allow PCI buses to be part of isolated CPU address spaces. This has a
niche usage.

TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
the same machine/server. This would cause address space collision as
well as be a security vulnerability. Having separate address spaces for
each PCI bus would solve this problem.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/hw/pci/pci.h     |  2 ++
 include/hw/pci/pci_bus.h | 17 +++++++++++++++++
 hw/pci/pci.c             | 17 +++++++++++++++++
 hw/pci/pci_bridge.c      |  5 +++++
 4 files changed, 41 insertions(+)

Comments

Michael S. Tsirkin Jan. 20, 2022, 12:12 a.m. UTC | #1
On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> Allow PCI buses to be part of isolated CPU address spaces. This has a
> niche usage.
> 
> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> the same machine/server. This would cause address space collision as
> well as be a security vulnerability. Having separate address spaces for
> each PCI bus would solve this problem.

Fascinating, but I am not sure I understand. any examples?

I also wonder whether this special type could be modelled like a special
kind of iommu internally.

> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/pci/pci.h     |  2 ++
>  include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>  hw/pci/pci.c             | 17 +++++++++++++++++
>  hw/pci/pci_bridge.c      |  5 +++++
>  4 files changed, 41 insertions(+)
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 023abc0f79..9bb4472abc 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>  int pci_device_load(PCIDevice *s, QEMUFile *f);
>  MemoryRegion *pci_address_space(PCIDevice *dev);
>  MemoryRegion *pci_address_space_io(PCIDevice *dev);
> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>  
>  /*
>   * Should not normally be used by devices. For use by sPAPR target
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 347440d42c..d78258e79e 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -39,9 +39,26 @@ struct PCIBus {
>      void *irq_opaque;
>      PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>      PCIDevice *parent_dev;
> +
>      MemoryRegion *address_space_mem;
>      MemoryRegion *address_space_io;
>  
> +    /**
> +     * Isolated address spaces - these allow the PCI bus to be part
> +     * of an isolated address space as opposed to the global
> +     * address_space_memory & address_space_io.

Are you sure address_space_memory & address_space_io are
always global? even in the case of an iommu?

> This allows the
> +     * bus to be attached to CPUs from different machines. The
> +     * following is not used used commonly.
> +     *
> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
> +     * VM clients,

what are VM clients?

> as such it needs the PCI buses in the same machine
> +     * to be part of different CPU address spaces. The following is
> +     * useful in that scenario.
> +     *
> +     */
> +    AddressSpace *isol_as_mem;
> +    AddressSpace *isol_as_io;
> +
>      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>  
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 5d30f9ca60..d5f1c6c421 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>      bus->slot_reserved_mask = 0x0;
>      bus->address_space_mem = address_space_mem;
>      bus->address_space_io = address_space_io;
> +    bus->isol_as_mem = NULL;
> +    bus->isol_as_io = NULL;
>      bus->flags |= PCI_BUS_IS_ROOT;
>  
>      /* host bridge */
> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>      return pci_get_bus(dev)->address_space_io;
>  }
>  
> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
> +{
> +    return pci_get_bus(dev)->isol_as_mem;
> +}
> +
> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
> +{
> +    return pci_get_bus(dev)->isol_as_io;
> +}
> +
>  static void pci_device_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *k = DEVICE_CLASS(klass);
> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>  
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>  {
> +    AddressSpace *iommu_as = NULL;
>      PCIBus *bus = pci_get_bus(dev);
>      PCIBus *iommu_bus = bus;
>      uint8_t devfn = dev->devfn;
> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>      if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>          return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>      }
> +    iommu_as = pci_isol_as_mem(dev);
> +    if (iommu_as) {
> +        return iommu_as;
> +    }
>      return &address_space_memory;
>  }
>  
> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> index da34c8ebcd..98366768d2 100644
> --- a/hw/pci/pci_bridge.c
> +++ b/hw/pci/pci_bridge.c
> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>      sec_bus->address_space_io = &br->address_space_io;
>      memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>                         4 * GiB);
> +
> +    /* This PCI bridge puts the sec_bus in its parent's address space */
> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
> +
>      br->windows = pci_bridge_region_init(br);
>      QLIST_INIT(&sec_bus->child);
>      QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
> -- 
> 2.20.1
Jag Raman Jan. 20, 2022, 3:20 p.m. UTC | #2
> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
>> Allow PCI buses to be part of isolated CPU address spaces. This has a
>> niche usage.
>> 
>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
>> the same machine/server. This would cause address space collision as
>> well as be a security vulnerability. Having separate address spaces for
>> each PCI bus would solve this problem.
> 
> Fascinating, but I am not sure I understand. any examples?

Hi Michael!

multiprocess QEMU and vfio-user implement a client-server model to allow
out-of-process emulation of devices. The client QEMU, which makes ioctls
to the kernel and runs VCPUs, could attach devices running in a server
QEMU. The server QEMU needs access to parts of the client’s RAM to
perform DMA.

In the case where multiple clients attach devices that are running on the
same server, we need to ensure that each devices has isolated memory
ranges. This ensures that the memory space of one device is not visible
to other devices in the same server.
 
> 
> I also wonder whether this special type could be modelled like a special
> kind of iommu internally.

Could you please provide some more details on the design?

> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/pci/pci.h     |  2 ++
>> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>> hw/pci/pci.c             | 17 +++++++++++++++++
>> hw/pci/pci_bridge.c      |  5 +++++
>> 4 files changed, 41 insertions(+)
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 023abc0f79..9bb4472abc 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>> int pci_device_load(PCIDevice *s, QEMUFile *f);
>> MemoryRegion *pci_address_space(PCIDevice *dev);
>> MemoryRegion *pci_address_space_io(PCIDevice *dev);
>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
>> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>> 
>> /*
>>  * Should not normally be used by devices. For use by sPAPR target
>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>> index 347440d42c..d78258e79e 100644
>> --- a/include/hw/pci/pci_bus.h
>> +++ b/include/hw/pci/pci_bus.h
>> @@ -39,9 +39,26 @@ struct PCIBus {
>>     void *irq_opaque;
>>     PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>>     PCIDevice *parent_dev;
>> +
>>     MemoryRegion *address_space_mem;
>>     MemoryRegion *address_space_io;
>> 
>> +    /**
>> +     * Isolated address spaces - these allow the PCI bus to be part
>> +     * of an isolated address space as opposed to the global
>> +     * address_space_memory & address_space_io.
> 
> Are you sure address_space_memory & address_space_io are
> always global? even in the case of an iommu?

On the CPU side of the Root Complex, I believe address_space_memory
& address_space_io are global.

In the vfio-user case, devices on the same machine (TYPE_REMOTE_MACHINE)
could be attached to different clients VMs. Each client would have their own address
space for their CPUs. With isolated address spaces, we ensure that the devices
see the address space of the CPUs they’re attached to.

Not sure if it’s OK to share weblinks in this mailing list, please let me know if that’s
not preferred. But I’m referring to the terminology used in the following block diagram:
https://en.wikipedia.org/wiki/Root_complex#/media/File:Example_PCI_Express_Topology.svg

> 
>> This allows the
>> +     * bus to be attached to CPUs from different machines. The
>> +     * following is not used used commonly.
>> +     *
>> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
>> +     * VM clients,
> 
> what are VM clients?

It’s the client in the client - server model explained above.

Thank you!
--
Jag

> 
>> as such it needs the PCI buses in the same machine
>> +     * to be part of different CPU address spaces. The following is
>> +     * useful in that scenario.
>> +     *
>> +     */
>> +    AddressSpace *isol_as_mem;
>> +    AddressSpace *isol_as_io;
>> +
>>     QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>>     QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>> 
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index 5d30f9ca60..d5f1c6c421 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>>     bus->slot_reserved_mask = 0x0;
>>     bus->address_space_mem = address_space_mem;
>>     bus->address_space_io = address_space_io;
>> +    bus->isol_as_mem = NULL;
>> +    bus->isol_as_io = NULL;
>>     bus->flags |= PCI_BUS_IS_ROOT;
>> 
>>     /* host bridge */
>> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>>     return pci_get_bus(dev)->address_space_io;
>> }
>> 
>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
>> +{
>> +    return pci_get_bus(dev)->isol_as_mem;
>> +}
>> +
>> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
>> +{
>> +    return pci_get_bus(dev)->isol_as_io;
>> +}
>> +
>> static void pci_device_class_init(ObjectClass *klass, void *data)
>> {
>>     DeviceClass *k = DEVICE_CLASS(klass);
>> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>> 
>> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>> {
>> +    AddressSpace *iommu_as = NULL;
>>     PCIBus *bus = pci_get_bus(dev);
>>     PCIBus *iommu_bus = bus;
>>     uint8_t devfn = dev->devfn;
>> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>     if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>>         return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>>     }
>> +    iommu_as = pci_isol_as_mem(dev);
>> +    if (iommu_as) {
>> +        return iommu_as;
>> +    }
>>     return &address_space_memory;
>> }
>> 
>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>> index da34c8ebcd..98366768d2 100644
>> --- a/hw/pci/pci_bridge.c
>> +++ b/hw/pci/pci_bridge.c
>> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>>     sec_bus->address_space_io = &br->address_space_io;
>>     memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>>                        4 * GiB);
>> +
>> +    /* This PCI bridge puts the sec_bus in its parent's address space */
>> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
>> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
>> +
>>     br->windows = pci_bridge_region_init(br);
>>     QLIST_INIT(&sec_bus->child);
>>     QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
>> -- 
>> 2.20.1
Stefan Hajnoczi Jan. 25, 2022, 9:56 a.m. UTC | #3
On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> Allow PCI buses to be part of isolated CPU address spaces. This has a
> niche usage.
> 
> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> the same machine/server. This would cause address space collision as
> well as be a security vulnerability. Having separate address spaces for
> each PCI bus would solve this problem.
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/pci/pci.h     |  2 ++
>  include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>  hw/pci/pci.c             | 17 +++++++++++++++++
>  hw/pci/pci_bridge.c      |  5 +++++
>  4 files changed, 41 insertions(+)
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 023abc0f79..9bb4472abc 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>  int pci_device_load(PCIDevice *s, QEMUFile *f);
>  MemoryRegion *pci_address_space(PCIDevice *dev);
>  MemoryRegion *pci_address_space_io(PCIDevice *dev);
> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>  
>  /*
>   * Should not normally be used by devices. For use by sPAPR target
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 347440d42c..d78258e79e 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -39,9 +39,26 @@ struct PCIBus {
>      void *irq_opaque;
>      PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>      PCIDevice *parent_dev;
> +
>      MemoryRegion *address_space_mem;
>      MemoryRegion *address_space_io;

This seems like a good point to rename address_space_mem,
address_space_io, as well as PCIIORegion->address_space since they are
all MemoryRegions and not AddressSpaces. Names could be
mem_space_mr/io_space_mr and PCIIORegion->container_mr. This avoids
confusion with the actual AddressSpaces that are introduced in this
patch.

>  
> +    /**
> +     * Isolated address spaces - these allow the PCI bus to be part
> +     * of an isolated address space as opposed to the global
> +     * address_space_memory & address_space_io. This allows the
> +     * bus to be attached to CPUs from different machines. The
> +     * following is not used used commonly.
> +     *
> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
> +     * VM clients, as such it needs the PCI buses in the same machine
> +     * to be part of different CPU address spaces. The following is
> +     * useful in that scenario.
> +     *
> +     */
> +    AddressSpace *isol_as_mem;
> +    AddressSpace *isol_as_io;

Or use the pointers unconditionally and initialize them to the global
address_space_memory/address_space_io? That might simplify the code so
isolated address spaces is no longer a special case.

isol_as_io isn't used by this patch?

> +
>      QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>      QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>  
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 5d30f9ca60..d5f1c6c421 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>      bus->slot_reserved_mask = 0x0;
>      bus->address_space_mem = address_space_mem;
>      bus->address_space_io = address_space_io;
> +    bus->isol_as_mem = NULL;
> +    bus->isol_as_io = NULL;
>      bus->flags |= PCI_BUS_IS_ROOT;
>  
>      /* host bridge */
> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>      return pci_get_bus(dev)->address_space_io;
>  }
>  
> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
> +{
> +    return pci_get_bus(dev)->isol_as_mem;
> +}
> +
> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
> +{
> +    return pci_get_bus(dev)->isol_as_io;
> +}
> +
>  static void pci_device_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *k = DEVICE_CLASS(klass);
> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>  
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>  {
> +    AddressSpace *iommu_as = NULL;
>      PCIBus *bus = pci_get_bus(dev);
>      PCIBus *iommu_bus = bus;
>      uint8_t devfn = dev->devfn;
> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>      if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>          return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>      }
> +    iommu_as = pci_isol_as_mem(dev);
> +    if (iommu_as) {
> +        return iommu_as;
> +    }
>      return &address_space_memory;
>  }
>  

> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> index da34c8ebcd..98366768d2 100644
> --- a/hw/pci/pci_bridge.c
> +++ b/hw/pci/pci_bridge.c
> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>      sec_bus->address_space_io = &br->address_space_io;
>      memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>                         4 * GiB);
> +
> +    /* This PCI bridge puts the sec_bus in its parent's address space */
> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
> +
>      br->windows = pci_bridge_region_init(br);
>      QLIST_INIT(&sec_bus->child);
>      QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
> -- 
> 2.20.1
>
Jag Raman Jan. 25, 2022, 1:49 p.m. UTC | #4
> On Jan 25, 2022, at 4:56 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
>> Allow PCI buses to be part of isolated CPU address spaces. This has a
>> niche usage.
>> 
>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
>> the same machine/server. This would cause address space collision as
>> well as be a security vulnerability. Having separate address spaces for
>> each PCI bus would solve this problem.
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/pci/pci.h     |  2 ++
>> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>> hw/pci/pci.c             | 17 +++++++++++++++++
>> hw/pci/pci_bridge.c      |  5 +++++
>> 4 files changed, 41 insertions(+)
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 023abc0f79..9bb4472abc 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>> int pci_device_load(PCIDevice *s, QEMUFile *f);
>> MemoryRegion *pci_address_space(PCIDevice *dev);
>> MemoryRegion *pci_address_space_io(PCIDevice *dev);
>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
>> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>> 
>> /*
>>  * Should not normally be used by devices. For use by sPAPR target
>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>> index 347440d42c..d78258e79e 100644
>> --- a/include/hw/pci/pci_bus.h
>> +++ b/include/hw/pci/pci_bus.h
>> @@ -39,9 +39,26 @@ struct PCIBus {
>>     void *irq_opaque;
>>     PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>>     PCIDevice *parent_dev;
>> +
>>     MemoryRegion *address_space_mem;
>>     MemoryRegion *address_space_io;
> 
> This seems like a good point to rename address_space_mem,
> address_space_io, as well as PCIIORegion->address_space since they are
> all MemoryRegions and not AddressSpaces. Names could be
> mem_space_mr/io_space_mr and PCIIORegion->container_mr. This avoids
> confusion with the actual AddressSpaces that are introduced in this
> patch.

Are you referring to renaming address_space_mem, address_space_io and
PCIIORegion->address_space alone? I’m asking because there are many
other symbols in the code which are named similarly.

> 
>> 
>> +    /**
>> +     * Isolated address spaces - these allow the PCI bus to be part
>> +     * of an isolated address space as opposed to the global
>> +     * address_space_memory & address_space_io. This allows the
>> +     * bus to be attached to CPUs from different machines. The
>> +     * following is not used used commonly.
>> +     *
>> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
>> +     * VM clients, as such it needs the PCI buses in the same machine
>> +     * to be part of different CPU address spaces. The following is
>> +     * useful in that scenario.
>> +     *
>> +     */
>> +    AddressSpace *isol_as_mem;
>> +    AddressSpace *isol_as_io;
> 
> Or use the pointers unconditionally and initialize them to the global
> address_space_memory/address_space_io? That might simplify the code so
> isolated address spaces is no longer a special case.

I did start off with using these pointers unconditionally - but adopted an optional
isolated address space for the following reasons:
  - There is a potential for regression
  - CPU address space per bus is not a common scenario. In most case, all PCI
    buses are attached to CPU sharing the same address space. Therefore, an
    optional address space made sense for this special scenario

We can also set it unconditionally if you prefer, kindly confirm.

> 
> isol_as_io isn't used by this patch?

This patch introduces these variables, defines its getters and sets them to NULL in
places where new PCI buses are presently created. The following patch creates a
separate isolated address space:
[PATCH v5 04/18] pci: create and free isolated PCI buses

I could merge these patches if you prefer.

--
Jag

> 
>> +
>>     QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>>     QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>> 
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index 5d30f9ca60..d5f1c6c421 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>>     bus->slot_reserved_mask = 0x0;
>>     bus->address_space_mem = address_space_mem;
>>     bus->address_space_io = address_space_io;
>> +    bus->isol_as_mem = NULL;
>> +    bus->isol_as_io = NULL;
>>     bus->flags |= PCI_BUS_IS_ROOT;
>> 
>>     /* host bridge */
>> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>>     return pci_get_bus(dev)->address_space_io;
>> }
>> 
>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
>> +{
>> +    return pci_get_bus(dev)->isol_as_mem;
>> +}
>> +
>> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
>> +{
>> +    return pci_get_bus(dev)->isol_as_io;
>> +}
>> +
>> static void pci_device_class_init(ObjectClass *klass, void *data)
>> {
>>     DeviceClass *k = DEVICE_CLASS(klass);
>> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>> 
>> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>> {
>> +    AddressSpace *iommu_as = NULL;
>>     PCIBus *bus = pci_get_bus(dev);
>>     PCIBus *iommu_bus = bus;
>>     uint8_t devfn = dev->devfn;
>> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>     if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>>         return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>>     }
>> +    iommu_as = pci_isol_as_mem(dev);
>> +    if (iommu_as) {
>> +        return iommu_as;
>> +    }
>>     return &address_space_memory;
>> }
>> 
> 
>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>> index da34c8ebcd..98366768d2 100644
>> --- a/hw/pci/pci_bridge.c
>> +++ b/hw/pci/pci_bridge.c
>> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>>     sec_bus->address_space_io = &br->address_space_io;
>>     memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>>                        4 * GiB);
>> +
>> +    /* This PCI bridge puts the sec_bus in its parent's address space */
>> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
>> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
>> +
>>     br->windows = pci_bridge_region_init(br);
>>     QLIST_INIT(&sec_bus->child);
>>     QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
>> -- 
>> 2.20.1
>>
Stefan Hajnoczi Jan. 25, 2022, 2:19 p.m. UTC | #5
On Tue, Jan 25, 2022 at 01:49:23PM +0000, Jag Raman wrote:
> 
> 
> > On Jan 25, 2022, at 4:56 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> >> Allow PCI buses to be part of isolated CPU address spaces. This has a
> >> niche usage.
> >> 
> >> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> >> the same machine/server. This would cause address space collision as
> >> well as be a security vulnerability. Having separate address spaces for
> >> each PCI bus would solve this problem.
> >> 
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> include/hw/pci/pci.h     |  2 ++
> >> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
> >> hw/pci/pci.c             | 17 +++++++++++++++++
> >> hw/pci/pci_bridge.c      |  5 +++++
> >> 4 files changed, 41 insertions(+)
> >> 
> >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >> index 023abc0f79..9bb4472abc 100644
> >> --- a/include/hw/pci/pci.h
> >> +++ b/include/hw/pci/pci.h
> >> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
> >> int pci_device_load(PCIDevice *s, QEMUFile *f);
> >> MemoryRegion *pci_address_space(PCIDevice *dev);
> >> MemoryRegion *pci_address_space_io(PCIDevice *dev);
> >> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
> >> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
> >> 
> >> /*
> >>  * Should not normally be used by devices. For use by sPAPR target
> >> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> >> index 347440d42c..d78258e79e 100644
> >> --- a/include/hw/pci/pci_bus.h
> >> +++ b/include/hw/pci/pci_bus.h
> >> @@ -39,9 +39,26 @@ struct PCIBus {
> >>     void *irq_opaque;
> >>     PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
> >>     PCIDevice *parent_dev;
> >> +
> >>     MemoryRegion *address_space_mem;
> >>     MemoryRegion *address_space_io;
> > 
> > This seems like a good point to rename address_space_mem,
> > address_space_io, as well as PCIIORegion->address_space since they are
> > all MemoryRegions and not AddressSpaces. Names could be
> > mem_space_mr/io_space_mr and PCIIORegion->container_mr. This avoids
> > confusion with the actual AddressSpaces that are introduced in this
> > patch.
> 
> Are you referring to renaming address_space_mem, address_space_io and
> PCIIORegion->address_space alone? I’m asking because there are many
> other symbols in the code which are named similarly.

I only see those symbols in hw/pci/pci.c. Which ones were you thinking
about?

> > 
> >> 
> >> +    /**
> >> +     * Isolated address spaces - these allow the PCI bus to be part
> >> +     * of an isolated address space as opposed to the global
> >> +     * address_space_memory & address_space_io. This allows the
> >> +     * bus to be attached to CPUs from different machines. The
> >> +     * following is not used used commonly.
> >> +     *
> >> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
> >> +     * VM clients, as such it needs the PCI buses in the same machine
> >> +     * to be part of different CPU address spaces. The following is
> >> +     * useful in that scenario.
> >> +     *
> >> +     */
> >> +    AddressSpace *isol_as_mem;
> >> +    AddressSpace *isol_as_io;
> > 
> > Or use the pointers unconditionally and initialize them to the global
> > address_space_memory/address_space_io? That might simplify the code so
> > isolated address spaces is no longer a special case.
> 
> I did start off with using these pointers unconditionally - but adopted an optional
> isolated address space for the following reasons:
>   - There is a potential for regression
>   - CPU address space per bus is not a common scenario. In most case, all PCI
>     buses are attached to CPU sharing the same address space. Therefore, an
>     optional address space made sense for this special scenario
> 
> We can also set it unconditionally if you prefer, kindly confirm.

It's a matter of taste. I don't have a strong opinion on it but
personally I would try to make it unconditional. I think the risk of
regressions is low and the code complexity will be lower than making it
a special case. If you wanted to keep it as is, that's fine.

> 
> > 
> > isol_as_io isn't used by this patch?
> 
> This patch introduces these variables, defines its getters and sets them to NULL in
> places where new PCI buses are presently created. The following patch creates a
> separate isolated address space:
> [PATCH v5 04/18] pci: create and free isolated PCI buses
> 
> I could merge these patches if you prefer.

The only place I saw that reads isol_as_io is "[PATCH v5 15/18]
vfio-user: handle PCI BAR accesses", but that's for PCI I/O Space
accesses. Did I miss where I/O Space BARs are mapped into isol_as_io?

Stefan
Dr. David Alan Gilbert Jan. 25, 2022, 6:38 p.m. UTC | #6
* Jag Raman (jag.raman@oracle.com) wrote:
> 
> 
> > On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > 
> > On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> >> Allow PCI buses to be part of isolated CPU address spaces. This has a
> >> niche usage.
> >> 
> >> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> >> the same machine/server. This would cause address space collision as
> >> well as be a security vulnerability. Having separate address spaces for
> >> each PCI bus would solve this problem.
> > 
> > Fascinating, but I am not sure I understand. any examples?
> 
> Hi Michael!
> 
> multiprocess QEMU and vfio-user implement a client-server model to allow
> out-of-process emulation of devices. The client QEMU, which makes ioctls
> to the kernel and runs VCPUs, could attach devices running in a server
> QEMU. The server QEMU needs access to parts of the client’s RAM to
> perform DMA.

Do you ever have the opposite problem? i.e. when an emulated PCI device
exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
that the client can see.  What happens if two emulated devices need to
access each others emulated address space?

Dave

> In the case where multiple clients attach devices that are running on the
> same server, we need to ensure that each devices has isolated memory
> ranges. This ensures that the memory space of one device is not visible
> to other devices in the same server.
>  
> > 
> > I also wonder whether this special type could be modelled like a special
> > kind of iommu internally.
> 
> Could you please provide some more details on the design?
> 
> > 
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> include/hw/pci/pci.h     |  2 ++
> >> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
> >> hw/pci/pci.c             | 17 +++++++++++++++++
> >> hw/pci/pci_bridge.c      |  5 +++++
> >> 4 files changed, 41 insertions(+)
> >> 
> >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >> index 023abc0f79..9bb4472abc 100644
> >> --- a/include/hw/pci/pci.h
> >> +++ b/include/hw/pci/pci.h
> >> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
> >> int pci_device_load(PCIDevice *s, QEMUFile *f);
> >> MemoryRegion *pci_address_space(PCIDevice *dev);
> >> MemoryRegion *pci_address_space_io(PCIDevice *dev);
> >> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
> >> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
> >> 
> >> /*
> >>  * Should not normally be used by devices. For use by sPAPR target
> >> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> >> index 347440d42c..d78258e79e 100644
> >> --- a/include/hw/pci/pci_bus.h
> >> +++ b/include/hw/pci/pci_bus.h
> >> @@ -39,9 +39,26 @@ struct PCIBus {
> >>     void *irq_opaque;
> >>     PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
> >>     PCIDevice *parent_dev;
> >> +
> >>     MemoryRegion *address_space_mem;
> >>     MemoryRegion *address_space_io;
> >> 
> >> +    /**
> >> +     * Isolated address spaces - these allow the PCI bus to be part
> >> +     * of an isolated address space as opposed to the global
> >> +     * address_space_memory & address_space_io.
> > 
> > Are you sure address_space_memory & address_space_io are
> > always global? even in the case of an iommu?
> 
> On the CPU side of the Root Complex, I believe address_space_memory
> & address_space_io are global.
> 
> In the vfio-user case, devices on the same machine (TYPE_REMOTE_MACHINE)
> could be attached to different clients VMs. Each client would have their own address
> space for their CPUs. With isolated address spaces, we ensure that the devices
> see the address space of the CPUs they’re attached to.
> 
> Not sure if it’s OK to share weblinks in this mailing list, please let me know if that’s
> not preferred. But I’m referring to the terminology used in the following block diagram:
> https://en.wikipedia.org/wiki/Root_complex#/media/File:Example_PCI_Express_Topology.svg
> 
> > 
> >> This allows the
> >> +     * bus to be attached to CPUs from different machines. The
> >> +     * following is not used used commonly.
> >> +     *
> >> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
> >> +     * VM clients,
> > 
> > what are VM clients?
> 
> It’s the client in the client - server model explained above.
> 
> Thank you!
> --
> Jag
> 
> > 
> >> as such it needs the PCI buses in the same machine
> >> +     * to be part of different CPU address spaces. The following is
> >> +     * useful in that scenario.
> >> +     *
> >> +     */
> >> +    AddressSpace *isol_as_mem;
> >> +    AddressSpace *isol_as_io;
> >> +
> >>     QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> >>     QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> >> 
> >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >> index 5d30f9ca60..d5f1c6c421 100644
> >> --- a/hw/pci/pci.c
> >> +++ b/hw/pci/pci.c
> >> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
> >>     bus->slot_reserved_mask = 0x0;
> >>     bus->address_space_mem = address_space_mem;
> >>     bus->address_space_io = address_space_io;
> >> +    bus->isol_as_mem = NULL;
> >> +    bus->isol_as_io = NULL;
> >>     bus->flags |= PCI_BUS_IS_ROOT;
> >> 
> >>     /* host bridge */
> >> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
> >>     return pci_get_bus(dev)->address_space_io;
> >> }
> >> 
> >> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
> >> +{
> >> +    return pci_get_bus(dev)->isol_as_mem;
> >> +}
> >> +
> >> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
> >> +{
> >> +    return pci_get_bus(dev)->isol_as_io;
> >> +}
> >> +
> >> static void pci_device_class_init(ObjectClass *klass, void *data)
> >> {
> >>     DeviceClass *k = DEVICE_CLASS(klass);
> >> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
> >> 
> >> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> >> {
> >> +    AddressSpace *iommu_as = NULL;
> >>     PCIBus *bus = pci_get_bus(dev);
> >>     PCIBus *iommu_bus = bus;
> >>     uint8_t devfn = dev->devfn;
> >> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> >>     if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
> >>         return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
> >>     }
> >> +    iommu_as = pci_isol_as_mem(dev);
> >> +    if (iommu_as) {
> >> +        return iommu_as;
> >> +    }
> >>     return &address_space_memory;
> >> }
> >> 
> >> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> >> index da34c8ebcd..98366768d2 100644
> >> --- a/hw/pci/pci_bridge.c
> >> +++ b/hw/pci/pci_bridge.c
> >> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
> >>     sec_bus->address_space_io = &br->address_space_io;
> >>     memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
> >>                        4 * GiB);
> >> +
> >> +    /* This PCI bridge puts the sec_bus in its parent's address space */
> >> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
> >> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
> >> +
> >>     br->windows = pci_bridge_region_init(br);
> >>     QLIST_INIT(&sec_bus->child);
> >>     QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
> >> -- 
> >> 2.20.1
>
Jag Raman Jan. 26, 2022, 5:27 a.m. UTC | #7
> On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> 
> * Jag Raman (jag.raman@oracle.com) wrote:
>> 
>> 
>>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> 
>>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
>>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
>>>> niche usage.
>>>> 
>>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
>>>> the same machine/server. This would cause address space collision as
>>>> well as be a security vulnerability. Having separate address spaces for
>>>> each PCI bus would solve this problem.
>>> 
>>> Fascinating, but I am not sure I understand. any examples?
>> 
>> Hi Michael!
>> 
>> multiprocess QEMU and vfio-user implement a client-server model to allow
>> out-of-process emulation of devices. The client QEMU, which makes ioctls
>> to the kernel and runs VCPUs, could attach devices running in a server
>> QEMU. The server QEMU needs access to parts of the client’s RAM to
>> perform DMA.
> 
> Do you ever have the opposite problem? i.e. when an emulated PCI device

That’s an interesting question.

> exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> that the client can see.  What happens if two emulated devices need to
> access each others emulated address space?

In this case, the kernel driver would map the destination’s chunk of internal RAM into
the DMA space of the source device. Then the source device could write to that
mapped address range, and the IOMMU should direct those writes to the
destination device.

I would like to take a closer look at the IOMMU implementation on how to achieve
this, and get back to you. I think the IOMMU would handle this. Could you please
point me to the IOMMU implementation you have in mind?

Thank you!
--
Jag

> 
> Dave
> 
>> In the case where multiple clients attach devices that are running on the
>> same server, we need to ensure that each devices has isolated memory
>> ranges. This ensures that the memory space of one device is not visible
>> to other devices in the same server.
>> 
>>> 
>>> I also wonder whether this special type could be modelled like a special
>>> kind of iommu internally.
>> 
>> Could you please provide some more details on the design?
>> 
>>> 
>>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>>> ---
>>>> include/hw/pci/pci.h     |  2 ++
>>>> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>>>> hw/pci/pci.c             | 17 +++++++++++++++++
>>>> hw/pci/pci_bridge.c      |  5 +++++
>>>> 4 files changed, 41 insertions(+)
>>>> 
>>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>>> index 023abc0f79..9bb4472abc 100644
>>>> --- a/include/hw/pci/pci.h
>>>> +++ b/include/hw/pci/pci.h
>>>> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>>>> int pci_device_load(PCIDevice *s, QEMUFile *f);
>>>> MemoryRegion *pci_address_space(PCIDevice *dev);
>>>> MemoryRegion *pci_address_space_io(PCIDevice *dev);
>>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
>>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>>>> 
>>>> /*
>>>> * Should not normally be used by devices. For use by sPAPR target
>>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>>>> index 347440d42c..d78258e79e 100644
>>>> --- a/include/hw/pci/pci_bus.h
>>>> +++ b/include/hw/pci/pci_bus.h
>>>> @@ -39,9 +39,26 @@ struct PCIBus {
>>>>    void *irq_opaque;
>>>>    PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>>>>    PCIDevice *parent_dev;
>>>> +
>>>>    MemoryRegion *address_space_mem;
>>>>    MemoryRegion *address_space_io;
>>>> 
>>>> +    /**
>>>> +     * Isolated address spaces - these allow the PCI bus to be part
>>>> +     * of an isolated address space as opposed to the global
>>>> +     * address_space_memory & address_space_io.
>>> 
>>> Are you sure address_space_memory & address_space_io are
>>> always global? even in the case of an iommu?
>> 
>> On the CPU side of the Root Complex, I believe address_space_memory
>> & address_space_io are global.
>> 
>> In the vfio-user case, devices on the same machine (TYPE_REMOTE_MACHINE)
>> could be attached to different clients VMs. Each client would have their own address
>> space for their CPUs. With isolated address spaces, we ensure that the devices
>> see the address space of the CPUs they’re attached to.
>> 
>> Not sure if it’s OK to share weblinks in this mailing list, please let me know if that’s
>> not preferred. But I’m referring to the terminology used in the following block diagram:
>> https://en.wikipedia.org/wiki/Root_complex#/media/File:Example_PCI_Express_Topology.svg
>> 
>>> 
>>>> This allows the
>>>> +     * bus to be attached to CPUs from different machines. The
>>>> +     * following is not used used commonly.
>>>> +     *
>>>> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
>>>> +     * VM clients,
>>> 
>>> what are VM clients?
>> 
>> It’s the client in the client - server model explained above.
>> 
>> Thank you!
>> --
>> Jag
>> 
>>> 
>>>> as such it needs the PCI buses in the same machine
>>>> +     * to be part of different CPU address spaces. The following is
>>>> +     * useful in that scenario.
>>>> +     *
>>>> +     */
>>>> +    AddressSpace *isol_as_mem;
>>>> +    AddressSpace *isol_as_io;
>>>> +
>>>>    QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>>>>    QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>>>> 
>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>> index 5d30f9ca60..d5f1c6c421 100644
>>>> --- a/hw/pci/pci.c
>>>> +++ b/hw/pci/pci.c
>>>> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>>>>    bus->slot_reserved_mask = 0x0;
>>>>    bus->address_space_mem = address_space_mem;
>>>>    bus->address_space_io = address_space_io;
>>>> +    bus->isol_as_mem = NULL;
>>>> +    bus->isol_as_io = NULL;
>>>>    bus->flags |= PCI_BUS_IS_ROOT;
>>>> 
>>>>    /* host bridge */
>>>> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>>>>    return pci_get_bus(dev)->address_space_io;
>>>> }
>>>> 
>>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
>>>> +{
>>>> +    return pci_get_bus(dev)->isol_as_mem;
>>>> +}
>>>> +
>>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
>>>> +{
>>>> +    return pci_get_bus(dev)->isol_as_io;
>>>> +}
>>>> +
>>>> static void pci_device_class_init(ObjectClass *klass, void *data)
>>>> {
>>>>    DeviceClass *k = DEVICE_CLASS(klass);
>>>> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>>>> 
>>>> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>>> {
>>>> +    AddressSpace *iommu_as = NULL;
>>>>    PCIBus *bus = pci_get_bus(dev);
>>>>    PCIBus *iommu_bus = bus;
>>>>    uint8_t devfn = dev->devfn;
>>>> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>>>    if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>>>>        return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>>>>    }
>>>> +    iommu_as = pci_isol_as_mem(dev);
>>>> +    if (iommu_as) {
>>>> +        return iommu_as;
>>>> +    }
>>>>    return &address_space_memory;
>>>> }
>>>> 
>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>> index da34c8ebcd..98366768d2 100644
>>>> --- a/hw/pci/pci_bridge.c
>>>> +++ b/hw/pci/pci_bridge.c
>>>> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>>>>    sec_bus->address_space_io = &br->address_space_io;
>>>>    memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>>>>                       4 * GiB);
>>>> +
>>>> +    /* This PCI bridge puts the sec_bus in its parent's address space */
>>>> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
>>>> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
>>>> +
>>>>    br->windows = pci_bridge_region_init(br);
>>>>    QLIST_INIT(&sec_bus->child);
>>>>    QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
>>>> -- 
>>>> 2.20.1
>> 
> -- 
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Stefan Hajnoczi Jan. 26, 2022, 9:45 a.m. UTC | #8
On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:
> 
> 
> > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > 
> > * Jag Raman (jag.raman@oracle.com) wrote:
> >> 
> >> 
> >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >>> 
> >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> >>>> niche usage.
> >>>> 
> >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> >>>> the same machine/server. This would cause address space collision as
> >>>> well as be a security vulnerability. Having separate address spaces for
> >>>> each PCI bus would solve this problem.
> >>> 
> >>> Fascinating, but I am not sure I understand. any examples?
> >> 
> >> Hi Michael!
> >> 
> >> multiprocess QEMU and vfio-user implement a client-server model to allow
> >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> >> to the kernel and runs VCPUs, could attach devices running in a server
> >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> >> perform DMA.
> > 
> > Do you ever have the opposite problem? i.e. when an emulated PCI device
> 
> That’s an interesting question.
> 
> > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > that the client can see.  What happens if two emulated devices need to
> > access each others emulated address space?
> 
> In this case, the kernel driver would map the destination’s chunk of internal RAM into
> the DMA space of the source device. Then the source device could write to that
> mapped address range, and the IOMMU should direct those writes to the
> destination device.
> 
> I would like to take a closer look at the IOMMU implementation on how to achieve
> this, and get back to you. I think the IOMMU would handle this. Could you please
> point me to the IOMMU implementation you have in mind?

I don't know if the current vfio-user client/server patches already
implement device-to-device DMA, but the functionality is supported by
the vfio-user protocol.

Basically: if the DMA regions lookup inside the vfio-user server fails,
fall back to VFIO_USER_DMA_READ/WRITE messages instead.
https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read

Here is the flow:
1. The vfio-user server with device A sends a DMA read to QEMU.
2. QEMU finds the MemoryRegion associated with the DMA address and sees
   it's a device.
   a. If it's emulated inside the QEMU process then the normal
      device emulation code kicks in.
   b. If it's another vfio-user PCI device then the vfio-user PCI proxy
      device forwards the DMA to the second vfio-user server's device B.

Stefan
Dr. David Alan Gilbert Jan. 26, 2022, 6:13 p.m. UTC | #9
* Jag Raman (jag.raman@oracle.com) wrote:
> 
> 
> > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > 
> > * Jag Raman (jag.raman@oracle.com) wrote:
> >> 
> >> 
> >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >>> 
> >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> >>>> niche usage.
> >>>> 
> >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> >>>> the same machine/server. This would cause address space collision as
> >>>> well as be a security vulnerability. Having separate address spaces for
> >>>> each PCI bus would solve this problem.
> >>> 
> >>> Fascinating, but I am not sure I understand. any examples?
> >> 
> >> Hi Michael!
> >> 
> >> multiprocess QEMU and vfio-user implement a client-server model to allow
> >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> >> to the kernel and runs VCPUs, could attach devices running in a server
> >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> >> perform DMA.
> > 
> > Do you ever have the opposite problem? i.e. when an emulated PCI device
> 
> That’s an interesting question.
> 
> > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > that the client can see.  What happens if two emulated devices need to
> > access each others emulated address space?
> 
> In this case, the kernel driver would map the destination’s chunk of internal RAM into
> the DMA space of the source device. Then the source device could write to that
> mapped address range, and the IOMMU should direct those writes to the
> destination device.

Are all devices mappable like that?

> I would like to take a closer look at the IOMMU implementation on how to achieve
> this, and get back to you. I think the IOMMU would handle this. Could you please
> point me to the IOMMU implementation you have in mind?

I didn't have one in mind; I was just hitting a similar problem on
Virtiofs DAX.

Dave

> Thank you!
> --
> Jag
> 
> > 
> > Dave
> > 
> >> In the case where multiple clients attach devices that are running on the
> >> same server, we need to ensure that each devices has isolated memory
> >> ranges. This ensures that the memory space of one device is not visible
> >> to other devices in the same server.
> >> 
> >>> 
> >>> I also wonder whether this special type could be modelled like a special
> >>> kind of iommu internally.
> >> 
> >> Could you please provide some more details on the design?
> >> 
> >>> 
> >>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >>>> ---
> >>>> include/hw/pci/pci.h     |  2 ++
> >>>> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
> >>>> hw/pci/pci.c             | 17 +++++++++++++++++
> >>>> hw/pci/pci_bridge.c      |  5 +++++
> >>>> 4 files changed, 41 insertions(+)
> >>>> 
> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >>>> index 023abc0f79..9bb4472abc 100644
> >>>> --- a/include/hw/pci/pci.h
> >>>> +++ b/include/hw/pci/pci.h
> >>>> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
> >>>> int pci_device_load(PCIDevice *s, QEMUFile *f);
> >>>> MemoryRegion *pci_address_space(PCIDevice *dev);
> >>>> MemoryRegion *pci_address_space_io(PCIDevice *dev);
> >>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
> >>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
> >>>> 
> >>>> /*
> >>>> * Should not normally be used by devices. For use by sPAPR target
> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> >>>> index 347440d42c..d78258e79e 100644
> >>>> --- a/include/hw/pci/pci_bus.h
> >>>> +++ b/include/hw/pci/pci_bus.h
> >>>> @@ -39,9 +39,26 @@ struct PCIBus {
> >>>>    void *irq_opaque;
> >>>>    PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
> >>>>    PCIDevice *parent_dev;
> >>>> +
> >>>>    MemoryRegion *address_space_mem;
> >>>>    MemoryRegion *address_space_io;
> >>>> 
> >>>> +    /**
> >>>> +     * Isolated address spaces - these allow the PCI bus to be part
> >>>> +     * of an isolated address space as opposed to the global
> >>>> +     * address_space_memory & address_space_io.
> >>> 
> >>> Are you sure address_space_memory & address_space_io are
> >>> always global? even in the case of an iommu?
> >> 
> >> On the CPU side of the Root Complex, I believe address_space_memory
> >> & address_space_io are global.
> >> 
> >> In the vfio-user case, devices on the same machine (TYPE_REMOTE_MACHINE)
> >> could be attached to different clients VMs. Each client would have their own address
> >> space for their CPUs. With isolated address spaces, we ensure that the devices
> >> see the address space of the CPUs they’re attached to.
> >> 
> >> Not sure if it’s OK to share weblinks in this mailing list, please let me know if that’s
> >> not preferred. But I’m referring to the terminology used in the following block diagram:
> >> https://en.wikipedia.org/wiki/Root_complex#/media/File:Example_PCI_Express_Topology.svg
> >> 
> >>> 
> >>>> This allows the
> >>>> +     * bus to be attached to CPUs from different machines. The
> >>>> +     * following is not used used commonly.
> >>>> +     *
> >>>> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
> >>>> +     * VM clients,
> >>> 
> >>> what are VM clients?
> >> 
> >> It’s the client in the client - server model explained above.
> >> 
> >> Thank you!
> >> --
> >> Jag
> >> 
> >>> 
> >>>> as such it needs the PCI buses in the same machine
> >>>> +     * to be part of different CPU address spaces. The following is
> >>>> +     * useful in that scenario.
> >>>> +     *
> >>>> +     */
> >>>> +    AddressSpace *isol_as_mem;
> >>>> +    AddressSpace *isol_as_io;
> >>>> +
> >>>>    QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
> >>>>    QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
> >>>> 
> >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >>>> index 5d30f9ca60..d5f1c6c421 100644
> >>>> --- a/hw/pci/pci.c
> >>>> +++ b/hw/pci/pci.c
> >>>> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
> >>>>    bus->slot_reserved_mask = 0x0;
> >>>>    bus->address_space_mem = address_space_mem;
> >>>>    bus->address_space_io = address_space_io;
> >>>> +    bus->isol_as_mem = NULL;
> >>>> +    bus->isol_as_io = NULL;
> >>>>    bus->flags |= PCI_BUS_IS_ROOT;
> >>>> 
> >>>>    /* host bridge */
> >>>> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
> >>>>    return pci_get_bus(dev)->address_space_io;
> >>>> }
> >>>> 
> >>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
> >>>> +{
> >>>> +    return pci_get_bus(dev)->isol_as_mem;
> >>>> +}
> >>>> +
> >>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
> >>>> +{
> >>>> +    return pci_get_bus(dev)->isol_as_io;
> >>>> +}
> >>>> +
> >>>> static void pci_device_class_init(ObjectClass *klass, void *data)
> >>>> {
> >>>>    DeviceClass *k = DEVICE_CLASS(klass);
> >>>> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
> >>>> 
> >>>> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> >>>> {
> >>>> +    AddressSpace *iommu_as = NULL;
> >>>>    PCIBus *bus = pci_get_bus(dev);
> >>>>    PCIBus *iommu_bus = bus;
> >>>>    uint8_t devfn = dev->devfn;
> >>>> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> >>>>    if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
> >>>>        return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
> >>>>    }
> >>>> +    iommu_as = pci_isol_as_mem(dev);
> >>>> +    if (iommu_as) {
> >>>> +        return iommu_as;
> >>>> +    }
> >>>>    return &address_space_memory;
> >>>> }
> >>>> 
> >>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> >>>> index da34c8ebcd..98366768d2 100644
> >>>> --- a/hw/pci/pci_bridge.c
> >>>> +++ b/hw/pci/pci_bridge.c
> >>>> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
> >>>>    sec_bus->address_space_io = &br->address_space_io;
> >>>>    memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
> >>>>                       4 * GiB);
> >>>> +
> >>>> +    /* This PCI bridge puts the sec_bus in its parent's address space */
> >>>> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
> >>>> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
> >>>> +
> >>>>    br->windows = pci_bridge_region_init(br);
> >>>>    QLIST_INIT(&sec_bus->child);
> >>>>    QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
> >>>> -- 
> >>>> 2.20.1
> >> 
> > -- 
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
>
Dr. David Alan Gilbert Jan. 26, 2022, 8:07 p.m. UTC | #10
* Stefan Hajnoczi (stefanha@redhat.com) wrote:
> On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:
> > 
> > 
> > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > 
> > > * Jag Raman (jag.raman@oracle.com) wrote:
> > >> 
> > >> 
> > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > >>> 
> > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > >>>> niche usage.
> > >>>> 
> > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > >>>> the same machine/server. This would cause address space collision as
> > >>>> well as be a security vulnerability. Having separate address spaces for
> > >>>> each PCI bus would solve this problem.
> > >>> 
> > >>> Fascinating, but I am not sure I understand. any examples?
> > >> 
> > >> Hi Michael!
> > >> 
> > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > >> to the kernel and runs VCPUs, could attach devices running in a server
> > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > >> perform DMA.
> > > 
> > > Do you ever have the opposite problem? i.e. when an emulated PCI device
> > 
> > That’s an interesting question.
> > 
> > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > that the client can see.  What happens if two emulated devices need to
> > > access each others emulated address space?
> > 
> > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > the DMA space of the source device. Then the source device could write to that
> > mapped address range, and the IOMMU should direct those writes to the
> > destination device.
> > 
> > I would like to take a closer look at the IOMMU implementation on how to achieve
> > this, and get back to you. I think the IOMMU would handle this. Could you please
> > point me to the IOMMU implementation you have in mind?
> 
> I don't know if the current vfio-user client/server patches already
> implement device-to-device DMA, but the functionality is supported by
> the vfio-user protocol.
> 
> Basically: if the DMA regions lookup inside the vfio-user server fails,
> fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> 
> Here is the flow:
> 1. The vfio-user server with device A sends a DMA read to QEMU.
> 2. QEMU finds the MemoryRegion associated with the DMA address and sees
>    it's a device.
>    a. If it's emulated inside the QEMU process then the normal
>       device emulation code kicks in.
>    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
>       device forwards the DMA to the second vfio-user server's device B.

I'm starting to be curious if there's a way to persuade the guest kernel
to do it for us; in general is there a way to say to PCI devices that
they can only DMA to the host and not other PCI devices?  Or that the
address space of a given PCIe bus is non-overlapping with one of the
others?

Dave

> Stefan
Michael S. Tsirkin Jan. 26, 2022, 9:13 p.m. UTC | #11
On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:
> * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:
> > > 
> > > 
> > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > 
> > > > * Jag Raman (jag.raman@oracle.com) wrote:
> > > >> 
> > > >> 
> > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >>> 
> > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > >>>> niche usage.
> > > >>>> 
> > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > >>>> the same machine/server. This would cause address space collision as
> > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > >>>> each PCI bus would solve this problem.
> > > >>> 
> > > >>> Fascinating, but I am not sure I understand. any examples?
> > > >> 
> > > >> Hi Michael!
> > > >> 
> > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > >> perform DMA.
> > > > 
> > > > Do you ever have the opposite problem? i.e. when an emulated PCI device
> > > 
> > > That’s an interesting question.
> > > 
> > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > that the client can see.  What happens if two emulated devices need to
> > > > access each others emulated address space?
> > > 
> > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > the DMA space of the source device. Then the source device could write to that
> > > mapped address range, and the IOMMU should direct those writes to the
> > > destination device.
> > > 
> > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > point me to the IOMMU implementation you have in mind?
> > 
> > I don't know if the current vfio-user client/server patches already
> > implement device-to-device DMA, but the functionality is supported by
> > the vfio-user protocol.
> > 
> > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > 
> > Here is the flow:
> > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> >    it's a device.
> >    a. If it's emulated inside the QEMU process then the normal
> >       device emulation code kicks in.
> >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> >       device forwards the DMA to the second vfio-user server's device B.
> 
> I'm starting to be curious if there's a way to persuade the guest kernel
> to do it for us; in general is there a way to say to PCI devices that
> they can only DMA to the host and not other PCI devices?


But of course - this is how e.g. VFIO protects host PCI devices from
each other when one of them is passed through to a VM.

>  Or that the
> address space of a given PCIe bus is non-overlapping with one of the
> others?



> Dave
> > Stefan
> 
> 
> -- 
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Stefan Hajnoczi Jan. 27, 2022, 8:30 a.m. UTC | #12
On Wed, Jan 26, 2022 at 04:13:33PM -0500, Michael S. Tsirkin wrote:
> On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:
> > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:
> > > > 
> > > > 
> > > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > > 
> > > > > * Jag Raman (jag.raman@oracle.com) wrote:
> > > > >> 
> > > > >> 
> > > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >>> 
> > > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> > > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > > >>>> niche usage.
> > > > >>>> 
> > > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > > >>>> the same machine/server. This would cause address space collision as
> > > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > > >>>> each PCI bus would solve this problem.
> > > > >>> 
> > > > >>> Fascinating, but I am not sure I understand. any examples?
> > > > >> 
> > > > >> Hi Michael!
> > > > >> 
> > > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > > >> perform DMA.
> > > > > 
> > > > > Do you ever have the opposite problem? i.e. when an emulated PCI device
> > > > 
> > > > That’s an interesting question.
> > > > 
> > > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > > that the client can see.  What happens if two emulated devices need to
> > > > > access each others emulated address space?
> > > > 
> > > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > > the DMA space of the source device. Then the source device could write to that
> > > > mapped address range, and the IOMMU should direct those writes to the
> > > > destination device.
> > > > 
> > > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > > point me to the IOMMU implementation you have in mind?
> > > 
> > > I don't know if the current vfio-user client/server patches already
> > > implement device-to-device DMA, but the functionality is supported by
> > > the vfio-user protocol.
> > > 
> > > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > > 
> > > Here is the flow:
> > > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> > >    it's a device.
> > >    a. If it's emulated inside the QEMU process then the normal
> > >       device emulation code kicks in.
> > >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> > >       device forwards the DMA to the second vfio-user server's device B.
> > 
> > I'm starting to be curious if there's a way to persuade the guest kernel
> > to do it for us; in general is there a way to say to PCI devices that
> > they can only DMA to the host and not other PCI devices?
> 
> 
> But of course - this is how e.g. VFIO protects host PCI devices from
> each other when one of them is passed through to a VM.

Michael: Are you saying just turn on vIOMMU? :)

Devices in different VFIO groups have their own IOMMU context, so their
IOVA space is isolated. Just don't map other devices into the IOVA space
and those other devices will be inaccessible.

Stefan
Michael S. Tsirkin Jan. 27, 2022, 12:50 p.m. UTC | #13
On Thu, Jan 27, 2022 at 08:30:13AM +0000, Stefan Hajnoczi wrote:
> On Wed, Jan 26, 2022 at 04:13:33PM -0500, Michael S. Tsirkin wrote:
> > On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:
> > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:
> > > > > 
> > > > > 
> > > > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > > > 
> > > > > > * Jag Raman (jag.raman@oracle.com) wrote:
> > > > > >> 
> > > > > >> 
> > > > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >>> 
> > > > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
> > > > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > > > >>>> niche usage.
> > > > > >>>> 
> > > > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > > > >>>> the same machine/server. This would cause address space collision as
> > > > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > > > >>>> each PCI bus would solve this problem.
> > > > > >>> 
> > > > > >>> Fascinating, but I am not sure I understand. any examples?
> > > > > >> 
> > > > > >> Hi Michael!
> > > > > >> 
> > > > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > > > >> perform DMA.
> > > > > > 
> > > > > > Do you ever have the opposite problem? i.e. when an emulated PCI device
> > > > > 
> > > > > That’s an interesting question.
> > > > > 
> > > > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > > > that the client can see.  What happens if two emulated devices need to
> > > > > > access each others emulated address space?
> > > > > 
> > > > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > > > the DMA space of the source device. Then the source device could write to that
> > > > > mapped address range, and the IOMMU should direct those writes to the
> > > > > destination device.
> > > > > 
> > > > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > > > point me to the IOMMU implementation you have in mind?
> > > > 
> > > > I don't know if the current vfio-user client/server patches already
> > > > implement device-to-device DMA, but the functionality is supported by
> > > > the vfio-user protocol.
> > > > 
> > > > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > > > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > > > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > > > 
> > > > Here is the flow:
> > > > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > > > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> > > >    it's a device.
> > > >    a. If it's emulated inside the QEMU process then the normal
> > > >       device emulation code kicks in.
> > > >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> > > >       device forwards the DMA to the second vfio-user server's device B.
> > > 
> > > I'm starting to be curious if there's a way to persuade the guest kernel
> > > to do it for us; in general is there a way to say to PCI devices that
> > > they can only DMA to the host and not other PCI devices?
> > 
> > 
> > But of course - this is how e.g. VFIO protects host PCI devices from
> > each other when one of them is passed through to a VM.
> 
> Michael: Are you saying just turn on vIOMMU? :)
> 
> Devices in different VFIO groups have their own IOMMU context, so their
> IOVA space is isolated. Just don't map other devices into the IOVA space
> and those other devices will be inaccessible.
> 
> Stefan

I was wondering about it here:
https://lore.kernel.org/r/20220119190742-mutt-send-email-mst%40kernel.org
Jag Raman Jan. 27, 2022, 5:43 p.m. UTC | #14
> On Jan 26, 2022, at 1:13 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> 
> * Jag Raman (jag.raman@oracle.com) wrote:
>> 
>> 
>>> On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
>>> 
>>> * Jag Raman (jag.raman@oracle.com) wrote:
>>>> 
>>>> 
>>>>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> 
>>>>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:
>>>>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
>>>>>> niche usage.
>>>>>> 
>>>>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
>>>>>> the same machine/server. This would cause address space collision as
>>>>>> well as be a security vulnerability. Having separate address spaces for
>>>>>> each PCI bus would solve this problem.
>>>>> 
>>>>> Fascinating, but I am not sure I understand. any examples?
>>>> 
>>>> Hi Michael!
>>>> 
>>>> multiprocess QEMU and vfio-user implement a client-server model to allow
>>>> out-of-process emulation of devices. The client QEMU, which makes ioctls
>>>> to the kernel and runs VCPUs, could attach devices running in a server
>>>> QEMU. The server QEMU needs access to parts of the client’s RAM to
>>>> perform DMA.
>>> 
>>> Do you ever have the opposite problem? i.e. when an emulated PCI device
>> 
>> That’s an interesting question.
>> 
>>> exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
>>> that the client can see.  What happens if two emulated devices need to
>>> access each others emulated address space?
>> 
>> In this case, the kernel driver would map the destination’s chunk of internal RAM into
>> the DMA space of the source device. Then the source device could write to that
>> mapped address range, and the IOMMU should direct those writes to the
>> destination device.
> 
> Are all devices mappable like that?

If there is an IOMMU that supports DMA-Remapping (DMAR), that would be
possible - the kernel could configure the DMAR to facilitate such mapping.

If there is no DMAR, the kernel/cpu could buffer the write between devices.

--
Jag

> 
>> I would like to take a closer look at the IOMMU implementation on how to achieve
>> this, and get back to you. I think the IOMMU would handle this. Could you please
>> point me to the IOMMU implementation you have in mind?
> 
> I didn't have one in mind; I was just hitting a similar problem on
> Virtiofs DAX.
> 
> Dave
> 
>> Thank you!
>> --
>> Jag
>> 
>>> 
>>> Dave
>>> 
>>>> In the case where multiple clients attach devices that are running on the
>>>> same server, we need to ensure that each devices has isolated memory
>>>> ranges. This ensures that the memory space of one device is not visible
>>>> to other devices in the same server.
>>>> 
>>>>> 
>>>>> I also wonder whether this special type could be modelled like a special
>>>>> kind of iommu internally.
>>>> 
>>>> Could you please provide some more details on the design?
>>>> 
>>>>> 
>>>>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>>>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>>>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>>>>> ---
>>>>>> include/hw/pci/pci.h     |  2 ++
>>>>>> include/hw/pci/pci_bus.h | 17 +++++++++++++++++
>>>>>> hw/pci/pci.c             | 17 +++++++++++++++++
>>>>>> hw/pci/pci_bridge.c      |  5 +++++
>>>>>> 4 files changed, 41 insertions(+)
>>>>>> 
>>>>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>>>>> index 023abc0f79..9bb4472abc 100644
>>>>>> --- a/include/hw/pci/pci.h
>>>>>> +++ b/include/hw/pci/pci.h
>>>>>> @@ -387,6 +387,8 @@ void pci_device_save(PCIDevice *s, QEMUFile *f);
>>>>>> int pci_device_load(PCIDevice *s, QEMUFile *f);
>>>>>> MemoryRegion *pci_address_space(PCIDevice *dev);
>>>>>> MemoryRegion *pci_address_space_io(PCIDevice *dev);
>>>>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev);
>>>>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev);
>>>>>> 
>>>>>> /*
>>>>>> * Should not normally be used by devices. For use by sPAPR target
>>>>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>>>>>> index 347440d42c..d78258e79e 100644
>>>>>> --- a/include/hw/pci/pci_bus.h
>>>>>> +++ b/include/hw/pci/pci_bus.h
>>>>>> @@ -39,9 +39,26 @@ struct PCIBus {
>>>>>>   void *irq_opaque;
>>>>>>   PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
>>>>>>   PCIDevice *parent_dev;
>>>>>> +
>>>>>>   MemoryRegion *address_space_mem;
>>>>>>   MemoryRegion *address_space_io;
>>>>>> 
>>>>>> +    /**
>>>>>> +     * Isolated address spaces - these allow the PCI bus to be part
>>>>>> +     * of an isolated address space as opposed to the global
>>>>>> +     * address_space_memory & address_space_io.
>>>>> 
>>>>> Are you sure address_space_memory & address_space_io are
>>>>> always global? even in the case of an iommu?
>>>> 
>>>> On the CPU side of the Root Complex, I believe address_space_memory
>>>> & address_space_io are global.
>>>> 
>>>> In the vfio-user case, devices on the same machine (TYPE_REMOTE_MACHINE)
>>>> could be attached to different clients VMs. Each client would have their own address
>>>> space for their CPUs. With isolated address spaces, we ensure that the devices
>>>> see the address space of the CPUs they’re attached to.
>>>> 
>>>> Not sure if it’s OK to share weblinks in this mailing list, please let me know if that’s
>>>> not preferred. But I’m referring to the terminology used in the following block diagram:
>>>> https://en.wikipedia.org/wiki/Root_complex#/media/File:Example_PCI_Express_Topology.svg
>>>> 
>>>>> 
>>>>>> This allows the
>>>>>> +     * bus to be attached to CPUs from different machines. The
>>>>>> +     * following is not used used commonly.
>>>>>> +     *
>>>>>> +     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
>>>>>> +     * VM clients,
>>>>> 
>>>>> what are VM clients?
>>>> 
>>>> It’s the client in the client - server model explained above.
>>>> 
>>>> Thank you!
>>>> --
>>>> Jag
>>>> 
>>>>> 
>>>>>> as such it needs the PCI buses in the same machine
>>>>>> +     * to be part of different CPU address spaces. The following is
>>>>>> +     * useful in that scenario.
>>>>>> +     *
>>>>>> +     */
>>>>>> +    AddressSpace *isol_as_mem;
>>>>>> +    AddressSpace *isol_as_io;
>>>>>> +
>>>>>>   QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
>>>>>>   QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
>>>>>> 
>>>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>>>> index 5d30f9ca60..d5f1c6c421 100644
>>>>>> --- a/hw/pci/pci.c
>>>>>> +++ b/hw/pci/pci.c
>>>>>> @@ -442,6 +442,8 @@ static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
>>>>>>   bus->slot_reserved_mask = 0x0;
>>>>>>   bus->address_space_mem = address_space_mem;
>>>>>>   bus->address_space_io = address_space_io;
>>>>>> +    bus->isol_as_mem = NULL;
>>>>>> +    bus->isol_as_io = NULL;
>>>>>>   bus->flags |= PCI_BUS_IS_ROOT;
>>>>>> 
>>>>>>   /* host bridge */
>>>>>> @@ -2676,6 +2678,16 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
>>>>>>   return pci_get_bus(dev)->address_space_io;
>>>>>> }
>>>>>> 
>>>>>> +AddressSpace *pci_isol_as_mem(PCIDevice *dev)
>>>>>> +{
>>>>>> +    return pci_get_bus(dev)->isol_as_mem;
>>>>>> +}
>>>>>> +
>>>>>> +AddressSpace *pci_isol_as_io(PCIDevice *dev)
>>>>>> +{
>>>>>> +    return pci_get_bus(dev)->isol_as_io;
>>>>>> +}
>>>>>> +
>>>>>> static void pci_device_class_init(ObjectClass *klass, void *data)
>>>>>> {
>>>>>>   DeviceClass *k = DEVICE_CLASS(klass);
>>>>>> @@ -2699,6 +2711,7 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data)
>>>>>> 
>>>>>> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>>>>> {
>>>>>> +    AddressSpace *iommu_as = NULL;
>>>>>>   PCIBus *bus = pci_get_bus(dev);
>>>>>>   PCIBus *iommu_bus = bus;
>>>>>>   uint8_t devfn = dev->devfn;
>>>>>> @@ -2745,6 +2758,10 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>>>>>>   if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>>>>>>       return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>>>>>>   }
>>>>>> +    iommu_as = pci_isol_as_mem(dev);
>>>>>> +    if (iommu_as) {
>>>>>> +        return iommu_as;
>>>>>> +    }
>>>>>>   return &address_space_memory;
>>>>>> }
>>>>>> 
>>>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>>>> index da34c8ebcd..98366768d2 100644
>>>>>> --- a/hw/pci/pci_bridge.c
>>>>>> +++ b/hw/pci/pci_bridge.c
>>>>>> @@ -383,6 +383,11 @@ void pci_bridge_initfn(PCIDevice *dev, const char *typename)
>>>>>>   sec_bus->address_space_io = &br->address_space_io;
>>>>>>   memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
>>>>>>                      4 * GiB);
>>>>>> +
>>>>>> +    /* This PCI bridge puts the sec_bus in its parent's address space */
>>>>>> +    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
>>>>>> +    sec_bus->isol_as_io = pci_isol_as_io(dev);
>>>>>> +
>>>>>>   br->windows = pci_bridge_region_init(br);
>>>>>>   QLIST_INIT(&sec_bus->child);
>>>>>>   QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);
>>>>>> -- 
>>>>>> 2.20.1
>>>> 
>>> -- 
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>> 
>> 
> -- 
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Alex Williamson Jan. 27, 2022, 9:22 p.m. UTC | #15
On Thu, 27 Jan 2022 08:30:13 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Wed, Jan 26, 2022 at 04:13:33PM -0500, Michael S. Tsirkin wrote:
> > On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:  
> > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:  
> > > > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:  
> > > > > 
> > > > >   
> > > > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > > > 
> > > > > > * Jag Raman (jag.raman@oracle.com) wrote:  
> > > > > >> 
> > > > > >>   
> > > > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >>> 
> > > > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:  
> > > > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > > > >>>> niche usage.
> > > > > >>>> 
> > > > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > > > >>>> the same machine/server. This would cause address space collision as
> > > > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > > > >>>> each PCI bus would solve this problem.  
> > > > > >>> 
> > > > > >>> Fascinating, but I am not sure I understand. any examples?  
> > > > > >> 
> > > > > >> Hi Michael!
> > > > > >> 
> > > > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > > > >> perform DMA.  
> > > > > > 
> > > > > > Do you ever have the opposite problem? i.e. when an emulated PCI device  
> > > > > 
> > > > > That’s an interesting question.
> > > > >   
> > > > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > > > that the client can see.  What happens if two emulated devices need to
> > > > > > access each others emulated address space?  
> > > > > 
> > > > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > > > the DMA space of the source device. Then the source device could write to that
> > > > > mapped address range, and the IOMMU should direct those writes to the
> > > > > destination device.
> > > > > 
> > > > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > > > point me to the IOMMU implementation you have in mind?  
> > > > 
> > > > I don't know if the current vfio-user client/server patches already
> > > > implement device-to-device DMA, but the functionality is supported by
> > > > the vfio-user protocol.
> > > > 
> > > > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > > > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > > > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > > > 
> > > > Here is the flow:
> > > > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > > > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> > > >    it's a device.
> > > >    a. If it's emulated inside the QEMU process then the normal
> > > >       device emulation code kicks in.
> > > >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> > > >       device forwards the DMA to the second vfio-user server's device B.  
> > > 
> > > I'm starting to be curious if there's a way to persuade the guest kernel
> > > to do it for us; in general is there a way to say to PCI devices that
> > > they can only DMA to the host and not other PCI devices?  
> > 
> > 
> > But of course - this is how e.g. VFIO protects host PCI devices from
> > each other when one of them is passed through to a VM.  
> 
> Michael: Are you saying just turn on vIOMMU? :)
> 
> Devices in different VFIO groups have their own IOMMU context, so their
> IOVA space is isolated. Just don't map other devices into the IOVA space
> and those other devices will be inaccessible.

Devices in different VFIO *containers* have their own IOMMU context.
Based on the group attachment to a container, groups can either have
shared or isolated IOVA space.  That determination is made by looking
at the address space of the bus, which is governed by the presence of a
vIOMMU.

If the goal here is to restrict DMA between devices, ie. peer-to-peer
(p2p), why are we trying to re-invent what an IOMMU already does?  In
fact, it seems like an IOMMU does this better in providing an IOVA
address space per BDF.  Is the dynamic mapping overhead too much?  What
physical hardware properties or specifications could we leverage to
restrict p2p mappings to a device?  Should it be governed by machine
type to provide consistency between devices?  Should each "isolated"
bus be in a separate root complex?  Thanks,

Alex
Stefan Hajnoczi Jan. 28, 2022, 8:19 a.m. UTC | #16
On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:
> On Thu, 27 Jan 2022 08:30:13 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> > On Wed, Jan 26, 2022 at 04:13:33PM -0500, Michael S. Tsirkin wrote:
> > > On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:  
> > > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:  
> > > > > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:  
> > > > > > 
> > > > > >   
> > > > > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > > > > 
> > > > > > > * Jag Raman (jag.raman@oracle.com) wrote:  
> > > > > > >> 
> > > > > > >>   
> > > > > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >>> 
> > > > > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:  
> > > > > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > > > > >>>> niche usage.
> > > > > > >>>> 
> > > > > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > > > > >>>> the same machine/server. This would cause address space collision as
> > > > > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > > > > >>>> each PCI bus would solve this problem.  
> > > > > > >>> 
> > > > > > >>> Fascinating, but I am not sure I understand. any examples?  
> > > > > > >> 
> > > > > > >> Hi Michael!
> > > > > > >> 
> > > > > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > > > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > > > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > > > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > > > > >> perform DMA.  
> > > > > > > 
> > > > > > > Do you ever have the opposite problem? i.e. when an emulated PCI device  
> > > > > > 
> > > > > > That’s an interesting question.
> > > > > >   
> > > > > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > > > > that the client can see.  What happens if two emulated devices need to
> > > > > > > access each others emulated address space?  
> > > > > > 
> > > > > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > > > > the DMA space of the source device. Then the source device could write to that
> > > > > > mapped address range, and the IOMMU should direct those writes to the
> > > > > > destination device.
> > > > > > 
> > > > > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > > > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > > > > point me to the IOMMU implementation you have in mind?  
> > > > > 
> > > > > I don't know if the current vfio-user client/server patches already
> > > > > implement device-to-device DMA, but the functionality is supported by
> > > > > the vfio-user protocol.
> > > > > 
> > > > > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > > > > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > > > > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > > > > 
> > > > > Here is the flow:
> > > > > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > > > > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> > > > >    it's a device.
> > > > >    a. If it's emulated inside the QEMU process then the normal
> > > > >       device emulation code kicks in.
> > > > >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> > > > >       device forwards the DMA to the second vfio-user server's device B.  
> > > > 
> > > > I'm starting to be curious if there's a way to persuade the guest kernel
> > > > to do it for us; in general is there a way to say to PCI devices that
> > > > they can only DMA to the host and not other PCI devices?  
> > > 
> > > 
> > > But of course - this is how e.g. VFIO protects host PCI devices from
> > > each other when one of them is passed through to a VM.  
> > 
> > Michael: Are you saying just turn on vIOMMU? :)
> > 
> > Devices in different VFIO groups have their own IOMMU context, so their
> > IOVA space is isolated. Just don't map other devices into the IOVA space
> > and those other devices will be inaccessible.
> 
> Devices in different VFIO *containers* have their own IOMMU context.
> Based on the group attachment to a container, groups can either have
> shared or isolated IOVA space.  That determination is made by looking
> at the address space of the bus, which is governed by the presence of a
> vIOMMU.

Oops, thank you for pointing that out!

Stefan
Stefan Hajnoczi Jan. 28, 2022, 9:18 a.m. UTC | #17
On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:
> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> (p2p), why are we trying to re-invent what an IOMMU already does?

The issue Dave raised is that vfio-user servers run in separate
processses from QEMU with shared memory access to RAM but no direct
access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
example of a non-RAM MemoryRegion that can be the source/target of DMA
requests.

I don't think IOMMUs solve this problem but luckily the vfio-user
protocol already has messages that vfio-user servers can use as a
fallback when DMA cannot be completed through the shared memory RAM
accesses.

> In
> fact, it seems like an IOMMU does this better in providing an IOVA
> address space per BDF.  Is the dynamic mapping overhead too much?  What
> physical hardware properties or specifications could we leverage to
> restrict p2p mappings to a device?  Should it be governed by machine
> type to provide consistency between devices?  Should each "isolated"
> bus be in a separate root complex?  Thanks,

There is a separate issue in this patch series regarding isolating the
address space where BAR accesses are made (i.e. the global
address_space_memory/io). When one process hosts multiple vfio-user
server instances (e.g. a software-defined network switch with multiple
ethernet devices) then each instance needs isolated memory and io address
spaces so that vfio-user clients don't cause collisions when they map
BARs to the same address.

I think the the separate root complex idea is a good solution. This
patch series takes a different approach by adding the concept of
isolated address spaces into hw/pci/.

Stefan
Alex Williamson Jan. 31, 2022, 4:16 p.m. UTC | #18
On Fri, 28 Jan 2022 09:18:08 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:
> > If the goal here is to restrict DMA between devices, ie. peer-to-peer
> > (p2p), why are we trying to re-invent what an IOMMU already does?  
> 
> The issue Dave raised is that vfio-user servers run in separate
> processses from QEMU with shared memory access to RAM but no direct
> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> example of a non-RAM MemoryRegion that can be the source/target of DMA
> requests.
> 
> I don't think IOMMUs solve this problem but luckily the vfio-user
> protocol already has messages that vfio-user servers can use as a
> fallback when DMA cannot be completed through the shared memory RAM
> accesses.
> 
> > In
> > fact, it seems like an IOMMU does this better in providing an IOVA
> > address space per BDF.  Is the dynamic mapping overhead too much?  What
> > physical hardware properties or specifications could we leverage to
> > restrict p2p mappings to a device?  Should it be governed by machine
> > type to provide consistency between devices?  Should each "isolated"
> > bus be in a separate root complex?  Thanks,  
> 
> There is a separate issue in this patch series regarding isolating the
> address space where BAR accesses are made (i.e. the global
> address_space_memory/io). When one process hosts multiple vfio-user
> server instances (e.g. a software-defined network switch with multiple
> ethernet devices) then each instance needs isolated memory and io address
> spaces so that vfio-user clients don't cause collisions when they map
> BARs to the same address.
> 
> I think the the separate root complex idea is a good solution. This
> patch series takes a different approach by adding the concept of
> isolated address spaces into hw/pci/.

This all still seems pretty sketchy, BARs cannot overlap within the
same vCPU address space, perhaps with the exception of when they're
being sized, but DMA should be disabled during sizing.

Devices within the same VM context with identical BARs would need to
operate in different address spaces.  For example a translation offset
in the vCPU address space would allow unique addressing to the devices,
perhaps using the translation offset bits to address a root complex and
masking those bits for downstream transactions.

In general, the device simply operates in an address space, ie. an
IOVA.  When a mapping is made within that address space, we perform a
translation as necessary to generate a guest physical address.  The
IOVA itself is only meaningful within the context of the address space,
there is no requirement or expectation for it to be globally unique.

If the vfio-user server is making some sort of requirement that IOVAs
are unique across all devices, that seems very, very wrong.  Thanks,

Alex
Stefan Hajnoczi Feb. 1, 2022, 9:30 a.m. UTC | #19
On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:
> On Fri, 28 Jan 2022 09:18:08 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> > On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:
> > > If the goal here is to restrict DMA between devices, ie. peer-to-peer
> > > (p2p), why are we trying to re-invent what an IOMMU already does?  
> > 
> > The issue Dave raised is that vfio-user servers run in separate
> > processses from QEMU with shared memory access to RAM but no direct
> > access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> > example of a non-RAM MemoryRegion that can be the source/target of DMA
> > requests.
> > 
> > I don't think IOMMUs solve this problem but luckily the vfio-user
> > protocol already has messages that vfio-user servers can use as a
> > fallback when DMA cannot be completed through the shared memory RAM
> > accesses.
> > 
> > > In
> > > fact, it seems like an IOMMU does this better in providing an IOVA
> > > address space per BDF.  Is the dynamic mapping overhead too much?  What
> > > physical hardware properties or specifications could we leverage to
> > > restrict p2p mappings to a device?  Should it be governed by machine
> > > type to provide consistency between devices?  Should each "isolated"
> > > bus be in a separate root complex?  Thanks,  
> > 
> > There is a separate issue in this patch series regarding isolating the
> > address space where BAR accesses are made (i.e. the global
> > address_space_memory/io). When one process hosts multiple vfio-user
> > server instances (e.g. a software-defined network switch with multiple
> > ethernet devices) then each instance needs isolated memory and io address
> > spaces so that vfio-user clients don't cause collisions when they map
> > BARs to the same address.
> > 
> > I think the the separate root complex idea is a good solution. This
> > patch series takes a different approach by adding the concept of
> > isolated address spaces into hw/pci/.
> 
> This all still seems pretty sketchy, BARs cannot overlap within the
> same vCPU address space, perhaps with the exception of when they're
> being sized, but DMA should be disabled during sizing.
> 
> Devices within the same VM context with identical BARs would need to
> operate in different address spaces.  For example a translation offset
> in the vCPU address space would allow unique addressing to the devices,
> perhaps using the translation offset bits to address a root complex and
> masking those bits for downstream transactions.
> 
> In general, the device simply operates in an address space, ie. an
> IOVA.  When a mapping is made within that address space, we perform a
> translation as necessary to generate a guest physical address.  The
> IOVA itself is only meaningful within the context of the address space,
> there is no requirement or expectation for it to be globally unique.
> 
> If the vfio-user server is making some sort of requirement that IOVAs
> are unique across all devices, that seems very, very wrong.  Thanks,

Yes, BARs and IOVAs don't need to be unique across all devices.

The issue is that there can be as many guest physical address spaces as
there are vfio-user clients connected, so per-client isolated address
spaces are required. This patch series has a solution to that problem
with the new pci_isol_as_mem/io() API.

What I find strange about this approach is that exported PCI devices are
on PCI root ports that are connected to the machine's main PCI bus. The
PCI devices don't interact with the main bus's IOVA space, guest
physical memory space, or interrupts. It seems hacky to graft isolated
devices onto a parent bus that provides nothing to its children. I
wonder if it would be cleaner for every vfio-user server to have its own
PCIHost. Then it may be possible to drop the new pci_isol_as_mem/io()
API.

Stefan
Dr. David Alan Gilbert Feb. 1, 2022, 10:42 a.m. UTC | #20
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Thu, 27 Jan 2022 08:30:13 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> > On Wed, Jan 26, 2022 at 04:13:33PM -0500, Michael S. Tsirkin wrote:
> > > On Wed, Jan 26, 2022 at 08:07:36PM +0000, Dr. David Alan Gilbert wrote:  
> > > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:  
> > > > > On Wed, Jan 26, 2022 at 05:27:32AM +0000, Jag Raman wrote:  
> > > > > > 
> > > > > >   
> > > > > > > On Jan 25, 2022, at 1:38 PM, Dr. David Alan Gilbert <dgilbert@redhat.com> wrote:
> > > > > > > 
> > > > > > > * Jag Raman (jag.raman@oracle.com) wrote:  
> > > > > > >> 
> > > > > > >>   
> > > > > > >>> On Jan 19, 2022, at 7:12 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >>> 
> > > > > > >>> On Wed, Jan 19, 2022 at 04:41:52PM -0500, Jagannathan Raman wrote:  
> > > > > > >>>> Allow PCI buses to be part of isolated CPU address spaces. This has a
> > > > > > >>>> niche usage.
> > > > > > >>>> 
> > > > > > >>>> TYPE_REMOTE_MACHINE allows multiple VMs to house their PCI devices in
> > > > > > >>>> the same machine/server. This would cause address space collision as
> > > > > > >>>> well as be a security vulnerability. Having separate address spaces for
> > > > > > >>>> each PCI bus would solve this problem.  
> > > > > > >>> 
> > > > > > >>> Fascinating, but I am not sure I understand. any examples?  
> > > > > > >> 
> > > > > > >> Hi Michael!
> > > > > > >> 
> > > > > > >> multiprocess QEMU and vfio-user implement a client-server model to allow
> > > > > > >> out-of-process emulation of devices. The client QEMU, which makes ioctls
> > > > > > >> to the kernel and runs VCPUs, could attach devices running in a server
> > > > > > >> QEMU. The server QEMU needs access to parts of the client’s RAM to
> > > > > > >> perform DMA.  
> > > > > > > 
> > > > > > > Do you ever have the opposite problem? i.e. when an emulated PCI device  
> > > > > > 
> > > > > > That’s an interesting question.
> > > > > >   
> > > > > > > exposes a chunk of RAM-like space (frame buffer, or maybe a mapped file)
> > > > > > > that the client can see.  What happens if two emulated devices need to
> > > > > > > access each others emulated address space?  
> > > > > > 
> > > > > > In this case, the kernel driver would map the destination’s chunk of internal RAM into
> > > > > > the DMA space of the source device. Then the source device could write to that
> > > > > > mapped address range, and the IOMMU should direct those writes to the
> > > > > > destination device.
> > > > > > 
> > > > > > I would like to take a closer look at the IOMMU implementation on how to achieve
> > > > > > this, and get back to you. I think the IOMMU would handle this. Could you please
> > > > > > point me to the IOMMU implementation you have in mind?  
> > > > > 
> > > > > I don't know if the current vfio-user client/server patches already
> > > > > implement device-to-device DMA, but the functionality is supported by
> > > > > the vfio-user protocol.
> > > > > 
> > > > > Basically: if the DMA regions lookup inside the vfio-user server fails,
> > > > > fall back to VFIO_USER_DMA_READ/WRITE messages instead.
> > > > > https://github.com/nutanix/libvfio-user/blob/master/docs/vfio-user.rst#vfio-user-dma-read
> > > > > 
> > > > > Here is the flow:
> > > > > 1. The vfio-user server with device A sends a DMA read to QEMU.
> > > > > 2. QEMU finds the MemoryRegion associated with the DMA address and sees
> > > > >    it's a device.
> > > > >    a. If it's emulated inside the QEMU process then the normal
> > > > >       device emulation code kicks in.
> > > > >    b. If it's another vfio-user PCI device then the vfio-user PCI proxy
> > > > >       device forwards the DMA to the second vfio-user server's device B.  
> > > > 
> > > > I'm starting to be curious if there's a way to persuade the guest kernel
> > > > to do it for us; in general is there a way to say to PCI devices that
> > > > they can only DMA to the host and not other PCI devices?  
> > > 
> > > 
> > > But of course - this is how e.g. VFIO protects host PCI devices from
> > > each other when one of them is passed through to a VM.  
> > 
> > Michael: Are you saying just turn on vIOMMU? :)
> > 
> > Devices in different VFIO groups have their own IOMMU context, so their
> > IOVA space is isolated. Just don't map other devices into the IOVA space
> > and those other devices will be inaccessible.
> 
> Devices in different VFIO *containers* have their own IOMMU context.
> Based on the group attachment to a container, groups can either have
> shared or isolated IOVA space.  That determination is made by looking
> at the address space of the bus, which is governed by the presence of a
> vIOMMU.
> 
> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> (p2p), why are we trying to re-invent what an IOMMU already does?

That was what I was curious about - is it possible to get an IOMMU to do
that, and how? (Not knowing much about IOMMUs).
In my DAX/virtiofs case, I want the device to be able to DMA to guest
RAM but for other devices not to try to DMA to it and in particular for
it not to have to DMA to other devices.

>  In
> fact, it seems like an IOMMU does this better in providing an IOVA
> address space per BDF.  Is the dynamic mapping overhead too much?  What
> physical hardware properties or specifications could we leverage to
> restrict p2p mappings to a device?  Should it be governed by machine
> type to provide consistency between devices?  Should each "isolated"
> bus be in a separate root complex?  Thanks,

Dave

> Alex
>
Alex Williamson Feb. 1, 2022, 3:24 p.m. UTC | #21
On Tue, 1 Feb 2022 09:30:35 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:
> > On Fri, 28 Jan 2022 09:18:08 +0000
> > Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >   
> > > On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:  
> > > > If the goal here is to restrict DMA between devices, ie. peer-to-peer
> > > > (p2p), why are we trying to re-invent what an IOMMU already does?    
> > > 
> > > The issue Dave raised is that vfio-user servers run in separate
> > > processses from QEMU with shared memory access to RAM but no direct
> > > access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> > > example of a non-RAM MemoryRegion that can be the source/target of DMA
> > > requests.
> > > 
> > > I don't think IOMMUs solve this problem but luckily the vfio-user
> > > protocol already has messages that vfio-user servers can use as a
> > > fallback when DMA cannot be completed through the shared memory RAM
> > > accesses.
> > >   
> > > > In
> > > > fact, it seems like an IOMMU does this better in providing an IOVA
> > > > address space per BDF.  Is the dynamic mapping overhead too much?  What
> > > > physical hardware properties or specifications could we leverage to
> > > > restrict p2p mappings to a device?  Should it be governed by machine
> > > > type to provide consistency between devices?  Should each "isolated"
> > > > bus be in a separate root complex?  Thanks,    
> > > 
> > > There is a separate issue in this patch series regarding isolating the
> > > address space where BAR accesses are made (i.e. the global
> > > address_space_memory/io). When one process hosts multiple vfio-user
> > > server instances (e.g. a software-defined network switch with multiple
> > > ethernet devices) then each instance needs isolated memory and io address
> > > spaces so that vfio-user clients don't cause collisions when they map
> > > BARs to the same address.
> > > 
> > > I think the the separate root complex idea is a good solution. This
> > > patch series takes a different approach by adding the concept of
> > > isolated address spaces into hw/pci/.  
> > 
> > This all still seems pretty sketchy, BARs cannot overlap within the
> > same vCPU address space, perhaps with the exception of when they're
> > being sized, but DMA should be disabled during sizing.
> > 
> > Devices within the same VM context with identical BARs would need to
> > operate in different address spaces.  For example a translation offset
> > in the vCPU address space would allow unique addressing to the devices,
> > perhaps using the translation offset bits to address a root complex and
> > masking those bits for downstream transactions.
> > 
> > In general, the device simply operates in an address space, ie. an
> > IOVA.  When a mapping is made within that address space, we perform a
> > translation as necessary to generate a guest physical address.  The
> > IOVA itself is only meaningful within the context of the address space,
> > there is no requirement or expectation for it to be globally unique.
> > 
> > If the vfio-user server is making some sort of requirement that IOVAs
> > are unique across all devices, that seems very, very wrong.  Thanks,  
> 
> Yes, BARs and IOVAs don't need to be unique across all devices.
> 
> The issue is that there can be as many guest physical address spaces as
> there are vfio-user clients connected, so per-client isolated address
> spaces are required. This patch series has a solution to that problem
> with the new pci_isol_as_mem/io() API.

Sorry, this still doesn't follow for me.  A server that hosts multiple
devices across many VMs (I'm not sure if you're referring to the device
or the VM as a client) needs to deal with different address spaces per
device.  The server needs to be able to uniquely identify every DMA,
which must be part of the interface protocol.  But I don't see how that
imposes a requirement of an isolated address space.  If we want the
device isolated because we don't trust the server, that's where an IOMMU
provides per device isolation.  What is the restriction of the
per-client isolated address space and why do we need it?  The server
needing to support multiple clients is not a sufficient answer to
impose new PCI bus types with an implicit restriction on the VM.
 
> What I find strange about this approach is that exported PCI devices are
> on PCI root ports that are connected to the machine's main PCI bus. The
> PCI devices don't interact with the main bus's IOVA space, guest
> physical memory space, or interrupts. It seems hacky to graft isolated
> devices onto a parent bus that provides nothing to its children. I
> wonder if it would be cleaner for every vfio-user server to have its own
> PCIHost. Then it may be possible to drop the new pci_isol_as_mem/io()
> API.

This is getting a bit ridiculous, if vfio-user devices require this
degree of manipulation of the VM topology into things that don't exist
on bare metal, we've done something very wrong.  Thanks,

Alex
Jag Raman Feb. 1, 2022, 9:24 p.m. UTC | #22
> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Tue, 1 Feb 2022 09:30:35 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:
>>> On Fri, 28 Jan 2022 09:18:08 +0000
>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:  
>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?    
>>>> 
>>>> The issue Dave raised is that vfio-user servers run in separate
>>>> processses from QEMU with shared memory access to RAM but no direct
>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
>>>> requests.
>>>> 
>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
>>>> protocol already has messages that vfio-user servers can use as a
>>>> fallback when DMA cannot be completed through the shared memory RAM
>>>> accesses.
>>>> 
>>>>> In
>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
>>>>> physical hardware properties or specifications could we leverage to
>>>>> restrict p2p mappings to a device?  Should it be governed by machine
>>>>> type to provide consistency between devices?  Should each "isolated"
>>>>> bus be in a separate root complex?  Thanks,    
>>>> 
>>>> There is a separate issue in this patch series regarding isolating the
>>>> address space where BAR accesses are made (i.e. the global
>>>> address_space_memory/io). When one process hosts multiple vfio-user
>>>> server instances (e.g. a software-defined network switch with multiple
>>>> ethernet devices) then each instance needs isolated memory and io address
>>>> spaces so that vfio-user clients don't cause collisions when they map
>>>> BARs to the same address.
>>>> 
>>>> I think the the separate root complex idea is a good solution. This
>>>> patch series takes a different approach by adding the concept of
>>>> isolated address spaces into hw/pci/.  
>>> 
>>> This all still seems pretty sketchy, BARs cannot overlap within the
>>> same vCPU address space, perhaps with the exception of when they're
>>> being sized, but DMA should be disabled during sizing.
>>> 
>>> Devices within the same VM context with identical BARs would need to
>>> operate in different address spaces.  For example a translation offset
>>> in the vCPU address space would allow unique addressing to the devices,
>>> perhaps using the translation offset bits to address a root complex and
>>> masking those bits for downstream transactions.
>>> 
>>> In general, the device simply operates in an address space, ie. an
>>> IOVA.  When a mapping is made within that address space, we perform a
>>> translation as necessary to generate a guest physical address.  The
>>> IOVA itself is only meaningful within the context of the address space,
>>> there is no requirement or expectation for it to be globally unique.
>>> 
>>> If the vfio-user server is making some sort of requirement that IOVAs
>>> are unique across all devices, that seems very, very wrong.  Thanks,  
>> 
>> Yes, BARs and IOVAs don't need to be unique across all devices.
>> 
>> The issue is that there can be as many guest physical address spaces as
>> there are vfio-user clients connected, so per-client isolated address
>> spaces are required. This patch series has a solution to that problem
>> with the new pci_isol_as_mem/io() API.
> 
> Sorry, this still doesn't follow for me.  A server that hosts multiple
> devices across many VMs (I'm not sure if you're referring to the device
> or the VM as a client) needs to deal with different address spaces per
> device.  The server needs to be able to uniquely identify every DMA,
> which must be part of the interface protocol.  But I don't see how that
> imposes a requirement of an isolated address space.  If we want the
> device isolated because we don't trust the server, that's where an IOMMU
> provides per device isolation.  What is the restriction of the
> per-client isolated address space and why do we need it?  The server
> needing to support multiple clients is not a sufficient answer to
> impose new PCI bus types with an implicit restriction on the VM.

Hi Alex,

I believe there are two separate problems with running PCI devices in
the vfio-user server. The first one is concerning memory isolation and
second one is vectoring of BAR accesses (as explained below).

In our previous patches (v3), we used an IOMMU to isolate memory
spaces. But we still had trouble with the vectoring. So we implemented
separate address spaces for each PCIBus to tackle both problems
simultaneously, based on the feedback we got.

The following gives an overview of issues concerning vectoring of
BAR accesses.

The device’s BAR regions are mapped into the guest physical address
space. The guest writes the guest PA of each BAR into the device’s BAR
registers. To access the BAR regions of the device, QEMU uses
address_space_rw() which vectors the physical address access to the
device BAR region handlers.

The PCIBus data structure already has address_space_mem and
address_space_io to contain the BAR regions of devices attached
to it. I understand that these two PCIBus members form the
PCI address space.

Typically, the machines map the PCI address space into the system address
space. For example, pc_pci_as_mapping_init() does this for ‘pc' machine types.
As such, there is a 1:1 mapping between system address space and PCI address
space of the root bus. Since all the PCI devices in the machine are assigned to
the same VM, we could map the PCI address space of all PCI buses to the same
system address space.

Whereas in the case of vfio-user, the devices running in the server could
belong to different VMs. Therefore, along with the physical address, we would
need to know the address space that the device belongs for
address_space_rw() to successfully vector BAR accesses into the PCI device.

Thank you!
--
Jag

> 
>> What I find strange about this approach is that exported PCI devices are
>> on PCI root ports that are connected to the machine's main PCI bus. The
>> PCI devices don't interact with the main bus's IOVA space, guest
>> physical memory space, or interrupts. It seems hacky to graft isolated
>> devices onto a parent bus that provides nothing to its children. I
>> wonder if it would be cleaner for every vfio-user server to have its own
>> PCIHost. Then it may be possible to drop the new pci_isol_as_mem/io()
>> API.
> 
> This is getting a bit ridiculous, if vfio-user devices require this
> degree of manipulation of the VM topology into things that don't exist
> on bare metal, we've done something very wrong.  Thanks,
> 
> Alex
>
Alex Williamson Feb. 1, 2022, 10:47 p.m. UTC | #23
On Tue, 1 Feb 2022 21:24:08 +0000
Jag Raman <jag.raman@oracle.com> wrote:

> > On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > On Tue, 1 Feb 2022 09:30:35 +0000
> > Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >   
> >> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:  
> >>> On Fri, 28 Jan 2022 09:18:08 +0000
> >>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>   
> >>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:    
> >>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> >>>>> (p2p), why are we trying to re-invent what an IOMMU already does?      
> >>>> 
> >>>> The issue Dave raised is that vfio-user servers run in separate
> >>>> processses from QEMU with shared memory access to RAM but no direct
> >>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> >>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
> >>>> requests.
> >>>> 
> >>>> I don't think IOMMUs solve this problem but luckily the vfio-user
> >>>> protocol already has messages that vfio-user servers can use as a
> >>>> fallback when DMA cannot be completed through the shared memory RAM
> >>>> accesses.
> >>>>   
> >>>>> In
> >>>>> fact, it seems like an IOMMU does this better in providing an IOVA
> >>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
> >>>>> physical hardware properties or specifications could we leverage to
> >>>>> restrict p2p mappings to a device?  Should it be governed by machine
> >>>>> type to provide consistency between devices?  Should each "isolated"
> >>>>> bus be in a separate root complex?  Thanks,      
> >>>> 
> >>>> There is a separate issue in this patch series regarding isolating the
> >>>> address space where BAR accesses are made (i.e. the global
> >>>> address_space_memory/io). When one process hosts multiple vfio-user
> >>>> server instances (e.g. a software-defined network switch with multiple
> >>>> ethernet devices) then each instance needs isolated memory and io address
> >>>> spaces so that vfio-user clients don't cause collisions when they map
> >>>> BARs to the same address.
> >>>> 
> >>>> I think the the separate root complex idea is a good solution. This
> >>>> patch series takes a different approach by adding the concept of
> >>>> isolated address spaces into hw/pci/.    
> >>> 
> >>> This all still seems pretty sketchy, BARs cannot overlap within the
> >>> same vCPU address space, perhaps with the exception of when they're
> >>> being sized, but DMA should be disabled during sizing.
> >>> 
> >>> Devices within the same VM context with identical BARs would need to
> >>> operate in different address spaces.  For example a translation offset
> >>> in the vCPU address space would allow unique addressing to the devices,
> >>> perhaps using the translation offset bits to address a root complex and
> >>> masking those bits for downstream transactions.
> >>> 
> >>> In general, the device simply operates in an address space, ie. an
> >>> IOVA.  When a mapping is made within that address space, we perform a
> >>> translation as necessary to generate a guest physical address.  The
> >>> IOVA itself is only meaningful within the context of the address space,
> >>> there is no requirement or expectation for it to be globally unique.
> >>> 
> >>> If the vfio-user server is making some sort of requirement that IOVAs
> >>> are unique across all devices, that seems very, very wrong.  Thanks,    
> >> 
> >> Yes, BARs and IOVAs don't need to be unique across all devices.
> >> 
> >> The issue is that there can be as many guest physical address spaces as
> >> there are vfio-user clients connected, so per-client isolated address
> >> spaces are required. This patch series has a solution to that problem
> >> with the new pci_isol_as_mem/io() API.  
> > 
> > Sorry, this still doesn't follow for me.  A server that hosts multiple
> > devices across many VMs (I'm not sure if you're referring to the device
> > or the VM as a client) needs to deal with different address spaces per
> > device.  The server needs to be able to uniquely identify every DMA,
> > which must be part of the interface protocol.  But I don't see how that
> > imposes a requirement of an isolated address space.  If we want the
> > device isolated because we don't trust the server, that's where an IOMMU
> > provides per device isolation.  What is the restriction of the
> > per-client isolated address space and why do we need it?  The server
> > needing to support multiple clients is not a sufficient answer to
> > impose new PCI bus types with an implicit restriction on the VM.  
> 
> Hi Alex,
> 
> I believe there are two separate problems with running PCI devices in
> the vfio-user server. The first one is concerning memory isolation and
> second one is vectoring of BAR accesses (as explained below).
> 
> In our previous patches (v3), we used an IOMMU to isolate memory
> spaces. But we still had trouble with the vectoring. So we implemented
> separate address spaces for each PCIBus to tackle both problems
> simultaneously, based on the feedback we got.
> 
> The following gives an overview of issues concerning vectoring of
> BAR accesses.
> 
> The device’s BAR regions are mapped into the guest physical address
> space. The guest writes the guest PA of each BAR into the device’s BAR
> registers. To access the BAR regions of the device, QEMU uses
> address_space_rw() which vectors the physical address access to the
> device BAR region handlers.

The guest physical address written to the BAR is irrelevant from the
device perspective, this only serves to assign the BAR an offset within
the address_space_mem, which is used by the vCPU (and possibly other
devices depending on their address space).  There is no reason for the
device itself to care about this address.
 
> The PCIBus data structure already has address_space_mem and
> address_space_io to contain the BAR regions of devices attached
> to it. I understand that these two PCIBus members form the
> PCI address space.

These are the CPU address spaces.  When there's no IOMMU, the PCI bus is
identity mapped to the CPU address space.  When there is an IOMMU, the
device address space is determined by the granularity of the IOMMU and
may be entirely separate from address_space_mem.

I/O port space is always the identity mapped CPU address space unless
sparse translations are used to create multiple I/O port spaces (not
implemented).  I/O port space is only accessed by the CPU, there are no
device initiated I/O port transactions, so the address space relative
to the device is irrelevant.

> Typically, the machines map the PCI address space into the system address
> space. For example, pc_pci_as_mapping_init() does this for ‘pc' machine types.
> As such, there is a 1:1 mapping between system address space and PCI address
> space of the root bus. Since all the PCI devices in the machine are assigned to
> the same VM, we could map the PCI address space of all PCI buses to the same
> system address space.

"Typically" only if we're restricted to the "pc", ie. i440FX, machine
type since it doesn't support a vIOMMU.  There's no reason to focus on
the identity map case versus the vIOMMU case.

> Whereas in the case of vfio-user, the devices running in the server could
> belong to different VMs. Therefore, along with the physical address, we would
> need to know the address space that the device belongs for
> address_space_rw() to successfully vector BAR accesses into the PCI device.

But as far as device initiated transactions, there is only one address
space for a given device, it's either address_space_mem or one provided
by the vIOMMU and pci_device_iommu_address_space() tells us that
address space.  Furthermore, the device never operates on a "physical
address", it only ever operates on an IOVA, ie. an offset within the
address space assigned to the device.  The IOVA should be considered
arbitrary relative to mappings in any other address spaces.

Device initiated transactions operate on an IOVA within the (single)
address space to which the device is assigned.  Any attempt to do
otherwise violates the isolation put in place by things like vIOMMUs
and ought to be considered a security concern, especially for a device
serviced by an external process.  Thanks,

Alex
Jag Raman Feb. 2, 2022, 1:13 a.m. UTC | #24
> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Tue, 1 Feb 2022 21:24:08 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> 
>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>> 
>>> On Tue, 1 Feb 2022 09:30:35 +0000
>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:  
>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>> 
>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:    
>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?      
>>>>>> 
>>>>>> The issue Dave raised is that vfio-user servers run in separate
>>>>>> processses from QEMU with shared memory access to RAM but no direct
>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
>>>>>> requests.
>>>>>> 
>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
>>>>>> protocol already has messages that vfio-user servers can use as a
>>>>>> fallback when DMA cannot be completed through the shared memory RAM
>>>>>> accesses.
>>>>>> 
>>>>>>> In
>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
>>>>>>> physical hardware properties or specifications could we leverage to
>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
>>>>>>> type to provide consistency between devices?  Should each "isolated"
>>>>>>> bus be in a separate root complex?  Thanks,      
>>>>>> 
>>>>>> There is a separate issue in this patch series regarding isolating the
>>>>>> address space where BAR accesses are made (i.e. the global
>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
>>>>>> server instances (e.g. a software-defined network switch with multiple
>>>>>> ethernet devices) then each instance needs isolated memory and io address
>>>>>> spaces so that vfio-user clients don't cause collisions when they map
>>>>>> BARs to the same address.
>>>>>> 
>>>>>> I think the the separate root complex idea is a good solution. This
>>>>>> patch series takes a different approach by adding the concept of
>>>>>> isolated address spaces into hw/pci/.    
>>>>> 
>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
>>>>> same vCPU address space, perhaps with the exception of when they're
>>>>> being sized, but DMA should be disabled during sizing.
>>>>> 
>>>>> Devices within the same VM context with identical BARs would need to
>>>>> operate in different address spaces.  For example a translation offset
>>>>> in the vCPU address space would allow unique addressing to the devices,
>>>>> perhaps using the translation offset bits to address a root complex and
>>>>> masking those bits for downstream transactions.
>>>>> 
>>>>> In general, the device simply operates in an address space, ie. an
>>>>> IOVA.  When a mapping is made within that address space, we perform a
>>>>> translation as necessary to generate a guest physical address.  The
>>>>> IOVA itself is only meaningful within the context of the address space,
>>>>> there is no requirement or expectation for it to be globally unique.
>>>>> 
>>>>> If the vfio-user server is making some sort of requirement that IOVAs
>>>>> are unique across all devices, that seems very, very wrong.  Thanks,    
>>>> 
>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
>>>> 
>>>> The issue is that there can be as many guest physical address spaces as
>>>> there are vfio-user clients connected, so per-client isolated address
>>>> spaces are required. This patch series has a solution to that problem
>>>> with the new pci_isol_as_mem/io() API.  
>>> 
>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
>>> devices across many VMs (I'm not sure if you're referring to the device
>>> or the VM as a client) needs to deal with different address spaces per
>>> device.  The server needs to be able to uniquely identify every DMA,
>>> which must be part of the interface protocol.  But I don't see how that
>>> imposes a requirement of an isolated address space.  If we want the
>>> device isolated because we don't trust the server, that's where an IOMMU
>>> provides per device isolation.  What is the restriction of the
>>> per-client isolated address space and why do we need it?  The server
>>> needing to support multiple clients is not a sufficient answer to
>>> impose new PCI bus types with an implicit restriction on the VM.  
>> 
>> Hi Alex,
>> 
>> I believe there are two separate problems with running PCI devices in
>> the vfio-user server. The first one is concerning memory isolation and
>> second one is vectoring of BAR accesses (as explained below).
>> 
>> In our previous patches (v3), we used an IOMMU to isolate memory
>> spaces. But we still had trouble with the vectoring. So we implemented
>> separate address spaces for each PCIBus to tackle both problems
>> simultaneously, based on the feedback we got.
>> 
>> The following gives an overview of issues concerning vectoring of
>> BAR accesses.
>> 
>> The device’s BAR regions are mapped into the guest physical address
>> space. The guest writes the guest PA of each BAR into the device’s BAR
>> registers. To access the BAR regions of the device, QEMU uses
>> address_space_rw() which vectors the physical address access to the
>> device BAR region handlers.
> 
> The guest physical address written to the BAR is irrelevant from the
> device perspective, this only serves to assign the BAR an offset within
> the address_space_mem, which is used by the vCPU (and possibly other
> devices depending on their address space).  There is no reason for the
> device itself to care about this address.

Thank you for the explanation, Alex!

The confusion at my part is whether we are inside the device already when
the server receives a request to access BAR region of a device. Based on
your explanation, I get that your view is the BAR access request has
propagated into the device already, whereas I was under the impression
that the request is still on the CPU side of the PCI root complex.

Your view makes sense to me - once the BAR access request reaches the
client (on the other side), we could consider that the request has reached
the device.

On a separate note, if devices don’t care about the values in BAR
registers, why do the default PCI config handlers intercept and map
the BAR region into address_space_mem?
(pci_default_write_config() -> pci_update_mappings())

Thank you!
--
Jag

> 
>> The PCIBus data structure already has address_space_mem and
>> address_space_io to contain the BAR regions of devices attached
>> to it. I understand that these two PCIBus members form the
>> PCI address space.
> 
> These are the CPU address spaces.  When there's no IOMMU, the PCI bus is
> identity mapped to the CPU address space.  When there is an IOMMU, the
> device address space is determined by the granularity of the IOMMU and
> may be entirely separate from address_space_mem.
> 
> I/O port space is always the identity mapped CPU address space unless
> sparse translations are used to create multiple I/O port spaces (not
> implemented).  I/O port space is only accessed by the CPU, there are no
> device initiated I/O port transactions, so the address space relative
> to the device is irrelevant.
> 
>> Typically, the machines map the PCI address space into the system address
>> space. For example, pc_pci_as_mapping_init() does this for ‘pc' machine types.
>> As such, there is a 1:1 mapping between system address space and PCI address
>> space of the root bus. Since all the PCI devices in the machine are assigned to
>> the same VM, we could map the PCI address space of all PCI buses to the same
>> system address space.
> 
> "Typically" only if we're restricted to the "pc", ie. i440FX, machine
> type since it doesn't support a vIOMMU.  There's no reason to focus on
> the identity map case versus the vIOMMU case.
> 
>> Whereas in the case of vfio-user, the devices running in the server could
>> belong to different VMs. Therefore, along with the physical address, we would
>> need to know the address space that the device belongs for
>> address_space_rw() to successfully vector BAR accesses into the PCI device.
> 
> But as far as device initiated transactions, there is only one address
> space for a given device, it's either address_space_mem or one provided
> by the vIOMMU and pci_device_iommu_address_space() tells us that
> address space.  Furthermore, the device never operates on a "physical
> address", it only ever operates on an IOVA, ie. an offset within the
> address space assigned to the device.  The IOVA should be considered
> arbitrary relative to mappings in any other address spaces.
> 
> Device initiated transactions operate on an IOVA within the (single)
> address space to which the device is assigned.  Any attempt to do
> otherwise violates the isolation put in place by things like vIOMMUs
> and ought to be considered a security concern, especially for a device
> serviced by an external process.  Thanks,
> 
> Alex
>
Alex Williamson Feb. 2, 2022, 5:34 a.m. UTC | #25
On Wed, 2 Feb 2022 01:13:22 +0000
Jag Raman <jag.raman@oracle.com> wrote:

> > On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > On Tue, 1 Feb 2022 21:24:08 +0000
> > Jag Raman <jag.raman@oracle.com> wrote:
> >   
> >>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>> 
> >>> On Tue, 1 Feb 2022 09:30:35 +0000
> >>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>   
> >>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
> >>>>> On Fri, 28 Jan 2022 09:18:08 +0000
> >>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>>   
> >>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
> >>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> >>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
> >>>>>> 
> >>>>>> The issue Dave raised is that vfio-user servers run in separate
> >>>>>> processses from QEMU with shared memory access to RAM but no direct
> >>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> >>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
> >>>>>> requests.
> >>>>>> 
> >>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
> >>>>>> protocol already has messages that vfio-user servers can use as a
> >>>>>> fallback when DMA cannot be completed through the shared memory RAM
> >>>>>> accesses.
> >>>>>>   
> >>>>>>> In
> >>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
> >>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
> >>>>>>> physical hardware properties or specifications could we leverage to
> >>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
> >>>>>>> type to provide consistency between devices?  Should each "isolated"
> >>>>>>> bus be in a separate root complex?  Thanks,        
> >>>>>> 
> >>>>>> There is a separate issue in this patch series regarding isolating the
> >>>>>> address space where BAR accesses are made (i.e. the global
> >>>>>> address_space_memory/io). When one process hosts multiple vfio-user
> >>>>>> server instances (e.g. a software-defined network switch with multiple
> >>>>>> ethernet devices) then each instance needs isolated memory and io address
> >>>>>> spaces so that vfio-user clients don't cause collisions when they map
> >>>>>> BARs to the same address.
> >>>>>> 
> >>>>>> I think the the separate root complex idea is a good solution. This
> >>>>>> patch series takes a different approach by adding the concept of
> >>>>>> isolated address spaces into hw/pci/.      
> >>>>> 
> >>>>> This all still seems pretty sketchy, BARs cannot overlap within the
> >>>>> same vCPU address space, perhaps with the exception of when they're
> >>>>> being sized, but DMA should be disabled during sizing.
> >>>>> 
> >>>>> Devices within the same VM context with identical BARs would need to
> >>>>> operate in different address spaces.  For example a translation offset
> >>>>> in the vCPU address space would allow unique addressing to the devices,
> >>>>> perhaps using the translation offset bits to address a root complex and
> >>>>> masking those bits for downstream transactions.
> >>>>> 
> >>>>> In general, the device simply operates in an address space, ie. an
> >>>>> IOVA.  When a mapping is made within that address space, we perform a
> >>>>> translation as necessary to generate a guest physical address.  The
> >>>>> IOVA itself is only meaningful within the context of the address space,
> >>>>> there is no requirement or expectation for it to be globally unique.
> >>>>> 
> >>>>> If the vfio-user server is making some sort of requirement that IOVAs
> >>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
> >>>> 
> >>>> Yes, BARs and IOVAs don't need to be unique across all devices.
> >>>> 
> >>>> The issue is that there can be as many guest physical address spaces as
> >>>> there are vfio-user clients connected, so per-client isolated address
> >>>> spaces are required. This patch series has a solution to that problem
> >>>> with the new pci_isol_as_mem/io() API.    
> >>> 
> >>> Sorry, this still doesn't follow for me.  A server that hosts multiple
> >>> devices across many VMs (I'm not sure if you're referring to the device
> >>> or the VM as a client) needs to deal with different address spaces per
> >>> device.  The server needs to be able to uniquely identify every DMA,
> >>> which must be part of the interface protocol.  But I don't see how that
> >>> imposes a requirement of an isolated address space.  If we want the
> >>> device isolated because we don't trust the server, that's where an IOMMU
> >>> provides per device isolation.  What is the restriction of the
> >>> per-client isolated address space and why do we need it?  The server
> >>> needing to support multiple clients is not a sufficient answer to
> >>> impose new PCI bus types with an implicit restriction on the VM.    
> >> 
> >> Hi Alex,
> >> 
> >> I believe there are two separate problems with running PCI devices in
> >> the vfio-user server. The first one is concerning memory isolation and
> >> second one is vectoring of BAR accesses (as explained below).
> >> 
> >> In our previous patches (v3), we used an IOMMU to isolate memory
> >> spaces. But we still had trouble with the vectoring. So we implemented
> >> separate address spaces for each PCIBus to tackle both problems
> >> simultaneously, based on the feedback we got.
> >> 
> >> The following gives an overview of issues concerning vectoring of
> >> BAR accesses.
> >> 
> >> The device’s BAR regions are mapped into the guest physical address
> >> space. The guest writes the guest PA of each BAR into the device’s BAR
> >> registers. To access the BAR regions of the device, QEMU uses
> >> address_space_rw() which vectors the physical address access to the
> >> device BAR region handlers.  
> > 
> > The guest physical address written to the BAR is irrelevant from the
> > device perspective, this only serves to assign the BAR an offset within
> > the address_space_mem, which is used by the vCPU (and possibly other
> > devices depending on their address space).  There is no reason for the
> > device itself to care about this address.  
> 
> Thank you for the explanation, Alex!
> 
> The confusion at my part is whether we are inside the device already when
> the server receives a request to access BAR region of a device. Based on
> your explanation, I get that your view is the BAR access request has
> propagated into the device already, whereas I was under the impression
> that the request is still on the CPU side of the PCI root complex.

If you are getting an access through your MemoryRegionOps, all the
translations have been made, you simply need to use the hwaddr as the
offset into the MemoryRegion for the access.  Perform the read/write to
your device, no further translations required.
 
> Your view makes sense to me - once the BAR access request reaches the
> client (on the other side), we could consider that the request has reached
> the device.
> 
> On a separate note, if devices don’t care about the values in BAR
> registers, why do the default PCI config handlers intercept and map
> the BAR region into address_space_mem?
> (pci_default_write_config() -> pci_update_mappings())

This is the part that's actually placing the BAR MemoryRegion as a
sub-region into the vCPU address space.  I think if you track it,
you'll see PCIDevice.io_regions[i].address_space is actually
system_memory, which is used to initialize address_space_system.

The machine assembles PCI devices onto buses as instructed by the
command line or hot plug operations.  It's the responsibility of the
guest firmware and guest OS to probe those devices, size the BARs, and
place the BARs into the memory hierarchy of the PCI bus, ie. system
memory.  The BARs are necessarily in the "guest physical memory" for
vCPU access, but it's essentially only coincidental that PCI devices
might be in an address space that provides a mapping to their own BAR.
There's no reason to ever use it.

In the vIOMMU case, we can't know that the device address space
includes those BAR mappings or if they do, that they're identity mapped
to the physical address.  Devices really need to not infer anything
about an address.  Think about real hardware, a device is told by
driver programming to perform a DMA operation.  The device doesn't know
the target of that operation, it's the guest driver's responsibility to
make sure the IOVA within the device address space is valid and maps to
the desired target.  Thanks,

Alex
Stefan Hajnoczi Feb. 2, 2022, 9:22 a.m. UTC | #26
On Tue, Feb 01, 2022 at 10:34:32PM -0700, Alex Williamson wrote:
> On Wed, 2 Feb 2022 01:13:22 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> 
> > > On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > > 
> > > On Tue, 1 Feb 2022 21:24:08 +0000
> > > Jag Raman <jag.raman@oracle.com> wrote:
> > >   
> > >>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > >>> 
> > >>> On Tue, 1 Feb 2022 09:30:35 +0000
> > >>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >>>   
> > >>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
> > >>>>> On Fri, 28 Jan 2022 09:18:08 +0000
> > >>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >>>>>   
> > >>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
> > >>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> > >>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
> > >>>>>> 
> > >>>>>> The issue Dave raised is that vfio-user servers run in separate
> > >>>>>> processses from QEMU with shared memory access to RAM but no direct
> > >>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> > >>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
> > >>>>>> requests.
> > >>>>>> 
> > >>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
> > >>>>>> protocol already has messages that vfio-user servers can use as a
> > >>>>>> fallback when DMA cannot be completed through the shared memory RAM
> > >>>>>> accesses.
> > >>>>>>   
> > >>>>>>> In
> > >>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
> > >>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
> > >>>>>>> physical hardware properties or specifications could we leverage to
> > >>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
> > >>>>>>> type to provide consistency between devices?  Should each "isolated"
> > >>>>>>> bus be in a separate root complex?  Thanks,        
> > >>>>>> 
> > >>>>>> There is a separate issue in this patch series regarding isolating the
> > >>>>>> address space where BAR accesses are made (i.e. the global
> > >>>>>> address_space_memory/io). When one process hosts multiple vfio-user
> > >>>>>> server instances (e.g. a software-defined network switch with multiple
> > >>>>>> ethernet devices) then each instance needs isolated memory and io address
> > >>>>>> spaces so that vfio-user clients don't cause collisions when they map
> > >>>>>> BARs to the same address.
> > >>>>>> 
> > >>>>>> I think the the separate root complex idea is a good solution. This
> > >>>>>> patch series takes a different approach by adding the concept of
> > >>>>>> isolated address spaces into hw/pci/.      
> > >>>>> 
> > >>>>> This all still seems pretty sketchy, BARs cannot overlap within the
> > >>>>> same vCPU address space, perhaps with the exception of when they're
> > >>>>> being sized, but DMA should be disabled during sizing.
> > >>>>> 
> > >>>>> Devices within the same VM context with identical BARs would need to
> > >>>>> operate in different address spaces.  For example a translation offset
> > >>>>> in the vCPU address space would allow unique addressing to the devices,
> > >>>>> perhaps using the translation offset bits to address a root complex and
> > >>>>> masking those bits for downstream transactions.
> > >>>>> 
> > >>>>> In general, the device simply operates in an address space, ie. an
> > >>>>> IOVA.  When a mapping is made within that address space, we perform a
> > >>>>> translation as necessary to generate a guest physical address.  The
> > >>>>> IOVA itself is only meaningful within the context of the address space,
> > >>>>> there is no requirement or expectation for it to be globally unique.
> > >>>>> 
> > >>>>> If the vfio-user server is making some sort of requirement that IOVAs
> > >>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
> > >>>> 
> > >>>> Yes, BARs and IOVAs don't need to be unique across all devices.
> > >>>> 
> > >>>> The issue is that there can be as many guest physical address spaces as
> > >>>> there are vfio-user clients connected, so per-client isolated address
> > >>>> spaces are required. This patch series has a solution to that problem
> > >>>> with the new pci_isol_as_mem/io() API.    
> > >>> 
> > >>> Sorry, this still doesn't follow for me.  A server that hosts multiple
> > >>> devices across many VMs (I'm not sure if you're referring to the device
> > >>> or the VM as a client) needs to deal with different address spaces per
> > >>> device.  The server needs to be able to uniquely identify every DMA,
> > >>> which must be part of the interface protocol.  But I don't see how that
> > >>> imposes a requirement of an isolated address space.  If we want the
> > >>> device isolated because we don't trust the server, that's where an IOMMU
> > >>> provides per device isolation.  What is the restriction of the
> > >>> per-client isolated address space and why do we need it?  The server
> > >>> needing to support multiple clients is not a sufficient answer to
> > >>> impose new PCI bus types with an implicit restriction on the VM.    
> > >> 
> > >> Hi Alex,
> > >> 
> > >> I believe there are two separate problems with running PCI devices in
> > >> the vfio-user server. The first one is concerning memory isolation and
> > >> second one is vectoring of BAR accesses (as explained below).
> > >> 
> > >> In our previous patches (v3), we used an IOMMU to isolate memory
> > >> spaces. But we still had trouble with the vectoring. So we implemented
> > >> separate address spaces for each PCIBus to tackle both problems
> > >> simultaneously, based on the feedback we got.
> > >> 
> > >> The following gives an overview of issues concerning vectoring of
> > >> BAR accesses.
> > >> 
> > >> The device’s BAR regions are mapped into the guest physical address
> > >> space. The guest writes the guest PA of each BAR into the device’s BAR
> > >> registers. To access the BAR regions of the device, QEMU uses
> > >> address_space_rw() which vectors the physical address access to the
> > >> device BAR region handlers.  
> > > 
> > > The guest physical address written to the BAR is irrelevant from the
> > > device perspective, this only serves to assign the BAR an offset within
> > > the address_space_mem, which is used by the vCPU (and possibly other
> > > devices depending on their address space).  There is no reason for the
> > > device itself to care about this address.  
> > 
> > Thank you for the explanation, Alex!
> > 
> > The confusion at my part is whether we are inside the device already when
> > the server receives a request to access BAR region of a device. Based on
> > your explanation, I get that your view is the BAR access request has
> > propagated into the device already, whereas I was under the impression
> > that the request is still on the CPU side of the PCI root complex.
> 
> If you are getting an access through your MemoryRegionOps, all the
> translations have been made, you simply need to use the hwaddr as the
> offset into the MemoryRegion for the access.  Perform the read/write to
> your device, no further translations required.

The access comes via libvfio-user's vfu_region_access_cb_t callback, not
via MemoryRegionOps. The callback is currently implemented by calling
address_space_rw() on the pci_isol_as_mem/io() address space, depending
on the BAR type. The code in "[PATCH v5 15/18] vfio-user: handle PCI BAR
accesses".

It's possible to reimplement the patch to directly call
memory_region_dispatch_read/write() on r->memory instead of
address_space_rw() as you've described.

> > Your view makes sense to me - once the BAR access request reaches the
> > client (on the other side), we could consider that the request has reached
> > the device.
> > 
> > On a separate note, if devices don’t care about the values in BAR
> > registers, why do the default PCI config handlers intercept and map
> > the BAR region into address_space_mem?
> > (pci_default_write_config() -> pci_update_mappings())
> 
> This is the part that's actually placing the BAR MemoryRegion as a
> sub-region into the vCPU address space.  I think if you track it,
> you'll see PCIDevice.io_regions[i].address_space is actually
> system_memory, which is used to initialize address_space_system.
> 
> The machine assembles PCI devices onto buses as instructed by the
> command line or hot plug operations.  It's the responsibility of the
> guest firmware and guest OS to probe those devices, size the BARs, and
> place the BARs into the memory hierarchy of the PCI bus, ie. system
> memory.  The BARs are necessarily in the "guest physical memory" for
> vCPU access, but it's essentially only coincidental that PCI devices
> might be in an address space that provides a mapping to their own BAR.
> There's no reason to ever use it.

Good, I think nothing uses address_space_system/io when BAR dispatch is
implemented with memory_region_dispatch_read/write() as you suggested.

It would be nice if there was a way to poison address_space_system/io to
abort on dispatch - nothing should use them.

We now have the option of dropping pci_isol_as_mem/io() again and using
->iommu_fn() to return an isolated memory address space containing the
vfio-user client's VFIO_USER_DMA_MAP regions.

Stefan
Peter Maydell Feb. 2, 2022, 9:30 a.m. UTC | #27
On Tue, 1 Feb 2022 at 23:51, Alex Williamson <alex.williamson@redhat.com> wrote:
>
> On Tue, 1 Feb 2022 21:24:08 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> > The PCIBus data structure already has address_space_mem and
> > address_space_io to contain the BAR regions of devices attached
> > to it. I understand that these two PCIBus members form the
> > PCI address space.
>
> These are the CPU address spaces.  When there's no IOMMU, the PCI bus is
> identity mapped to the CPU address space.  When there is an IOMMU, the
> device address space is determined by the granularity of the IOMMU and
> may be entirely separate from address_space_mem.

Note that those fields in PCIBus are just whatever MemoryRegions
the pci controller model passed in to the call to pci_root_bus_init()
or equivalent. They may or may not be specifically the CPU's view
of anything. (For instance on the versatilepb board, the PCI controller
is visible to the CPU via several MMIO "windows" at known addresses,
which let the CPU access into the PCI address space at a programmable
offset. We model that by creating a couple of container MRs which
we pass to pci_root_bus_init() to be the PCI memory and IO spaces,
and then using alias MRs to provide the view into those at the
guest-programmed offset. The CPU sees those windows, and doesn't
have direct access to the whole PCIBus::address_space_mem.)
I guess you could say they're the PCI controller's view of the PCI
address space ?

We have a tendency to be a bit sloppy with use of AddressSpaces
within QEMU where it happens that the view of the world that a
DMA-capable device matches that of the CPU, but conceptually
they can definitely be different, especially in the non-x86 world.
(Linux also confuses matters here by preferring to program a 1:1
mapping even if the hardware is more flexible and can do other things.
The model of the h/w in QEMU should support the other cases too, not
just 1:1.)

> I/O port space is always the identity mapped CPU address space unless
> sparse translations are used to create multiple I/O port spaces (not
> implemented).  I/O port space is only accessed by the CPU, there are no
> device initiated I/O port transactions, so the address space relative
> to the device is irrelevant.

Does the PCI spec actually forbid any master except the CPU from
issuing I/O port transactions, or is it just that in practice nobody
makes a PCI device that does weird stuff like that ?

thanks
-- PMM
Michael S. Tsirkin Feb. 2, 2022, 10:06 a.m. UTC | #28
On Wed, Feb 02, 2022 at 09:30:42AM +0000, Peter Maydell wrote:
> > I/O port space is always the identity mapped CPU address space unless
> > sparse translations are used to create multiple I/O port spaces (not
> > implemented).  I/O port space is only accessed by the CPU, there are no
> > device initiated I/O port transactions, so the address space relative
> > to the device is irrelevant.
> 
> Does the PCI spec actually forbid any master except the CPU from
> issuing I/O port transactions, or is it just that in practice nobody
> makes a PCI device that does weird stuff like that ?
> 
> thanks
> -- PMM

Hmm, the only thing vaguely related in the spec that I know of is this:

	PCI Express supports I/O Space for compatibility with legacy devices which require their use.
	Future revisions of this specification may deprecate the use of I/O Space.

Alex, what did you refer to?
Alex Williamson Feb. 2, 2022, 3:49 p.m. UTC | #29
On Wed, 2 Feb 2022 05:06:49 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Feb 02, 2022 at 09:30:42AM +0000, Peter Maydell wrote:
> > > I/O port space is always the identity mapped CPU address space unless
> > > sparse translations are used to create multiple I/O port spaces (not
> > > implemented).  I/O port space is only accessed by the CPU, there are no
> > > device initiated I/O port transactions, so the address space relative
> > > to the device is irrelevant.  
> > 
> > Does the PCI spec actually forbid any master except the CPU from
> > issuing I/O port transactions, or is it just that in practice nobody
> > makes a PCI device that does weird stuff like that ?
> > 
> > thanks
> > -- PMM  
> 
> Hmm, the only thing vaguely related in the spec that I know of is this:
> 
> 	PCI Express supports I/O Space for compatibility with legacy devices which require their use.
> 	Future revisions of this specification may deprecate the use of I/O Space.
> 
> Alex, what did you refer to?

My evidence is largely by omission, but that might be that in practice
it's not used rather than explicitly forbidden.  I note that the bus
master enable bit specifies:

	Bus Master Enable - Controls the ability of a Function to issue
		Memory and I/O Read/Write Requests, and the ability of
		a Port to forward Memory and I/O Read/Write Requests in
		the Upstream direction.

That would suggest it's possible, but for PCI device assignment, I'm
not aware of any means through which we could support this.  There is
no support in the IOMMU core for mapping I/O port space, nor could we
trap such device initiated transactions to emulate them.  I can't spot
any mention of I/O port space in the VT-d spec, however the AMD-Vi spec
does include a field in the device table:

	controlIoCtl: port I/O control. Specifies whether
	device-initiated port I/O space transactions are blocked,
	forwarded, or translated.

	00b=Device-initiated port I/O is not allowed. The IOMMU target
	aborts the transaction if a port I/O space transaction is
	received. Translation requests are target aborted.
	
	01b=Device-initiated port I/O space transactions are allowed.
	The IOMMU must pass port I/O accesses untranslated. Translation
	requests are target aborted.
	
	10b=Transactions in the port I/O space address range are
	translated by the IOMMU page tables as memory transactions.

	11b=Reserved.

I don't see this field among the macros used by the Linux driver in
configuring these device entries, so I assume it's left to the default
value, ie. zero, blocking device initiated I/O port transactions.

So yes, I suppose device initiated I/O port transactions are possible,
but we have no support or reason to support them, so I'm going to go
ahead and continue believing any I/O port address space from the device
perspective is largely irrelevant ;)  Thanks,

Alex
Michael S. Tsirkin Feb. 2, 2022, 4:53 p.m. UTC | #30
On Wed, Feb 02, 2022 at 08:49:33AM -0700, Alex Williamson wrote:
> > Alex, what did you refer to?
> 
> My evidence is largely by omission, but that might be that in practice
> it's not used rather than explicitly forbidden.  I note that the bus
> master enable bit specifies:
> 
> 	Bus Master Enable - Controls the ability of a Function to issue
> 		Memory and I/O Read/Write Requests, and the ability of
> 		a Port to forward Memory and I/O Read/Write Requests in
> 		the Upstream direction.
> 
> That would suggest it's possible, but for PCI device assignment, I'm
> not aware of any means through which we could support this.  There is
> no support in the IOMMU core for mapping I/O port space, nor could we
> trap such device initiated transactions to emulate them.  I can't spot
> any mention of I/O port space in the VT-d spec, however the AMD-Vi spec
> does include a field in the device table:
> 
> 	controlIoCtl: port I/O control. Specifies whether
> 	device-initiated port I/O space transactions are blocked,
> 	forwarded, or translated.
> 
> 	00b=Device-initiated port I/O is not allowed. The IOMMU target
> 	aborts the transaction if a port I/O space transaction is
> 	received. Translation requests are target aborted.
> 	
> 	01b=Device-initiated port I/O space transactions are allowed.
> 	The IOMMU must pass port I/O accesses untranslated. Translation
> 	requests are target aborted.
> 	
> 	10b=Transactions in the port I/O space address range are
> 	translated by the IOMMU page tables as memory transactions.
> 
> 	11b=Reserved.
> 
> I don't see this field among the macros used by the Linux driver in
> configuring these device entries, so I assume it's left to the default
> value, ie. zero, blocking device initiated I/O port transactions.
> 
> So yes, I suppose device initiated I/O port transactions are possible,
> but we have no support or reason to support them, so I'm going to go
> ahead and continue believing any I/O port address space from the device
> perspective is largely irrelevant ;)  Thanks,
> 
> Alex

Right, it would seem devices can initiate I/O space transactions but IOMMUs
don't support virtualizing them and so neither does VFIO.
Alex Williamson Feb. 2, 2022, 5:12 p.m. UTC | #31
On Wed, 2 Feb 2022 09:30:42 +0000
Peter Maydell <peter.maydell@linaro.org> wrote:

> On Tue, 1 Feb 2022 at 23:51, Alex Williamson <alex.williamson@redhat.com> wrote:
> >
> > On Tue, 1 Feb 2022 21:24:08 +0000
> > Jag Raman <jag.raman@oracle.com> wrote:  
> > > The PCIBus data structure already has address_space_mem and
> > > address_space_io to contain the BAR regions of devices attached
> > > to it. I understand that these two PCIBus members form the
> > > PCI address space.  
> >
> > These are the CPU address spaces.  When there's no IOMMU, the PCI bus is
> > identity mapped to the CPU address space.  When there is an IOMMU, the
> > device address space is determined by the granularity of the IOMMU and
> > may be entirely separate from address_space_mem.  
> 
> Note that those fields in PCIBus are just whatever MemoryRegions
> the pci controller model passed in to the call to pci_root_bus_init()
> or equivalent. They may or may not be specifically the CPU's view
> of anything. (For instance on the versatilepb board, the PCI controller
> is visible to the CPU via several MMIO "windows" at known addresses,
> which let the CPU access into the PCI address space at a programmable
> offset. We model that by creating a couple of container MRs which
> we pass to pci_root_bus_init() to be the PCI memory and IO spaces,
> and then using alias MRs to provide the view into those at the
> guest-programmed offset. The CPU sees those windows, and doesn't
> have direct access to the whole PCIBus::address_space_mem.)
> I guess you could say they're the PCI controller's view of the PCI
> address space ?

Sure, that's fair.

> We have a tendency to be a bit sloppy with use of AddressSpaces
> within QEMU where it happens that the view of the world that a
> DMA-capable device matches that of the CPU, but conceptually
> they can definitely be different, especially in the non-x86 world.
> (Linux also confuses matters here by preferring to program a 1:1
> mapping even if the hardware is more flexible and can do other things.
> The model of the h/w in QEMU should support the other cases too, not
> just 1:1.)

Right, this is why I prefer to look at the device address space as
simply an IOVA.  The IOVA might be a direct physical address or
coincidental identity mapped physical address via an IOMMU, but none of
that should be the concern of the device.
 
> > I/O port space is always the identity mapped CPU address space unless
> > sparse translations are used to create multiple I/O port spaces (not
> > implemented).  I/O port space is only accessed by the CPU, there are no
> > device initiated I/O port transactions, so the address space relative
> > to the device is irrelevant.  
> 
> Does the PCI spec actually forbid any master except the CPU from
> issuing I/O port transactions, or is it just that in practice nobody
> makes a PCI device that does weird stuff like that ?

As realized in reply to MST, more the latter.  Not used, no point to
enabling, no means to enable depending on the physical IOMMU
implementation.  Thanks,

Alex
Jag Raman Feb. 10, 2022, 12:08 a.m. UTC | #32
> On Feb 2, 2022, at 12:34 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Wed, 2 Feb 2022 01:13:22 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> 
>>> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>> 
>>> On Tue, 1 Feb 2022 21:24:08 +0000
>>> Jag Raman <jag.raman@oracle.com> wrote:
>>> 
>>>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>> 
>>>>> On Tue, 1 Feb 2022 09:30:35 +0000
>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>> 
>>>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
>>>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>> 
>>>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
>>>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
>>>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
>>>>>>>> 
>>>>>>>> The issue Dave raised is that vfio-user servers run in separate
>>>>>>>> processses from QEMU with shared memory access to RAM but no direct
>>>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
>>>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
>>>>>>>> requests.
>>>>>>>> 
>>>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
>>>>>>>> protocol already has messages that vfio-user servers can use as a
>>>>>>>> fallback when DMA cannot be completed through the shared memory RAM
>>>>>>>> accesses.
>>>>>>>> 
>>>>>>>>> In
>>>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
>>>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
>>>>>>>>> physical hardware properties or specifications could we leverage to
>>>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
>>>>>>>>> type to provide consistency between devices?  Should each "isolated"
>>>>>>>>> bus be in a separate root complex?  Thanks,        
>>>>>>>> 
>>>>>>>> There is a separate issue in this patch series regarding isolating the
>>>>>>>> address space where BAR accesses are made (i.e. the global
>>>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
>>>>>>>> server instances (e.g. a software-defined network switch with multiple
>>>>>>>> ethernet devices) then each instance needs isolated memory and io address
>>>>>>>> spaces so that vfio-user clients don't cause collisions when they map
>>>>>>>> BARs to the same address.
>>>>>>>> 
>>>>>>>> I think the the separate root complex idea is a good solution. This
>>>>>>>> patch series takes a different approach by adding the concept of
>>>>>>>> isolated address spaces into hw/pci/.      
>>>>>>> 
>>>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
>>>>>>> same vCPU address space, perhaps with the exception of when they're
>>>>>>> being sized, but DMA should be disabled during sizing.
>>>>>>> 
>>>>>>> Devices within the same VM context with identical BARs would need to
>>>>>>> operate in different address spaces.  For example a translation offset
>>>>>>> in the vCPU address space would allow unique addressing to the devices,
>>>>>>> perhaps using the translation offset bits to address a root complex and
>>>>>>> masking those bits for downstream transactions.
>>>>>>> 
>>>>>>> In general, the device simply operates in an address space, ie. an
>>>>>>> IOVA.  When a mapping is made within that address space, we perform a
>>>>>>> translation as necessary to generate a guest physical address.  The
>>>>>>> IOVA itself is only meaningful within the context of the address space,
>>>>>>> there is no requirement or expectation for it to be globally unique.
>>>>>>> 
>>>>>>> If the vfio-user server is making some sort of requirement that IOVAs
>>>>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
>>>>>> 
>>>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
>>>>>> 
>>>>>> The issue is that there can be as many guest physical address spaces as
>>>>>> there are vfio-user clients connected, so per-client isolated address
>>>>>> spaces are required. This patch series has a solution to that problem
>>>>>> with the new pci_isol_as_mem/io() API.    
>>>>> 
>>>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
>>>>> devices across many VMs (I'm not sure if you're referring to the device
>>>>> or the VM as a client) needs to deal with different address spaces per
>>>>> device.  The server needs to be able to uniquely identify every DMA,
>>>>> which must be part of the interface protocol.  But I don't see how that
>>>>> imposes a requirement of an isolated address space.  If we want the
>>>>> device isolated because we don't trust the server, that's where an IOMMU
>>>>> provides per device isolation.  What is the restriction of the
>>>>> per-client isolated address space and why do we need it?  The server
>>>>> needing to support multiple clients is not a sufficient answer to
>>>>> impose new PCI bus types with an implicit restriction on the VM.    
>>>> 
>>>> Hi Alex,
>>>> 
>>>> I believe there are two separate problems with running PCI devices in
>>>> the vfio-user server. The first one is concerning memory isolation and
>>>> second one is vectoring of BAR accesses (as explained below).
>>>> 
>>>> In our previous patches (v3), we used an IOMMU to isolate memory
>>>> spaces. But we still had trouble with the vectoring. So we implemented
>>>> separate address spaces for each PCIBus to tackle both problems
>>>> simultaneously, based on the feedback we got.
>>>> 
>>>> The following gives an overview of issues concerning vectoring of
>>>> BAR accesses.
>>>> 
>>>> The device’s BAR regions are mapped into the guest physical address
>>>> space. The guest writes the guest PA of each BAR into the device’s BAR
>>>> registers. To access the BAR regions of the device, QEMU uses
>>>> address_space_rw() which vectors the physical address access to the
>>>> device BAR region handlers.  
>>> 
>>> The guest physical address written to the BAR is irrelevant from the
>>> device perspective, this only serves to assign the BAR an offset within
>>> the address_space_mem, which is used by the vCPU (and possibly other
>>> devices depending on their address space).  There is no reason for the
>>> device itself to care about this address.  
>> 
>> Thank you for the explanation, Alex!
>> 
>> The confusion at my part is whether we are inside the device already when
>> the server receives a request to access BAR region of a device. Based on
>> your explanation, I get that your view is the BAR access request has
>> propagated into the device already, whereas I was under the impression
>> that the request is still on the CPU side of the PCI root complex.
> 
> If you are getting an access through your MemoryRegionOps, all the
> translations have been made, you simply need to use the hwaddr as the
> offset into the MemoryRegion for the access.  Perform the read/write to
> your device, no further translations required.
> 
>> Your view makes sense to me - once the BAR access request reaches the
>> client (on the other side), we could consider that the request has reached
>> the device.
>> 
>> On a separate note, if devices don’t care about the values in BAR
>> registers, why do the default PCI config handlers intercept and map
>> the BAR region into address_space_mem?
>> (pci_default_write_config() -> pci_update_mappings())
> 
> This is the part that's actually placing the BAR MemoryRegion as a
> sub-region into the vCPU address space.  I think if you track it,
> you'll see PCIDevice.io_regions[i].address_space is actually
> system_memory, which is used to initialize address_space_system.
> 
> The machine assembles PCI devices onto buses as instructed by the
> command line or hot plug operations.  It's the responsibility of the
> guest firmware and guest OS to probe those devices, size the BARs, and
> place the BARs into the memory hierarchy of the PCI bus, ie. system
> memory.  The BARs are necessarily in the "guest physical memory" for
> vCPU access, but it's essentially only coincidental that PCI devices
> might be in an address space that provides a mapping to their own BAR.
> There's no reason to ever use it.
> 
> In the vIOMMU case, we can't know that the device address space
> includes those BAR mappings or if they do, that they're identity mapped
> to the physical address.  Devices really need to not infer anything
> about an address.  Think about real hardware, a device is told by
> driver programming to perform a DMA operation.  The device doesn't know
> the target of that operation, it's the guest driver's responsibility to
> make sure the IOVA within the device address space is valid and maps to
> the desired target.  Thanks,

Thanks for the explanation, Alex. Thanks to everyone else in the thread who
helped to clarify this problem.

We have implemented the memory isolation based on the discussion in the
thread. We will send the patches out shortly.

Devices such as “name" and “e1000” worked fine. But I’d like to note that
the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
which is forbidden when IOMMU is enabled. Specifically, the driver is asking
the device to access other BAR regions by using the BAR address programmed
in the PCI config space. This happens even without vfio-user patches. For example,
we could enable IOMMU using “-device intel-iommu” QEMU option and also
adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
In this case, we could see an IOMMU fault.

Unfortunately, we started off our project with the LSI device. So that lead to all the
confusion about what is expected at the server end in-terms of
vectoring/address-translation. It gave an impression as if the request was still on
the CPU side of the PCI root complex, but the actual problem was with the
device driver itself.

I’m wondering how to deal with this problem. Would it be OK if we mapped the
device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
This would help devices such as LSI to circumvent this problem. One problem
with this approach is that it has the potential to collide with another legitimate
IOVA address. Kindly share your thought on this.

Thank you!
--
Jag

> 
> Alex
>
Michael S. Tsirkin Feb. 10, 2022, 8:02 a.m. UTC | #33
On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:
> 
> 
> > On Feb 2, 2022, at 12:34 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > On Wed, 2 Feb 2022 01:13:22 +0000
> > Jag Raman <jag.raman@oracle.com> wrote:
> > 
> >>> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>> 
> >>> On Tue, 1 Feb 2022 21:24:08 +0000
> >>> Jag Raman <jag.raman@oracle.com> wrote:
> >>> 
> >>>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>> 
> >>>>> On Tue, 1 Feb 2022 09:30:35 +0000
> >>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>> 
> >>>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
> >>>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
> >>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>>>> 
> >>>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
> >>>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> >>>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
> >>>>>>>> 
> >>>>>>>> The issue Dave raised is that vfio-user servers run in separate
> >>>>>>>> processses from QEMU with shared memory access to RAM but no direct
> >>>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> >>>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
> >>>>>>>> requests.
> >>>>>>>> 
> >>>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
> >>>>>>>> protocol already has messages that vfio-user servers can use as a
> >>>>>>>> fallback when DMA cannot be completed through the shared memory RAM
> >>>>>>>> accesses.
> >>>>>>>> 
> >>>>>>>>> In
> >>>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
> >>>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
> >>>>>>>>> physical hardware properties or specifications could we leverage to
> >>>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
> >>>>>>>>> type to provide consistency between devices?  Should each "isolated"
> >>>>>>>>> bus be in a separate root complex?  Thanks,        
> >>>>>>>> 
> >>>>>>>> There is a separate issue in this patch series regarding isolating the
> >>>>>>>> address space where BAR accesses are made (i.e. the global
> >>>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
> >>>>>>>> server instances (e.g. a software-defined network switch with multiple
> >>>>>>>> ethernet devices) then each instance needs isolated memory and io address
> >>>>>>>> spaces so that vfio-user clients don't cause collisions when they map
> >>>>>>>> BARs to the same address.
> >>>>>>>> 
> >>>>>>>> I think the the separate root complex idea is a good solution. This
> >>>>>>>> patch series takes a different approach by adding the concept of
> >>>>>>>> isolated address spaces into hw/pci/.      
> >>>>>>> 
> >>>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
> >>>>>>> same vCPU address space, perhaps with the exception of when they're
> >>>>>>> being sized, but DMA should be disabled during sizing.
> >>>>>>> 
> >>>>>>> Devices within the same VM context with identical BARs would need to
> >>>>>>> operate in different address spaces.  For example a translation offset
> >>>>>>> in the vCPU address space would allow unique addressing to the devices,
> >>>>>>> perhaps using the translation offset bits to address a root complex and
> >>>>>>> masking those bits for downstream transactions.
> >>>>>>> 
> >>>>>>> In general, the device simply operates in an address space, ie. an
> >>>>>>> IOVA.  When a mapping is made within that address space, we perform a
> >>>>>>> translation as necessary to generate a guest physical address.  The
> >>>>>>> IOVA itself is only meaningful within the context of the address space,
> >>>>>>> there is no requirement or expectation for it to be globally unique.
> >>>>>>> 
> >>>>>>> If the vfio-user server is making some sort of requirement that IOVAs
> >>>>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
> >>>>>> 
> >>>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
> >>>>>> 
> >>>>>> The issue is that there can be as many guest physical address spaces as
> >>>>>> there are vfio-user clients connected, so per-client isolated address
> >>>>>> spaces are required. This patch series has a solution to that problem
> >>>>>> with the new pci_isol_as_mem/io() API.    
> >>>>> 
> >>>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
> >>>>> devices across many VMs (I'm not sure if you're referring to the device
> >>>>> or the VM as a client) needs to deal with different address spaces per
> >>>>> device.  The server needs to be able to uniquely identify every DMA,
> >>>>> which must be part of the interface protocol.  But I don't see how that
> >>>>> imposes a requirement of an isolated address space.  If we want the
> >>>>> device isolated because we don't trust the server, that's where an IOMMU
> >>>>> provides per device isolation.  What is the restriction of the
> >>>>> per-client isolated address space and why do we need it?  The server
> >>>>> needing to support multiple clients is not a sufficient answer to
> >>>>> impose new PCI bus types with an implicit restriction on the VM.    
> >>>> 
> >>>> Hi Alex,
> >>>> 
> >>>> I believe there are two separate problems with running PCI devices in
> >>>> the vfio-user server. The first one is concerning memory isolation and
> >>>> second one is vectoring of BAR accesses (as explained below).
> >>>> 
> >>>> In our previous patches (v3), we used an IOMMU to isolate memory
> >>>> spaces. But we still had trouble with the vectoring. So we implemented
> >>>> separate address spaces for each PCIBus to tackle both problems
> >>>> simultaneously, based on the feedback we got.
> >>>> 
> >>>> The following gives an overview of issues concerning vectoring of
> >>>> BAR accesses.
> >>>> 
> >>>> The device’s BAR regions are mapped into the guest physical address
> >>>> space. The guest writes the guest PA of each BAR into the device’s BAR
> >>>> registers. To access the BAR regions of the device, QEMU uses
> >>>> address_space_rw() which vectors the physical address access to the
> >>>> device BAR region handlers.  
> >>> 
> >>> The guest physical address written to the BAR is irrelevant from the
> >>> device perspective, this only serves to assign the BAR an offset within
> >>> the address_space_mem, which is used by the vCPU (and possibly other
> >>> devices depending on their address space).  There is no reason for the
> >>> device itself to care about this address.  
> >> 
> >> Thank you for the explanation, Alex!
> >> 
> >> The confusion at my part is whether we are inside the device already when
> >> the server receives a request to access BAR region of a device. Based on
> >> your explanation, I get that your view is the BAR access request has
> >> propagated into the device already, whereas I was under the impression
> >> that the request is still on the CPU side of the PCI root complex.
> > 
> > If you are getting an access through your MemoryRegionOps, all the
> > translations have been made, you simply need to use the hwaddr as the
> > offset into the MemoryRegion for the access.  Perform the read/write to
> > your device, no further translations required.
> > 
> >> Your view makes sense to me - once the BAR access request reaches the
> >> client (on the other side), we could consider that the request has reached
> >> the device.
> >> 
> >> On a separate note, if devices don’t care about the values in BAR
> >> registers, why do the default PCI config handlers intercept and map
> >> the BAR region into address_space_mem?
> >> (pci_default_write_config() -> pci_update_mappings())
> > 
> > This is the part that's actually placing the BAR MemoryRegion as a
> > sub-region into the vCPU address space.  I think if you track it,
> > you'll see PCIDevice.io_regions[i].address_space is actually
> > system_memory, which is used to initialize address_space_system.
> > 
> > The machine assembles PCI devices onto buses as instructed by the
> > command line or hot plug operations.  It's the responsibility of the
> > guest firmware and guest OS to probe those devices, size the BARs, and
> > place the BARs into the memory hierarchy of the PCI bus, ie. system
> > memory.  The BARs are necessarily in the "guest physical memory" for
> > vCPU access, but it's essentially only coincidental that PCI devices
> > might be in an address space that provides a mapping to their own BAR.
> > There's no reason to ever use it.
> > 
> > In the vIOMMU case, we can't know that the device address space
> > includes those BAR mappings or if they do, that they're identity mapped
> > to the physical address.  Devices really need to not infer anything
> > about an address.  Think about real hardware, a device is told by
> > driver programming to perform a DMA operation.  The device doesn't know
> > the target of that operation, it's the guest driver's responsibility to
> > make sure the IOVA within the device address space is valid and maps to
> > the desired target.  Thanks,
> 
> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> helped to clarify this problem.
> 
> We have implemented the memory isolation based on the discussion in the
> thread. We will send the patches out shortly.
> 
> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> the device to access other BAR regions by using the BAR address programmed
> in the PCI config space. This happens even without vfio-user patches. For example,
> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> In this case, we could see an IOMMU fault.

So, device accessing its own BAR is different. Basically, these
transactions never go on the bus at all, never mind get to the IOMMU.
I think it's just used as a handle to address internal device memory.
This kind of trick is not universal, but not terribly unusual.


> Unfortunately, we started off our project with the LSI device. So that lead to all the
> confusion about what is expected at the server end in-terms of
> vectoring/address-translation. It gave an impression as if the request was still on
> the CPU side of the PCI root complex, but the actual problem was with the
> device driver itself.
> 
> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> This would help devices such as LSI to circumvent this problem. One problem
> with this approach is that it has the potential to collide with another legitimate
> IOVA address. Kindly share your thought on this.
> 
> Thank you!

I am not 100% sure what do you plan to do but it sounds fine since even
if it collides, with traditional PCI device must never initiate cycles
within their own BAR range, and PCIe is software-compatible with PCI. So
devices won't be able to access this IOVA even if it was programmed in
the IOMMU.

As was mentioned elsewhere on this thread, devices accessing each
other's BAR is a different matter.

I do not remember which rules apply to multiple functions of a
multi-function device though. I think in a traditional PCI
they will never go out on the bus, but with e.g. SRIOV they
would probably do go out? Alex, any idea?


> --
> Jag
> 
> > 
> > Alex
> > 
>
Jag Raman Feb. 10, 2022, 10:23 p.m. UTC | #34
> On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:
>> 
>> 
>>> On Feb 2, 2022, at 12:34 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>> 
>>> On Wed, 2 Feb 2022 01:13:22 +0000
>>> Jag Raman <jag.raman@oracle.com> wrote:
>>> 
>>>>> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>> 
>>>>> On Tue, 1 Feb 2022 21:24:08 +0000
>>>>> Jag Raman <jag.raman@oracle.com> wrote:
>>>>> 
>>>>>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>>> 
>>>>>>> On Tue, 1 Feb 2022 09:30:35 +0000
>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>> 
>>>>>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
>>>>>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
>>>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>>>> 
>>>>>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
>>>>>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
>>>>>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
>>>>>>>>>> 
>>>>>>>>>> The issue Dave raised is that vfio-user servers run in separate
>>>>>>>>>> processses from QEMU with shared memory access to RAM but no direct
>>>>>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
>>>>>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
>>>>>>>>>> requests.
>>>>>>>>>> 
>>>>>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
>>>>>>>>>> protocol already has messages that vfio-user servers can use as a
>>>>>>>>>> fallback when DMA cannot be completed through the shared memory RAM
>>>>>>>>>> accesses.
>>>>>>>>>> 
>>>>>>>>>>> In
>>>>>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
>>>>>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
>>>>>>>>>>> physical hardware properties or specifications could we leverage to
>>>>>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
>>>>>>>>>>> type to provide consistency between devices?  Should each "isolated"
>>>>>>>>>>> bus be in a separate root complex?  Thanks,        
>>>>>>>>>> 
>>>>>>>>>> There is a separate issue in this patch series regarding isolating the
>>>>>>>>>> address space where BAR accesses are made (i.e. the global
>>>>>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
>>>>>>>>>> server instances (e.g. a software-defined network switch with multiple
>>>>>>>>>> ethernet devices) then each instance needs isolated memory and io address
>>>>>>>>>> spaces so that vfio-user clients don't cause collisions when they map
>>>>>>>>>> BARs to the same address.
>>>>>>>>>> 
>>>>>>>>>> I think the the separate root complex idea is a good solution. This
>>>>>>>>>> patch series takes a different approach by adding the concept of
>>>>>>>>>> isolated address spaces into hw/pci/.      
>>>>>>>>> 
>>>>>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
>>>>>>>>> same vCPU address space, perhaps with the exception of when they're
>>>>>>>>> being sized, but DMA should be disabled during sizing.
>>>>>>>>> 
>>>>>>>>> Devices within the same VM context with identical BARs would need to
>>>>>>>>> operate in different address spaces.  For example a translation offset
>>>>>>>>> in the vCPU address space would allow unique addressing to the devices,
>>>>>>>>> perhaps using the translation offset bits to address a root complex and
>>>>>>>>> masking those bits for downstream transactions.
>>>>>>>>> 
>>>>>>>>> In general, the device simply operates in an address space, ie. an
>>>>>>>>> IOVA.  When a mapping is made within that address space, we perform a
>>>>>>>>> translation as necessary to generate a guest physical address.  The
>>>>>>>>> IOVA itself is only meaningful within the context of the address space,
>>>>>>>>> there is no requirement or expectation for it to be globally unique.
>>>>>>>>> 
>>>>>>>>> If the vfio-user server is making some sort of requirement that IOVAs
>>>>>>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
>>>>>>>> 
>>>>>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
>>>>>>>> 
>>>>>>>> The issue is that there can be as many guest physical address spaces as
>>>>>>>> there are vfio-user clients connected, so per-client isolated address
>>>>>>>> spaces are required. This patch series has a solution to that problem
>>>>>>>> with the new pci_isol_as_mem/io() API.    
>>>>>>> 
>>>>>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
>>>>>>> devices across many VMs (I'm not sure if you're referring to the device
>>>>>>> or the VM as a client) needs to deal with different address spaces per
>>>>>>> device.  The server needs to be able to uniquely identify every DMA,
>>>>>>> which must be part of the interface protocol.  But I don't see how that
>>>>>>> imposes a requirement of an isolated address space.  If we want the
>>>>>>> device isolated because we don't trust the server, that's where an IOMMU
>>>>>>> provides per device isolation.  What is the restriction of the
>>>>>>> per-client isolated address space and why do we need it?  The server
>>>>>>> needing to support multiple clients is not a sufficient answer to
>>>>>>> impose new PCI bus types with an implicit restriction on the VM.    
>>>>>> 
>>>>>> Hi Alex,
>>>>>> 
>>>>>> I believe there are two separate problems with running PCI devices in
>>>>>> the vfio-user server. The first one is concerning memory isolation and
>>>>>> second one is vectoring of BAR accesses (as explained below).
>>>>>> 
>>>>>> In our previous patches (v3), we used an IOMMU to isolate memory
>>>>>> spaces. But we still had trouble with the vectoring. So we implemented
>>>>>> separate address spaces for each PCIBus to tackle both problems
>>>>>> simultaneously, based on the feedback we got.
>>>>>> 
>>>>>> The following gives an overview of issues concerning vectoring of
>>>>>> BAR accesses.
>>>>>> 
>>>>>> The device’s BAR regions are mapped into the guest physical address
>>>>>> space. The guest writes the guest PA of each BAR into the device’s BAR
>>>>>> registers. To access the BAR regions of the device, QEMU uses
>>>>>> address_space_rw() which vectors the physical address access to the
>>>>>> device BAR region handlers.  
>>>>> 
>>>>> The guest physical address written to the BAR is irrelevant from the
>>>>> device perspective, this only serves to assign the BAR an offset within
>>>>> the address_space_mem, which is used by the vCPU (and possibly other
>>>>> devices depending on their address space).  There is no reason for the
>>>>> device itself to care about this address.  
>>>> 
>>>> Thank you for the explanation, Alex!
>>>> 
>>>> The confusion at my part is whether we are inside the device already when
>>>> the server receives a request to access BAR region of a device. Based on
>>>> your explanation, I get that your view is the BAR access request has
>>>> propagated into the device already, whereas I was under the impression
>>>> that the request is still on the CPU side of the PCI root complex.
>>> 
>>> If you are getting an access through your MemoryRegionOps, all the
>>> translations have been made, you simply need to use the hwaddr as the
>>> offset into the MemoryRegion for the access.  Perform the read/write to
>>> your device, no further translations required.
>>> 
>>>> Your view makes sense to me - once the BAR access request reaches the
>>>> client (on the other side), we could consider that the request has reached
>>>> the device.
>>>> 
>>>> On a separate note, if devices don’t care about the values in BAR
>>>> registers, why do the default PCI config handlers intercept and map
>>>> the BAR region into address_space_mem?
>>>> (pci_default_write_config() -> pci_update_mappings())
>>> 
>>> This is the part that's actually placing the BAR MemoryRegion as a
>>> sub-region into the vCPU address space.  I think if you track it,
>>> you'll see PCIDevice.io_regions[i].address_space is actually
>>> system_memory, which is used to initialize address_space_system.
>>> 
>>> The machine assembles PCI devices onto buses as instructed by the
>>> command line or hot plug operations.  It's the responsibility of the
>>> guest firmware and guest OS to probe those devices, size the BARs, and
>>> place the BARs into the memory hierarchy of the PCI bus, ie. system
>>> memory.  The BARs are necessarily in the "guest physical memory" for
>>> vCPU access, but it's essentially only coincidental that PCI devices
>>> might be in an address space that provides a mapping to their own BAR.
>>> There's no reason to ever use it.
>>> 
>>> In the vIOMMU case, we can't know that the device address space
>>> includes those BAR mappings or if they do, that they're identity mapped
>>> to the physical address.  Devices really need to not infer anything
>>> about an address.  Think about real hardware, a device is told by
>>> driver programming to perform a DMA operation.  The device doesn't know
>>> the target of that operation, it's the guest driver's responsibility to
>>> make sure the IOVA within the device address space is valid and maps to
>>> the desired target.  Thanks,
>> 
>> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
>> helped to clarify this problem.
>> 
>> We have implemented the memory isolation based on the discussion in the
>> thread. We will send the patches out shortly.
>> 
>> Devices such as “name" and “e1000” worked fine. But I’d like to note that
>> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
>> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
>> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
>> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
>> the device to access other BAR regions by using the BAR address programmed
>> in the PCI config space. This happens even without vfio-user patches. For example,
>> we could enable IOMMU using “-device intel-iommu” QEMU option and also
>> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
>> In this case, we could see an IOMMU fault.
> 
> So, device accessing its own BAR is different. Basically, these
> transactions never go on the bus at all, never mind get to the IOMMU.

Hi Michael,

In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
address as if it was a DMA address.

> I think it's just used as a handle to address internal device memory.
> This kind of trick is not universal, but not terribly unusual.
> 
> 
>> Unfortunately, we started off our project with the LSI device. So that lead to all the
>> confusion about what is expected at the server end in-terms of
>> vectoring/address-translation. It gave an impression as if the request was still on
>> the CPU side of the PCI root complex, but the actual problem was with the
>> device driver itself.
>> 
>> I’m wondering how to deal with this problem. Would it be OK if we mapped the
>> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
>> This would help devices such as LSI to circumvent this problem. One problem
>> with this approach is that it has the potential to collide with another legitimate
>> IOVA address. Kindly share your thought on this.
>> 
>> Thank you!
> 
> I am not 100% sure what do you plan to do but it sounds fine since even
> if it collides, with traditional PCI device must never initiate cycles

OK sounds good, I’ll create a mapping of the device BARs in the IOVA.

Thank you!
--
Jag

> within their own BAR range, and PCIe is software-compatible with PCI. So
> devices won't be able to access this IOVA even if it was programmed in
> the IOMMU.
> 
> As was mentioned elsewhere on this thread, devices accessing each
> other's BAR is a different matter.
> 
> I do not remember which rules apply to multiple functions of a
> multi-function device though. I think in a traditional PCI
> they will never go out on the bus, but with e.g. SRIOV they
> would probably do go out? Alex, any idea?
> 
> 
>> --
>> Jag
>> 
>>> 
>>> Alex
>>> 
>> 
>
Michael S. Tsirkin Feb. 10, 2022, 10:53 p.m. UTC | #35
On Thu, Feb 10, 2022 at 10:23:01PM +0000, Jag Raman wrote:
> 
> 
> > On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > 
> > On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:
> >> 
> >> 
> >>> On Feb 2, 2022, at 12:34 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>> 
> >>> On Wed, 2 Feb 2022 01:13:22 +0000
> >>> Jag Raman <jag.raman@oracle.com> wrote:
> >>> 
> >>>>> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>> 
> >>>>> On Tue, 1 Feb 2022 21:24:08 +0000
> >>>>> Jag Raman <jag.raman@oracle.com> wrote:
> >>>>> 
> >>>>>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>>>> 
> >>>>>>> On Tue, 1 Feb 2022 09:30:35 +0000
> >>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>>>> 
> >>>>>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
> >>>>>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
> >>>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>>>>>>>> 
> >>>>>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
> >>>>>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
> >>>>>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
> >>>>>>>>>> 
> >>>>>>>>>> The issue Dave raised is that vfio-user servers run in separate
> >>>>>>>>>> processses from QEMU with shared memory access to RAM but no direct
> >>>>>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
> >>>>>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
> >>>>>>>>>> requests.
> >>>>>>>>>> 
> >>>>>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
> >>>>>>>>>> protocol already has messages that vfio-user servers can use as a
> >>>>>>>>>> fallback when DMA cannot be completed through the shared memory RAM
> >>>>>>>>>> accesses.
> >>>>>>>>>> 
> >>>>>>>>>>> In
> >>>>>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
> >>>>>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
> >>>>>>>>>>> physical hardware properties or specifications could we leverage to
> >>>>>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
> >>>>>>>>>>> type to provide consistency between devices?  Should each "isolated"
> >>>>>>>>>>> bus be in a separate root complex?  Thanks,        
> >>>>>>>>>> 
> >>>>>>>>>> There is a separate issue in this patch series regarding isolating the
> >>>>>>>>>> address space where BAR accesses are made (i.e. the global
> >>>>>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
> >>>>>>>>>> server instances (e.g. a software-defined network switch with multiple
> >>>>>>>>>> ethernet devices) then each instance needs isolated memory and io address
> >>>>>>>>>> spaces so that vfio-user clients don't cause collisions when they map
> >>>>>>>>>> BARs to the same address.
> >>>>>>>>>> 
> >>>>>>>>>> I think the the separate root complex idea is a good solution. This
> >>>>>>>>>> patch series takes a different approach by adding the concept of
> >>>>>>>>>> isolated address spaces into hw/pci/.      
> >>>>>>>>> 
> >>>>>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
> >>>>>>>>> same vCPU address space, perhaps with the exception of when they're
> >>>>>>>>> being sized, but DMA should be disabled during sizing.
> >>>>>>>>> 
> >>>>>>>>> Devices within the same VM context with identical BARs would need to
> >>>>>>>>> operate in different address spaces.  For example a translation offset
> >>>>>>>>> in the vCPU address space would allow unique addressing to the devices,
> >>>>>>>>> perhaps using the translation offset bits to address a root complex and
> >>>>>>>>> masking those bits for downstream transactions.
> >>>>>>>>> 
> >>>>>>>>> In general, the device simply operates in an address space, ie. an
> >>>>>>>>> IOVA.  When a mapping is made within that address space, we perform a
> >>>>>>>>> translation as necessary to generate a guest physical address.  The
> >>>>>>>>> IOVA itself is only meaningful within the context of the address space,
> >>>>>>>>> there is no requirement or expectation for it to be globally unique.
> >>>>>>>>> 
> >>>>>>>>> If the vfio-user server is making some sort of requirement that IOVAs
> >>>>>>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
> >>>>>>>> 
> >>>>>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
> >>>>>>>> 
> >>>>>>>> The issue is that there can be as many guest physical address spaces as
> >>>>>>>> there are vfio-user clients connected, so per-client isolated address
> >>>>>>>> spaces are required. This patch series has a solution to that problem
> >>>>>>>> with the new pci_isol_as_mem/io() API.    
> >>>>>>> 
> >>>>>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
> >>>>>>> devices across many VMs (I'm not sure if you're referring to the device
> >>>>>>> or the VM as a client) needs to deal with different address spaces per
> >>>>>>> device.  The server needs to be able to uniquely identify every DMA,
> >>>>>>> which must be part of the interface protocol.  But I don't see how that
> >>>>>>> imposes a requirement of an isolated address space.  If we want the
> >>>>>>> device isolated because we don't trust the server, that's where an IOMMU
> >>>>>>> provides per device isolation.  What is the restriction of the
> >>>>>>> per-client isolated address space and why do we need it?  The server
> >>>>>>> needing to support multiple clients is not a sufficient answer to
> >>>>>>> impose new PCI bus types with an implicit restriction on the VM.    
> >>>>>> 
> >>>>>> Hi Alex,
> >>>>>> 
> >>>>>> I believe there are two separate problems with running PCI devices in
> >>>>>> the vfio-user server. The first one is concerning memory isolation and
> >>>>>> second one is vectoring of BAR accesses (as explained below).
> >>>>>> 
> >>>>>> In our previous patches (v3), we used an IOMMU to isolate memory
> >>>>>> spaces. But we still had trouble with the vectoring. So we implemented
> >>>>>> separate address spaces for each PCIBus to tackle both problems
> >>>>>> simultaneously, based on the feedback we got.
> >>>>>> 
> >>>>>> The following gives an overview of issues concerning vectoring of
> >>>>>> BAR accesses.
> >>>>>> 
> >>>>>> The device’s BAR regions are mapped into the guest physical address
> >>>>>> space. The guest writes the guest PA of each BAR into the device’s BAR
> >>>>>> registers. To access the BAR regions of the device, QEMU uses
> >>>>>> address_space_rw() which vectors the physical address access to the
> >>>>>> device BAR region handlers.  
> >>>>> 
> >>>>> The guest physical address written to the BAR is irrelevant from the
> >>>>> device perspective, this only serves to assign the BAR an offset within
> >>>>> the address_space_mem, which is used by the vCPU (and possibly other
> >>>>> devices depending on their address space).  There is no reason for the
> >>>>> device itself to care about this address.  
> >>>> 
> >>>> Thank you for the explanation, Alex!
> >>>> 
> >>>> The confusion at my part is whether we are inside the device already when
> >>>> the server receives a request to access BAR region of a device. Based on
> >>>> your explanation, I get that your view is the BAR access request has
> >>>> propagated into the device already, whereas I was under the impression
> >>>> that the request is still on the CPU side of the PCI root complex.
> >>> 
> >>> If you are getting an access through your MemoryRegionOps, all the
> >>> translations have been made, you simply need to use the hwaddr as the
> >>> offset into the MemoryRegion for the access.  Perform the read/write to
> >>> your device, no further translations required.
> >>> 
> >>>> Your view makes sense to me - once the BAR access request reaches the
> >>>> client (on the other side), we could consider that the request has reached
> >>>> the device.
> >>>> 
> >>>> On a separate note, if devices don’t care about the values in BAR
> >>>> registers, why do the default PCI config handlers intercept and map
> >>>> the BAR region into address_space_mem?
> >>>> (pci_default_write_config() -> pci_update_mappings())
> >>> 
> >>> This is the part that's actually placing the BAR MemoryRegion as a
> >>> sub-region into the vCPU address space.  I think if you track it,
> >>> you'll see PCIDevice.io_regions[i].address_space is actually
> >>> system_memory, which is used to initialize address_space_system.
> >>> 
> >>> The machine assembles PCI devices onto buses as instructed by the
> >>> command line or hot plug operations.  It's the responsibility of the
> >>> guest firmware and guest OS to probe those devices, size the BARs, and
> >>> place the BARs into the memory hierarchy of the PCI bus, ie. system
> >>> memory.  The BARs are necessarily in the "guest physical memory" for
> >>> vCPU access, but it's essentially only coincidental that PCI devices
> >>> might be in an address space that provides a mapping to their own BAR.
> >>> There's no reason to ever use it.
> >>> 
> >>> In the vIOMMU case, we can't know that the device address space
> >>> includes those BAR mappings or if they do, that they're identity mapped
> >>> to the physical address.  Devices really need to not infer anything
> >>> about an address.  Think about real hardware, a device is told by
> >>> driver programming to perform a DMA operation.  The device doesn't know
> >>> the target of that operation, it's the guest driver's responsibility to
> >>> make sure the IOVA within the device address space is valid and maps to
> >>> the desired target.  Thanks,
> >> 
> >> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> >> helped to clarify this problem.
> >> 
> >> We have implemented the memory isolation based on the discussion in the
> >> thread. We will send the patches out shortly.
> >> 
> >> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> >> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> >> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> >> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> >> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> >> the device to access other BAR regions by using the BAR address programmed
> >> in the PCI config space. This happens even without vfio-user patches. For example,
> >> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> >> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> >> In this case, we could see an IOMMU fault.
> > 
> > So, device accessing its own BAR is different. Basically, these
> > transactions never go on the bus at all, never mind get to the IOMMU.
> 
> Hi Michael,
> 
> In LSI case, I did notice that it went to the IOMMU.

Hmm do you mean you analyzed how a physical device works?
Or do you mean in QEMU?

> The device is reading the BAR
> address as if it was a DMA address.

I got that, my understanding of PCI was that a device can
not be both a master and a target of a transaction at
the same time though. Could not find this in the spec though,
maybe I remember incorrectly.

> > I think it's just used as a handle to address internal device memory.
> > This kind of trick is not universal, but not terribly unusual.
> > 
> > 
> >> Unfortunately, we started off our project with the LSI device. So that lead to all the
> >> confusion about what is expected at the server end in-terms of
> >> vectoring/address-translation. It gave an impression as if the request was still on
> >> the CPU side of the PCI root complex, but the actual problem was with the
> >> device driver itself.
> >> 
> >> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> >> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> >> This would help devices such as LSI to circumvent this problem. One problem
> >> with this approach is that it has the potential to collide with another legitimate
> >> IOVA address. Kindly share your thought on this.
> >> 
> >> Thank you!
> > 
> > I am not 100% sure what do you plan to do but it sounds fine since even
> > if it collides, with traditional PCI device must never initiate cycles
> 
> OK sounds good, I’ll create a mapping of the device BARs in the IOVA.
> 
> Thank you!
> --
> Jag
> 
> > within their own BAR range, and PCIe is software-compatible with PCI. So
> > devices won't be able to access this IOVA even if it was programmed in
> > the IOMMU.
> > 
> > As was mentioned elsewhere on this thread, devices accessing each
> > other's BAR is a different matter.
> > 
> > I do not remember which rules apply to multiple functions of a
> > multi-function device though. I think in a traditional PCI
> > they will never go out on the bus, but with e.g. SRIOV they
> > would probably do go out? Alex, any idea?
> > 
> > 
> >> --
> >> Jag
> >> 
> >>> 
> >>> Alex
> >>> 
> >> 
> > 
>
Alex Williamson Feb. 10, 2022, 11:17 p.m. UTC | #36
On Thu, 10 Feb 2022 22:23:01 +0000
Jag Raman <jag.raman@oracle.com> wrote:

> > On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > 
> > On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:  
> >> 
> >> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> >> helped to clarify this problem.
> >> 
> >> We have implemented the memory isolation based on the discussion in the
> >> thread. We will send the patches out shortly.
> >> 
> >> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> >> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> >> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> >> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> >> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> >> the device to access other BAR regions by using the BAR address programmed
> >> in the PCI config space. This happens even without vfio-user patches. For example,
> >> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> >> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> >> In this case, we could see an IOMMU fault.  
> > 
> > So, device accessing its own BAR is different. Basically, these
> > transactions never go on the bus at all, never mind get to the IOMMU.  
> 
> Hi Michael,
> 
> In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
> address as if it was a DMA address.
> 
> > I think it's just used as a handle to address internal device memory.
> > This kind of trick is not universal, but not terribly unusual.
> > 
> >   
> >> Unfortunately, we started off our project with the LSI device. So that lead to all the
> >> confusion about what is expected at the server end in-terms of
> >> vectoring/address-translation. It gave an impression as if the request was still on
> >> the CPU side of the PCI root complex, but the actual problem was with the
> >> device driver itself.
> >> 
> >> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> >> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> >> This would help devices such as LSI to circumvent this problem. One problem
> >> with this approach is that it has the potential to collide with another legitimate
> >> IOVA address. Kindly share your thought on this.
> >> 
> >> Thank you!  
> > 
> > I am not 100% sure what do you plan to do but it sounds fine since even
> > if it collides, with traditional PCI device must never initiate cycles  
> 
> OK sounds good, I’ll create a mapping of the device BARs in the IOVA.

I don't think this is correct.  Look for instance at ACPI _TRA support
where a system can specify a translation offset such that, for example,
a CPU access to a device is required to add the provided offset to the
bus address of the device.  A system using this could have multiple
root bridges, where each is given the same, overlapping MMIO aperture.
From the processor perspective, each MMIO range is unique and possibly
none of those devices have a zero _TRA, there could be system memory at
the equivalent flat memory address.

So if the transaction actually hits this bus, which I think is what
making use of the device AddressSpace implies, I don't think it can
assume that it's simply reflected back at itself.  Conventional PCI and
PCI Express may be software compatible, but there's a reason we don't
see IOMMUs that provide both translation and isolation in conventional
topologies.

Is this more a bug in the LSI device emulation model?  For instance in
vfio-pci, if I want to access an offset into a BAR from within QEMU, I
don't care what address is programmed into that BAR, I perform an
access relative to the vfio file descriptor region representing that
BAR space.  I'd expect that any viable device emulation model does the
same, an access to device memory uses an offset from an internal
resource, irrespective of the BAR address.

It would seem strange if the driver is actually programming the device
to DMA to itself and if that's actually happening, I'd wonder if this
driver is actually compatible with an IOMMU on bare metal.

> > within their own BAR range, and PCIe is software-compatible with PCI. So
> > devices won't be able to access this IOVA even if it was programmed in
> > the IOMMU.
> > 
> > As was mentioned elsewhere on this thread, devices accessing each
> > other's BAR is a different matter.
> > 
> > I do not remember which rules apply to multiple functions of a
> > multi-function device though. I think in a traditional PCI
> > they will never go out on the bus, but with e.g. SRIOV they
> > would probably do go out? Alex, any idea?

This falls under implementation specific behavior in the spec, IIRC.
This is actually why IOMMU grouping requires ACS support on
multi-function devices to clarify the behavior of p2p between functions
in the same slot.  Thanks,

Alex
Michael S. Tsirkin Feb. 10, 2022, 11:28 p.m. UTC | #37
On Thu, Feb 10, 2022 at 04:17:34PM -0700, Alex Williamson wrote:
> On Thu, 10 Feb 2022 22:23:01 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> 
> > > On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > 
> > > On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:  
> > >> 
> > >> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> > >> helped to clarify this problem.
> > >> 
> > >> We have implemented the memory isolation based on the discussion in the
> > >> thread. We will send the patches out shortly.
> > >> 
> > >> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> > >> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> > >> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> > >> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> > >> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> > >> the device to access other BAR regions by using the BAR address programmed
> > >> in the PCI config space. This happens even without vfio-user patches. For example,
> > >> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> > >> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> > >> In this case, we could see an IOMMU fault.  
> > > 
> > > So, device accessing its own BAR is different. Basically, these
> > > transactions never go on the bus at all, never mind get to the IOMMU.  
> > 
> > Hi Michael,
> > 
> > In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
> > address as if it was a DMA address.
> > 
> > > I think it's just used as a handle to address internal device memory.
> > > This kind of trick is not universal, but not terribly unusual.
> > > 
> > >   
> > >> Unfortunately, we started off our project with the LSI device. So that lead to all the
> > >> confusion about what is expected at the server end in-terms of
> > >> vectoring/address-translation. It gave an impression as if the request was still on
> > >> the CPU side of the PCI root complex, but the actual problem was with the
> > >> device driver itself.
> > >> 
> > >> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> > >> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> > >> This would help devices such as LSI to circumvent this problem. One problem
> > >> with this approach is that it has the potential to collide with another legitimate
> > >> IOVA address. Kindly share your thought on this.
> > >> 
> > >> Thank you!  
> > > 
> > > I am not 100% sure what do you plan to do but it sounds fine since even
> > > if it collides, with traditional PCI device must never initiate cycles  
> > 
> > OK sounds good, I’ll create a mapping of the device BARs in the IOVA.
> 
> I don't think this is correct.  Look for instance at ACPI _TRA support
> where a system can specify a translation offset such that, for example,
> a CPU access to a device is required to add the provided offset to the
> bus address of the device.  A system using this could have multiple
> root bridges, where each is given the same, overlapping MMIO aperture.
> >From the processor perspective, each MMIO range is unique and possibly
> none of those devices have a zero _TRA, there could be system memory at
> the equivalent flat memory address.

I am guessing there are reasons to have these in acpi besides firmware
vendors wanting to find corner cases in device implementations though
:). E.g. it's possible something else is tweaking DMA in similar ways. I
can't say for sure and I wonder why do we care as long as QEMU does not
have _TRA.


> So if the transaction actually hits this bus, which I think is what
> making use of the device AddressSpace implies, I don't think it can
> assume that it's simply reflected back at itself.  Conventional PCI and
> PCI Express may be software compatible, but there's a reason we don't
> see IOMMUs that provide both translation and isolation in conventional
> topologies.
> 
> Is this more a bug in the LSI device emulation model?  For instance in
> vfio-pci, if I want to access an offset into a BAR from within QEMU, I
> don't care what address is programmed into that BAR, I perform an
> access relative to the vfio file descriptor region representing that
> BAR space.  I'd expect that any viable device emulation model does the
> same, an access to device memory uses an offset from an internal
> resource, irrespective of the BAR address.

However, using BAR seems like a reasonable shortcut allowing
device to use the same 64 bit address to refer to system
and device RAM interchangeably.

> It would seem strange if the driver is actually programming the device
> to DMA to itself and if that's actually happening, I'd wonder if this
> driver is actually compatible with an IOMMU on bare metal.

You know, it's hardware after all. As long as things work vendors
happily ship the device. They don't really worry about theoretical issues ;).

> > > within their own BAR range, and PCIe is software-compatible with PCI. So
> > > devices won't be able to access this IOVA even if it was programmed in
> > > the IOMMU.
> > > 
> > > As was mentioned elsewhere on this thread, devices accessing each
> > > other's BAR is a different matter.
> > > 
> > > I do not remember which rules apply to multiple functions of a
> > > multi-function device though. I think in a traditional PCI
> > > they will never go out on the bus, but with e.g. SRIOV they
> > > would probably do go out? Alex, any idea?
> 
> This falls under implementation specific behavior in the spec, IIRC.
> This is actually why IOMMU grouping requires ACS support on
> multi-function devices to clarify the behavior of p2p between functions
> in the same slot.  Thanks,
> 
> Alex

thanks!
Jag Raman Feb. 10, 2022, 11:46 p.m. UTC | #38
> On Feb 10, 2022, at 5:53 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> On Thu, Feb 10, 2022 at 10:23:01PM +0000, Jag Raman wrote:
>> 
>> 
>>> On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> 
>>> On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:
>>>> 
>>>> 
>>>>> On Feb 2, 2022, at 12:34 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>> 
>>>>> On Wed, 2 Feb 2022 01:13:22 +0000
>>>>> Jag Raman <jag.raman@oracle.com> wrote:
>>>>> 
>>>>>>> On Feb 1, 2022, at 5:47 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>>> 
>>>>>>> On Tue, 1 Feb 2022 21:24:08 +0000
>>>>>>> Jag Raman <jag.raman@oracle.com> wrote:
>>>>>>> 
>>>>>>>>> On Feb 1, 2022, at 10:24 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>>>>> 
>>>>>>>>> On Tue, 1 Feb 2022 09:30:35 +0000
>>>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>>>> 
>>>>>>>>>> On Mon, Jan 31, 2022 at 09:16:23AM -0700, Alex Williamson wrote:    
>>>>>>>>>>> On Fri, 28 Jan 2022 09:18:08 +0000
>>>>>>>>>>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>>>>>>>>>> 
>>>>>>>>>>>> On Thu, Jan 27, 2022 at 02:22:53PM -0700, Alex Williamson wrote:      
>>>>>>>>>>>>> If the goal here is to restrict DMA between devices, ie. peer-to-peer
>>>>>>>>>>>>> (p2p), why are we trying to re-invent what an IOMMU already does?        
>>>>>>>>>>>> 
>>>>>>>>>>>> The issue Dave raised is that vfio-user servers run in separate
>>>>>>>>>>>> processses from QEMU with shared memory access to RAM but no direct
>>>>>>>>>>>> access to non-RAM MemoryRegions. The virtiofs DAX Window BAR is one
>>>>>>>>>>>> example of a non-RAM MemoryRegion that can be the source/target of DMA
>>>>>>>>>>>> requests.
>>>>>>>>>>>> 
>>>>>>>>>>>> I don't think IOMMUs solve this problem but luckily the vfio-user
>>>>>>>>>>>> protocol already has messages that vfio-user servers can use as a
>>>>>>>>>>>> fallback when DMA cannot be completed through the shared memory RAM
>>>>>>>>>>>> accesses.
>>>>>>>>>>>> 
>>>>>>>>>>>>> In
>>>>>>>>>>>>> fact, it seems like an IOMMU does this better in providing an IOVA
>>>>>>>>>>>>> address space per BDF.  Is the dynamic mapping overhead too much?  What
>>>>>>>>>>>>> physical hardware properties or specifications could we leverage to
>>>>>>>>>>>>> restrict p2p mappings to a device?  Should it be governed by machine
>>>>>>>>>>>>> type to provide consistency between devices?  Should each "isolated"
>>>>>>>>>>>>> bus be in a separate root complex?  Thanks,        
>>>>>>>>>>>> 
>>>>>>>>>>>> There is a separate issue in this patch series regarding isolating the
>>>>>>>>>>>> address space where BAR accesses are made (i.e. the global
>>>>>>>>>>>> address_space_memory/io). When one process hosts multiple vfio-user
>>>>>>>>>>>> server instances (e.g. a software-defined network switch with multiple
>>>>>>>>>>>> ethernet devices) then each instance needs isolated memory and io address
>>>>>>>>>>>> spaces so that vfio-user clients don't cause collisions when they map
>>>>>>>>>>>> BARs to the same address.
>>>>>>>>>>>> 
>>>>>>>>>>>> I think the the separate root complex idea is a good solution. This
>>>>>>>>>>>> patch series takes a different approach by adding the concept of
>>>>>>>>>>>> isolated address spaces into hw/pci/.      
>>>>>>>>>>> 
>>>>>>>>>>> This all still seems pretty sketchy, BARs cannot overlap within the
>>>>>>>>>>> same vCPU address space, perhaps with the exception of when they're
>>>>>>>>>>> being sized, but DMA should be disabled during sizing.
>>>>>>>>>>> 
>>>>>>>>>>> Devices within the same VM context with identical BARs would need to
>>>>>>>>>>> operate in different address spaces.  For example a translation offset
>>>>>>>>>>> in the vCPU address space would allow unique addressing to the devices,
>>>>>>>>>>> perhaps using the translation offset bits to address a root complex and
>>>>>>>>>>> masking those bits for downstream transactions.
>>>>>>>>>>> 
>>>>>>>>>>> In general, the device simply operates in an address space, ie. an
>>>>>>>>>>> IOVA.  When a mapping is made within that address space, we perform a
>>>>>>>>>>> translation as necessary to generate a guest physical address.  The
>>>>>>>>>>> IOVA itself is only meaningful within the context of the address space,
>>>>>>>>>>> there is no requirement or expectation for it to be globally unique.
>>>>>>>>>>> 
>>>>>>>>>>> If the vfio-user server is making some sort of requirement that IOVAs
>>>>>>>>>>> are unique across all devices, that seems very, very wrong.  Thanks,      
>>>>>>>>>> 
>>>>>>>>>> Yes, BARs and IOVAs don't need to be unique across all devices.
>>>>>>>>>> 
>>>>>>>>>> The issue is that there can be as many guest physical address spaces as
>>>>>>>>>> there are vfio-user clients connected, so per-client isolated address
>>>>>>>>>> spaces are required. This patch series has a solution to that problem
>>>>>>>>>> with the new pci_isol_as_mem/io() API.    
>>>>>>>>> 
>>>>>>>>> Sorry, this still doesn't follow for me.  A server that hosts multiple
>>>>>>>>> devices across many VMs (I'm not sure if you're referring to the device
>>>>>>>>> or the VM as a client) needs to deal with different address spaces per
>>>>>>>>> device.  The server needs to be able to uniquely identify every DMA,
>>>>>>>>> which must be part of the interface protocol.  But I don't see how that
>>>>>>>>> imposes a requirement of an isolated address space.  If we want the
>>>>>>>>> device isolated because we don't trust the server, that's where an IOMMU
>>>>>>>>> provides per device isolation.  What is the restriction of the
>>>>>>>>> per-client isolated address space and why do we need it?  The server
>>>>>>>>> needing to support multiple clients is not a sufficient answer to
>>>>>>>>> impose new PCI bus types with an implicit restriction on the VM.    
>>>>>>>> 
>>>>>>>> Hi Alex,
>>>>>>>> 
>>>>>>>> I believe there are two separate problems with running PCI devices in
>>>>>>>> the vfio-user server. The first one is concerning memory isolation and
>>>>>>>> second one is vectoring of BAR accesses (as explained below).
>>>>>>>> 
>>>>>>>> In our previous patches (v3), we used an IOMMU to isolate memory
>>>>>>>> spaces. But we still had trouble with the vectoring. So we implemented
>>>>>>>> separate address spaces for each PCIBus to tackle both problems
>>>>>>>> simultaneously, based on the feedback we got.
>>>>>>>> 
>>>>>>>> The following gives an overview of issues concerning vectoring of
>>>>>>>> BAR accesses.
>>>>>>>> 
>>>>>>>> The device’s BAR regions are mapped into the guest physical address
>>>>>>>> space. The guest writes the guest PA of each BAR into the device’s BAR
>>>>>>>> registers. To access the BAR regions of the device, QEMU uses
>>>>>>>> address_space_rw() which vectors the physical address access to the
>>>>>>>> device BAR region handlers.  
>>>>>>> 
>>>>>>> The guest physical address written to the BAR is irrelevant from the
>>>>>>> device perspective, this only serves to assign the BAR an offset within
>>>>>>> the address_space_mem, which is used by the vCPU (and possibly other
>>>>>>> devices depending on their address space).  There is no reason for the
>>>>>>> device itself to care about this address.  
>>>>>> 
>>>>>> Thank you for the explanation, Alex!
>>>>>> 
>>>>>> The confusion at my part is whether we are inside the device already when
>>>>>> the server receives a request to access BAR region of a device. Based on
>>>>>> your explanation, I get that your view is the BAR access request has
>>>>>> propagated into the device already, whereas I was under the impression
>>>>>> that the request is still on the CPU side of the PCI root complex.
>>>>> 
>>>>> If you are getting an access through your MemoryRegionOps, all the
>>>>> translations have been made, you simply need to use the hwaddr as the
>>>>> offset into the MemoryRegion for the access.  Perform the read/write to
>>>>> your device, no further translations required.
>>>>> 
>>>>>> Your view makes sense to me - once the BAR access request reaches the
>>>>>> client (on the other side), we could consider that the request has reached
>>>>>> the device.
>>>>>> 
>>>>>> On a separate note, if devices don’t care about the values in BAR
>>>>>> registers, why do the default PCI config handlers intercept and map
>>>>>> the BAR region into address_space_mem?
>>>>>> (pci_default_write_config() -> pci_update_mappings())
>>>>> 
>>>>> This is the part that's actually placing the BAR MemoryRegion as a
>>>>> sub-region into the vCPU address space.  I think if you track it,
>>>>> you'll see PCIDevice.io_regions[i].address_space is actually
>>>>> system_memory, which is used to initialize address_space_system.
>>>>> 
>>>>> The machine assembles PCI devices onto buses as instructed by the
>>>>> command line or hot plug operations.  It's the responsibility of the
>>>>> guest firmware and guest OS to probe those devices, size the BARs, and
>>>>> place the BARs into the memory hierarchy of the PCI bus, ie. system
>>>>> memory.  The BARs are necessarily in the "guest physical memory" for
>>>>> vCPU access, but it's essentially only coincidental that PCI devices
>>>>> might be in an address space that provides a mapping to their own BAR.
>>>>> There's no reason to ever use it.
>>>>> 
>>>>> In the vIOMMU case, we can't know that the device address space
>>>>> includes those BAR mappings or if they do, that they're identity mapped
>>>>> to the physical address.  Devices really need to not infer anything
>>>>> about an address.  Think about real hardware, a device is told by
>>>>> driver programming to perform a DMA operation.  The device doesn't know
>>>>> the target of that operation, it's the guest driver's responsibility to
>>>>> make sure the IOVA within the device address space is valid and maps to
>>>>> the desired target.  Thanks,
>>>> 
>>>> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
>>>> helped to clarify this problem.
>>>> 
>>>> We have implemented the memory isolation based on the discussion in the
>>>> thread. We will send the patches out shortly.
>>>> 
>>>> Devices such as “name" and “e1000” worked fine. But I’d like to note that
>>>> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
>>>> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
>>>> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
>>>> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
>>>> the device to access other BAR regions by using the BAR address programmed
>>>> in the PCI config space. This happens even without vfio-user patches. For example,
>>>> we could enable IOMMU using “-device intel-iommu” QEMU option and also
>>>> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
>>>> In this case, we could see an IOMMU fault.
>>> 
>>> So, device accessing its own BAR is different. Basically, these
>>> transactions never go on the bus at all, never mind get to the IOMMU.
>> 
>> Hi Michael,
>> 
>> In LSI case, I did notice that it went to the IOMMU.
> 
> Hmm do you mean you analyzed how a physical device works?
> Or do you mean in QEMU?

I mean in QEMU, I did not analyze a physical device.
> 
>> The device is reading the BAR
>> address as if it was a DMA address.
> 
> I got that, my understanding of PCI was that a device can
> not be both a master and a target of a transaction at
> the same time though. Could not find this in the spec though,
> maybe I remember incorrectly.

I see, OK. If this were to happen in a real device, PCI would raise
an error because the master and target of a transaction can’t be
the same. So you believe that this access is handled inside the
device, and doesn’t go out.

Thanks!
--
Jag

> 
>>> I think it's just used as a handle to address internal device memory.
>>> This kind of trick is not universal, but not terribly unusual.
>>> 
>>> 
>>>> Unfortunately, we started off our project with the LSI device. So that lead to all the
>>>> confusion about what is expected at the server end in-terms of
>>>> vectoring/address-translation. It gave an impression as if the request was still on
>>>> the CPU side of the PCI root complex, but the actual problem was with the
>>>> device driver itself.
>>>> 
>>>> I’m wondering how to deal with this problem. Would it be OK if we mapped the
>>>> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
>>>> This would help devices such as LSI to circumvent this problem. One problem
>>>> with this approach is that it has the potential to collide with another legitimate
>>>> IOVA address. Kindly share your thought on this.
>>>> 
>>>> Thank you!
>>> 
>>> I am not 100% sure what do you plan to do but it sounds fine since even
>>> if it collides, with traditional PCI device must never initiate cycles
>> 
>> OK sounds good, I’ll create a mapping of the device BARs in the IOVA.
>> 
>> Thank you!
>> --
>> Jag
>> 
>>> within their own BAR range, and PCIe is software-compatible with PCI. So
>>> devices won't be able to access this IOVA even if it was programmed in
>>> the IOMMU.
>>> 
>>> As was mentioned elsewhere on this thread, devices accessing each
>>> other's BAR is a different matter.
>>> 
>>> I do not remember which rules apply to multiple functions of a
>>> multi-function device though. I think in a traditional PCI
>>> they will never go out on the bus, but with e.g. SRIOV they
>>> would probably do go out? Alex, any idea?
>>> 
>>> 
>>>> --
>>>> Jag
>>>> 
>>>>> 
>>>>> Alex
Alex Williamson Feb. 10, 2022, 11:49 p.m. UTC | #39
On Thu, 10 Feb 2022 18:28:56 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Feb 10, 2022 at 04:17:34PM -0700, Alex Williamson wrote:
> > On Thu, 10 Feb 2022 22:23:01 +0000
> > Jag Raman <jag.raman@oracle.com> wrote:
> >   
> > > > On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > 
> > > > On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:    
> > > >> 
> > > >> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> > > >> helped to clarify this problem.
> > > >> 
> > > >> We have implemented the memory isolation based on the discussion in the
> > > >> thread. We will send the patches out shortly.
> > > >> 
> > > >> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> > > >> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> > > >> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> > > >> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> > > >> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> > > >> the device to access other BAR regions by using the BAR address programmed
> > > >> in the PCI config space. This happens even without vfio-user patches. For example,
> > > >> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> > > >> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> > > >> In this case, we could see an IOMMU fault.    
> > > > 
> > > > So, device accessing its own BAR is different. Basically, these
> > > > transactions never go on the bus at all, never mind get to the IOMMU.    
> > > 
> > > Hi Michael,
> > > 
> > > In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
> > > address as if it was a DMA address.
> > >   
> > > > I think it's just used as a handle to address internal device memory.
> > > > This kind of trick is not universal, but not terribly unusual.
> > > > 
> > > >     
> > > >> Unfortunately, we started off our project with the LSI device. So that lead to all the
> > > >> confusion about what is expected at the server end in-terms of
> > > >> vectoring/address-translation. It gave an impression as if the request was still on
> > > >> the CPU side of the PCI root complex, but the actual problem was with the
> > > >> device driver itself.
> > > >> 
> > > >> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> > > >> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> > > >> This would help devices such as LSI to circumvent this problem. One problem
> > > >> with this approach is that it has the potential to collide with another legitimate
> > > >> IOVA address. Kindly share your thought on this.
> > > >> 
> > > >> Thank you!    
> > > > 
> > > > I am not 100% sure what do you plan to do but it sounds fine since even
> > > > if it collides, with traditional PCI device must never initiate cycles    
> > > 
> > > OK sounds good, I’ll create a mapping of the device BARs in the IOVA.  
> > 
> > I don't think this is correct.  Look for instance at ACPI _TRA support
> > where a system can specify a translation offset such that, for example,
> > a CPU access to a device is required to add the provided offset to the
> > bus address of the device.  A system using this could have multiple
> > root bridges, where each is given the same, overlapping MMIO aperture.  
> > >From the processor perspective, each MMIO range is unique and possibly  
> > none of those devices have a zero _TRA, there could be system memory at
> > the equivalent flat memory address.  
> 
> I am guessing there are reasons to have these in acpi besides firmware
> vendors wanting to find corner cases in device implementations though
> :). E.g. it's possible something else is tweaking DMA in similar ways. I
> can't say for sure and I wonder why do we care as long as QEMU does not
> have _TRA.

How many complaints do we get about running out of I/O port space on
q35 because we allow an arbitrary number of root ports?  What if we
used _TRA to provide the full I/O port range per root port?  32-bit
MMIO could be duplicated as well.

> > So if the transaction actually hits this bus, which I think is what
> > making use of the device AddressSpace implies, I don't think it can
> > assume that it's simply reflected back at itself.  Conventional PCI and
> > PCI Express may be software compatible, but there's a reason we don't
> > see IOMMUs that provide both translation and isolation in conventional
> > topologies.
> > 
> > Is this more a bug in the LSI device emulation model?  For instance in
> > vfio-pci, if I want to access an offset into a BAR from within QEMU, I
> > don't care what address is programmed into that BAR, I perform an
> > access relative to the vfio file descriptor region representing that
> > BAR space.  I'd expect that any viable device emulation model does the
> > same, an access to device memory uses an offset from an internal
> > resource, irrespective of the BAR address.  
> 
> However, using BAR seems like a reasonable shortcut allowing
> device to use the same 64 bit address to refer to system
> and device RAM interchangeably.

A shortcut that breaks when an IOMMU is involved.

> > It would seem strange if the driver is actually programming the device
> > to DMA to itself and if that's actually happening, I'd wonder if this
> > driver is actually compatible with an IOMMU on bare metal.  
> 
> You know, it's hardware after all. As long as things work vendors
> happily ship the device. They don't really worry about theoretical issues ;).

We're talking about a 32-bit conventional PCI device from the previous
century.  IOMMUs are no longer theoretical, but not all drivers have
kept up.  It's maybe not the best choice as the de facto standard
device, imo.  Thanks,

Alex
Jag Raman Feb. 11, 2022, 12:10 a.m. UTC | #40
> On Feb 10, 2022, at 6:17 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Thu, 10 Feb 2022 22:23:01 +0000
> Jag Raman <jag.raman@oracle.com> wrote:
> 
>>> On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>> 
>>> On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:  
>>>> 
>>>> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
>>>> helped to clarify this problem.
>>>> 
>>>> We have implemented the memory isolation based on the discussion in the
>>>> thread. We will send the patches out shortly.
>>>> 
>>>> Devices such as “name" and “e1000” worked fine. But I’d like to note that
>>>> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
>>>> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
>>>> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
>>>> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
>>>> the device to access other BAR regions by using the BAR address programmed
>>>> in the PCI config space. This happens even without vfio-user patches. For example,
>>>> we could enable IOMMU using “-device intel-iommu” QEMU option and also
>>>> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
>>>> In this case, we could see an IOMMU fault.  
>>> 
>>> So, device accessing its own BAR is different. Basically, these
>>> transactions never go on the bus at all, never mind get to the IOMMU.  
>> 
>> Hi Michael,
>> 
>> In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
>> address as if it was a DMA address.
>> 
>>> I think it's just used as a handle to address internal device memory.
>>> This kind of trick is not universal, but not terribly unusual.
>>> 
>>> 
>>>> Unfortunately, we started off our project with the LSI device. So that lead to all the
>>>> confusion about what is expected at the server end in-terms of
>>>> vectoring/address-translation. It gave an impression as if the request was still on
>>>> the CPU side of the PCI root complex, but the actual problem was with the
>>>> device driver itself.
>>>> 
>>>> I’m wondering how to deal with this problem. Would it be OK if we mapped the
>>>> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
>>>> This would help devices such as LSI to circumvent this problem. One problem
>>>> with this approach is that it has the potential to collide with another legitimate
>>>> IOVA address. Kindly share your thought on this.
>>>> 
>>>> Thank you!  
>>> 
>>> I am not 100% sure what do you plan to do but it sounds fine since even
>>> if it collides, with traditional PCI device must never initiate cycles  
>> 
>> OK sounds good, I’ll create a mapping of the device BARs in the IOVA.
> 
> I don't think this is correct.  Look for instance at ACPI _TRA support
> where a system can specify a translation offset such that, for example,
> a CPU access to a device is required to add the provided offset to the
> bus address of the device.  A system using this could have multiple
> root bridges, where each is given the same, overlapping MMIO aperture.
> From the processor perspective, each MMIO range is unique and possibly
> none of those devices have a zero _TRA, there could be system memory at
> the equivalent flat memory address.
> 
> So if the transaction actually hits this bus, which I think is what
> making use of the device AddressSpace implies, I don't think it can
> assume that it's simply reflected back at itself.  Conventional PCI and
> PCI Express may be software compatible, but there's a reason we don't
> see IOMMUs that provide both translation and isolation in conventional
> topologies.
> 
> Is this more a bug in the LSI device emulation model?  For instance in
> vfio-pci, if I want to access an offset into a BAR from within QEMU, I
> don't care what address is programmed into that BAR, I perform an
> access relative to the vfio file descriptor region representing that
> BAR space.  I'd expect that any viable device emulation model does the
> same, an access to device memory uses an offset from an internal
> resource, irrespective of the BAR address.
> 
> It would seem strange if the driver is actually programming the device
> to DMA to itself and if that's actually happening, I'd wonder if this

It does look like the driver is actually programming the device to DMA to itself.

The driver first programs the DSP (DMA Scripts Pointer) register with the BAR
address. It does so by performing a series of MMIO writes (lsi_mmio_write())
to offsets 0x2C - 0x2F. Immediately after programming this register, the device
fetches some instructions located at the programmed address.

Thank you!
--
Jag

> driver is actually compatible with an IOMMU on bare metal.
> 
>>> within their own BAR range, and PCIe is software-compatible with PCI. So
>>> devices won't be able to access this IOVA even if it was programmed in
>>> the IOMMU.
>>> 
>>> As was mentioned elsewhere on this thread, devices accessing each
>>> other's BAR is a different matter.
>>> 
>>> I do not remember which rules apply to multiple functions of a
>>> multi-function device though. I think in a traditional PCI
>>> they will never go out on the bus, but with e.g. SRIOV they
>>> would probably do go out? Alex, any idea?
> 
> This falls under implementation specific behavior in the spec, IIRC.
> This is actually why IOMMU grouping requires ACS support on
> multi-function devices to clarify the behavior of p2p between functions
> in the same slot.  Thanks,
> 
> Alex
>
Michael S. Tsirkin Feb. 11, 2022, 12:26 a.m. UTC | #41
On Thu, Feb 10, 2022 at 04:49:33PM -0700, Alex Williamson wrote:
> On Thu, 10 Feb 2022 18:28:56 -0500
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Thu, Feb 10, 2022 at 04:17:34PM -0700, Alex Williamson wrote:
> > > On Thu, 10 Feb 2022 22:23:01 +0000
> > > Jag Raman <jag.raman@oracle.com> wrote:
> > >   
> > > > > On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > 
> > > > > On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:    
> > > > >> 
> > > > >> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
> > > > >> helped to clarify this problem.
> > > > >> 
> > > > >> We have implemented the memory isolation based on the discussion in the
> > > > >> thread. We will send the patches out shortly.
> > > > >> 
> > > > >> Devices such as “name" and “e1000” worked fine. But I’d like to note that
> > > > >> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
> > > > >> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
> > > > >> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
> > > > >> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
> > > > >> the device to access other BAR regions by using the BAR address programmed
> > > > >> in the PCI config space. This happens even without vfio-user patches. For example,
> > > > >> we could enable IOMMU using “-device intel-iommu” QEMU option and also
> > > > >> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
> > > > >> In this case, we could see an IOMMU fault.    
> > > > > 
> > > > > So, device accessing its own BAR is different. Basically, these
> > > > > transactions never go on the bus at all, never mind get to the IOMMU.    
> > > > 
> > > > Hi Michael,
> > > > 
> > > > In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
> > > > address as if it was a DMA address.
> > > >   
> > > > > I think it's just used as a handle to address internal device memory.
> > > > > This kind of trick is not universal, but not terribly unusual.
> > > > > 
> > > > >     
> > > > >> Unfortunately, we started off our project with the LSI device. So that lead to all the
> > > > >> confusion about what is expected at the server end in-terms of
> > > > >> vectoring/address-translation. It gave an impression as if the request was still on
> > > > >> the CPU side of the PCI root complex, but the actual problem was with the
> > > > >> device driver itself.
> > > > >> 
> > > > >> I’m wondering how to deal with this problem. Would it be OK if we mapped the
> > > > >> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
> > > > >> This would help devices such as LSI to circumvent this problem. One problem
> > > > >> with this approach is that it has the potential to collide with another legitimate
> > > > >> IOVA address. Kindly share your thought on this.
> > > > >> 
> > > > >> Thank you!    
> > > > > 
> > > > > I am not 100% sure what do you plan to do but it sounds fine since even
> > > > > if it collides, with traditional PCI device must never initiate cycles    
> > > > 
> > > > OK sounds good, I’ll create a mapping of the device BARs in the IOVA.  
> > > 
> > > I don't think this is correct.  Look for instance at ACPI _TRA support
> > > where a system can specify a translation offset such that, for example,
> > > a CPU access to a device is required to add the provided offset to the
> > > bus address of the device.  A system using this could have multiple
> > > root bridges, where each is given the same, overlapping MMIO aperture.  
> > > >From the processor perspective, each MMIO range is unique and possibly  
> > > none of those devices have a zero _TRA, there could be system memory at
> > > the equivalent flat memory address.  
> > 
> > I am guessing there are reasons to have these in acpi besides firmware
> > vendors wanting to find corner cases in device implementations though
> > :). E.g. it's possible something else is tweaking DMA in similar ways. I
> > can't say for sure and I wonder why do we care as long as QEMU does not
> > have _TRA.
> 
> How many complaints do we get about running out of I/O port space on
> q35 because we allow an arbitrary number of root ports?  What if we
> used _TRA to provide the full I/O port range per root port?  32-bit
> MMIO could be duplicated as well.

It's an interesting idea. To clarify what I said, I suspect some devices
are broken in presence of translating bridges unless DMA
is also translated to match.

I agree it's a mess though, in that some devices when given their own
BAR to DMA to will probably just satisfy the access from internal
memory, while others will ignore that and send it up as DMA
and both types are probably out there in the field.


> > > So if the transaction actually hits this bus, which I think is what
> > > making use of the device AddressSpace implies, I don't think it can
> > > assume that it's simply reflected back at itself.  Conventional PCI and
> > > PCI Express may be software compatible, but there's a reason we don't
> > > see IOMMUs that provide both translation and isolation in conventional
> > > topologies.
> > > 
> > > Is this more a bug in the LSI device emulation model?  For instance in
> > > vfio-pci, if I want to access an offset into a BAR from within QEMU, I
> > > don't care what address is programmed into that BAR, I perform an
> > > access relative to the vfio file descriptor region representing that
> > > BAR space.  I'd expect that any viable device emulation model does the
> > > same, an access to device memory uses an offset from an internal
> > > resource, irrespective of the BAR address.  
> > 
> > However, using BAR seems like a reasonable shortcut allowing
> > device to use the same 64 bit address to refer to system
> > and device RAM interchangeably.
> 
> A shortcut that breaks when an IOMMU is involved.

Maybe. But if that's how hardware behaves, we have little choice but
emulate it.

> > > It would seem strange if the driver is actually programming the device
> > > to DMA to itself and if that's actually happening, I'd wonder if this
> > > driver is actually compatible with an IOMMU on bare metal.  
> > 
> > You know, it's hardware after all. As long as things work vendors
> > happily ship the device. They don't really worry about theoretical issues ;).
> 
> We're talking about a 32-bit conventional PCI device from the previous
> century.  IOMMUs are no longer theoretical, but not all drivers have
> kept up.  It's maybe not the best choice as the de facto standard
> device, imo.  Thanks,
> 
> Alex

More importantly lots devices most likely don't support arbitrary
configurations and break if you try to create something that matches the
spec but never or even rarely occurs on bare metal.
Jag Raman Feb. 11, 2022, 12:54 a.m. UTC | #42
> On Feb 10, 2022, at 7:26 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> On Thu, Feb 10, 2022 at 04:49:33PM -0700, Alex Williamson wrote:
>> On Thu, 10 Feb 2022 18:28:56 -0500
>> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> 
>>> On Thu, Feb 10, 2022 at 04:17:34PM -0700, Alex Williamson wrote:
>>>> On Thu, 10 Feb 2022 22:23:01 +0000
>>>> Jag Raman <jag.raman@oracle.com> wrote:
>>>> 
>>>>>> On Feb 10, 2022, at 3:02 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>>> 
>>>>>> On Thu, Feb 10, 2022 at 12:08:27AM +0000, Jag Raman wrote:    
>>>>>>> 
>>>>>>> Thanks for the explanation, Alex. Thanks to everyone else in the thread who
>>>>>>> helped to clarify this problem.
>>>>>>> 
>>>>>>> We have implemented the memory isolation based on the discussion in the
>>>>>>> thread. We will send the patches out shortly.
>>>>>>> 
>>>>>>> Devices such as “name" and “e1000” worked fine. But I’d like to note that
>>>>>>> the LSI device (TYPE_LSI53C895A) had some problems - it doesn’t seem
>>>>>>> to be IOMMU aware. In LSI’s case, the kernel driver is asking the device to
>>>>>>> read instructions from the CPU VA (lsi_execute_script() -> read_dword()),
>>>>>>> which is forbidden when IOMMU is enabled. Specifically, the driver is asking
>>>>>>> the device to access other BAR regions by using the BAR address programmed
>>>>>>> in the PCI config space. This happens even without vfio-user patches. For example,
>>>>>>> we could enable IOMMU using “-device intel-iommu” QEMU option and also
>>>>>>> adding the following to the kernel command-line: “intel_iommu=on iommu=nopt”.
>>>>>>> In this case, we could see an IOMMU fault.    
>>>>>> 
>>>>>> So, device accessing its own BAR is different. Basically, these
>>>>>> transactions never go on the bus at all, never mind get to the IOMMU.    
>>>>> 
>>>>> Hi Michael,
>>>>> 
>>>>> In LSI case, I did notice that it went to the IOMMU. The device is reading the BAR
>>>>> address as if it was a DMA address.
>>>>> 
>>>>>> I think it's just used as a handle to address internal device memory.
>>>>>> This kind of trick is not universal, but not terribly unusual.
>>>>>> 
>>>>>> 
>>>>>>> Unfortunately, we started off our project with the LSI device. So that lead to all the
>>>>>>> confusion about what is expected at the server end in-terms of
>>>>>>> vectoring/address-translation. It gave an impression as if the request was still on
>>>>>>> the CPU side of the PCI root complex, but the actual problem was with the
>>>>>>> device driver itself.
>>>>>>> 
>>>>>>> I’m wondering how to deal with this problem. Would it be OK if we mapped the
>>>>>>> device’s BAR into the IOVA, at the same CPU VA programmed in the BAR registers?
>>>>>>> This would help devices such as LSI to circumvent this problem. One problem
>>>>>>> with this approach is that it has the potential to collide with another legitimate
>>>>>>> IOVA address. Kindly share your thought on this.
>>>>>>> 
>>>>>>> Thank you!    
>>>>>> 
>>>>>> I am not 100% sure what do you plan to do but it sounds fine since even
>>>>>> if it collides, with traditional PCI device must never initiate cycles    
>>>>> 
>>>>> OK sounds good, I’ll create a mapping of the device BARs in the IOVA.  
>>>> 
>>>> I don't think this is correct.  Look for instance at ACPI _TRA support
>>>> where a system can specify a translation offset such that, for example,
>>>> a CPU access to a device is required to add the provided offset to the
>>>> bus address of the device.  A system using this could have multiple
>>>> root bridges, where each is given the same, overlapping MMIO aperture.  
>>>>> From the processor perspective, each MMIO range is unique and possibly  
>>>> none of those devices have a zero _TRA, there could be system memory at
>>>> the equivalent flat memory address.  
>>> 
>>> I am guessing there are reasons to have these in acpi besides firmware
>>> vendors wanting to find corner cases in device implementations though
>>> :). E.g. it's possible something else is tweaking DMA in similar ways. I
>>> can't say for sure and I wonder why do we care as long as QEMU does not
>>> have _TRA.
>> 
>> How many complaints do we get about running out of I/O port space on
>> q35 because we allow an arbitrary number of root ports?  What if we
>> used _TRA to provide the full I/O port range per root port?  32-bit
>> MMIO could be duplicated as well.
> 
> It's an interesting idea. To clarify what I said, I suspect some devices
> are broken in presence of translating bridges unless DMA
> is also translated to match.
> 
> I agree it's a mess though, in that some devices when given their own
> BAR to DMA to will probably just satisfy the access from internal
> memory, while others will ignore that and send it up as DMA
> and both types are probably out there in the field.
> 
> 
>>>> So if the transaction actually hits this bus, which I think is what
>>>> making use of the device AddressSpace implies, I don't think it can
>>>> assume that it's simply reflected back at itself.  Conventional PCI and
>>>> PCI Express may be software compatible, but there's a reason we don't
>>>> see IOMMUs that provide both translation and isolation in conventional
>>>> topologies.
>>>> 
>>>> Is this more a bug in the LSI device emulation model?  For instance in
>>>> vfio-pci, if I want to access an offset into a BAR from within QEMU, I
>>>> don't care what address is programmed into that BAR, I perform an
>>>> access relative to the vfio file descriptor region representing that
>>>> BAR space.  I'd expect that any viable device emulation model does the
>>>> same, an access to device memory uses an offset from an internal
>>>> resource, irrespective of the BAR address.  
>>> 
>>> However, using BAR seems like a reasonable shortcut allowing
>>> device to use the same 64 bit address to refer to system
>>> and device RAM interchangeably.
>> 
>> A shortcut that breaks when an IOMMU is involved.
> 
> Maybe. But if that's how hardware behaves, we have little choice but
> emulate it.

I was wondering if we could map the BARs into the IOVA for a limited set of
devices - the ones which are designed before IOMMU such as lsi53c895a.
This would ensure that we follow the spec to the best without breaking
existing devices?

--
Jag

> 
>>>> It would seem strange if the driver is actually programming the device
>>>> to DMA to itself and if that's actually happening, I'd wonder if this
>>>> driver is actually compatible with an IOMMU on bare metal.  
>>> 
>>> You know, it's hardware after all. As long as things work vendors
>>> happily ship the device. They don't really worry about theoretical issues ;).
>> 
>> We're talking about a 32-bit conventional PCI device from the previous
>> century.  IOMMUs are no longer theoretical, but not all drivers have
>> kept up.  It's maybe not the best choice as the de facto standard
>> device, imo.  Thanks,
>> 
>> Alex
> 
> More importantly lots devices most likely don't support arbitrary
> configurations and break if you try to create something that matches the
> spec but never or even rarely occurs on bare metal.
> 
> -- 
> MST
>
diff mbox series

Patch

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 023abc0f79..9bb4472abc 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -387,6 +387,8 @@  void pci_device_save(PCIDevice *s, QEMUFile *f);
 int pci_device_load(PCIDevice *s, QEMUFile *f);
 MemoryRegion *pci_address_space(PCIDevice *dev);
 MemoryRegion *pci_address_space_io(PCIDevice *dev);
+AddressSpace *pci_isol_as_mem(PCIDevice *dev);
+AddressSpace *pci_isol_as_io(PCIDevice *dev);
 
 /*
  * Should not normally be used by devices. For use by sPAPR target
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 347440d42c..d78258e79e 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -39,9 +39,26 @@  struct PCIBus {
     void *irq_opaque;
     PCIDevice *devices[PCI_SLOT_MAX * PCI_FUNC_MAX];
     PCIDevice *parent_dev;
+
     MemoryRegion *address_space_mem;
     MemoryRegion *address_space_io;
 
+    /**
+     * Isolated address spaces - these allow the PCI bus to be part
+     * of an isolated address space as opposed to the global
+     * address_space_memory & address_space_io. This allows the
+     * bus to be attached to CPUs from different machines. The
+     * following is not used used commonly.
+     *
+     * TYPE_REMOTE_MACHINE allows emulating devices from multiple
+     * VM clients, as such it needs the PCI buses in the same machine
+     * to be part of different CPU address spaces. The following is
+     * useful in that scenario.
+     *
+     */
+    AddressSpace *isol_as_mem;
+    AddressSpace *isol_as_io;
+
     QLIST_HEAD(, PCIBus) child; /* this will be replaced by qdev later */
     QLIST_ENTRY(PCIBus) sibling;/* this will be replaced by qdev later */
 
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 5d30f9ca60..d5f1c6c421 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -442,6 +442,8 @@  static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
     bus->slot_reserved_mask = 0x0;
     bus->address_space_mem = address_space_mem;
     bus->address_space_io = address_space_io;
+    bus->isol_as_mem = NULL;
+    bus->isol_as_io = NULL;
     bus->flags |= PCI_BUS_IS_ROOT;
 
     /* host bridge */
@@ -2676,6 +2678,16 @@  MemoryRegion *pci_address_space_io(PCIDevice *dev)
     return pci_get_bus(dev)->address_space_io;
 }
 
+AddressSpace *pci_isol_as_mem(PCIDevice *dev)
+{
+    return pci_get_bus(dev)->isol_as_mem;
+}
+
+AddressSpace *pci_isol_as_io(PCIDevice *dev)
+{
+    return pci_get_bus(dev)->isol_as_io;
+}
+
 static void pci_device_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *k = DEVICE_CLASS(klass);
@@ -2699,6 +2711,7 @@  static void pci_device_class_base_init(ObjectClass *klass, void *data)
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
 {
+    AddressSpace *iommu_as = NULL;
     PCIBus *bus = pci_get_bus(dev);
     PCIBus *iommu_bus = bus;
     uint8_t devfn = dev->devfn;
@@ -2745,6 +2758,10 @@  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
     if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
         return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
     }
+    iommu_as = pci_isol_as_mem(dev);
+    if (iommu_as) {
+        return iommu_as;
+    }
     return &address_space_memory;
 }
 
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index da34c8ebcd..98366768d2 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -383,6 +383,11 @@  void pci_bridge_initfn(PCIDevice *dev, const char *typename)
     sec_bus->address_space_io = &br->address_space_io;
     memory_region_init(&br->address_space_io, OBJECT(br), "pci_bridge_io",
                        4 * GiB);
+
+    /* This PCI bridge puts the sec_bus in its parent's address space */
+    sec_bus->isol_as_mem = pci_isol_as_mem(dev);
+    sec_bus->isol_as_io = pci_isol_as_io(dev);
+
     br->windows = pci_bridge_region_init(br);
     QLIST_INIT(&sec_bus->child);
     QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling);