
[v4,11/14] vfio-user: IOMMU support for remote device

Message ID acae079dec4261d762311b86a0e699ba9ad79737.1639549843.git.jag.raman@oracle.com (mailing list archive)
State New, archived
Series vfio-user server in QEMU

Commit Message

Jag Raman Dec. 15, 2021, 3:35 p.m. UTC
Assign separate address space for each device in the remote processes.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/hw/pci/pci.h      |   2 +
 include/hw/remote/iommu.h |  24 ++++++++
 hw/pci/pci.c              |   2 +-
 hw/remote/iommu.c         | 117 ++++++++++++++++++++++++++++++++++++++
 hw/remote/machine.c       |   5 ++
 hw/remote/vfio-user-obj.c |  20 ++++++-
 MAINTAINERS               |   2 +
 hw/remote/meson.build     |   1 +
 8 files changed, 169 insertions(+), 4 deletions(-)
 create mode 100644 include/hw/remote/iommu.h
 create mode 100644 hw/remote/iommu.c

Comments

Stefan Hajnoczi Dec. 16, 2021, 2:40 p.m. UTC | #1
On Wed, Dec 15, 2021 at 10:35:35AM -0500, Jagannathan Raman wrote:
> Assign separate address space for each device in the remote processes.

If I understand correctly this isn't really an IOMMU. It's abusing the
IOMMU APIs to create isolated address spaces for each device. This way
memory regions added by the vfio-user client do not conflict when there
are multiple vfio-user servers.

Calling pci_root_bus_new() and keeping one PCI bus per VfuObject might
be a cleaner approach:
- Lets you isolate both PCI Memory Space and IO Space.
- Isolates the PCIDevices and their addresses on the bus.
- Isolates irqs.
- No more need to abuse the IOMMU API.

I might be missing something because I haven't investigated how to do
this myself.
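
To make that concrete, a rough sketch of how a per-VfuObject root bus could
be created (the helper name, parent device, and region sizes below are only
illustrative assumptions, not code from this series):

  /* Illustrative sketch only, not a proposal for exact code */
  static PCIBus *vfu_object_create_bus(VfuObject *o, DeviceState *parent)
  {
      MemoryRegion *mem = g_new0(MemoryRegion, 1);
      MemoryRegion *io = g_new0(MemoryRegion, 1);

      /* private PCI Memory Space and IO Space for this vfio-user server */
      memory_region_init(mem, OBJECT(o), "vfu-pci-mem", UINT64_MAX);
      memory_region_init(io, OBJECT(o), "vfu-pci-io", 0x10000);

      /*
       * A root bus of its own isolates this server's BARs, devfns, and
       * irqs from every other vfio-user server in the process.
       */
      return pci_root_bus_new(parent, "vfu-pci-bus", mem, io,
                              0 /* devfn_min */, TYPE_PCI_BUS);
  }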

> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/pci/pci.h      |   2 +
>  include/hw/remote/iommu.h |  24 ++++++++
>  hw/pci/pci.c              |   2 +-
>  hw/remote/iommu.c         | 117 ++++++++++++++++++++++++++++++++++++++
>  hw/remote/machine.c       |   5 ++
>  hw/remote/vfio-user-obj.c |  20 ++++++-
>  MAINTAINERS               |   2 +
>  hw/remote/meson.build     |   1 +
>  8 files changed, 169 insertions(+), 4 deletions(-)
>  create mode 100644 include/hw/remote/iommu.h
>  create mode 100644 hw/remote/iommu.c
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 5c4016b995..f2fc2d5375 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -734,6 +734,8 @@ void lsi53c8xx_handle_legacy_cmdline(DeviceState *lsi_dev);
>  qemu_irq pci_allocate_irq(PCIDevice *pci_dev);
>  void pci_set_irq(PCIDevice *pci_dev, int level);
>  
> +void pci_init_bus_master(PCIDevice *pci_dev);

This function isn't used in this patch. Why make it public?

> +
>  static inline void pci_irq_assert(PCIDevice *pci_dev)
>  {
>      pci_set_irq(pci_dev, 1);
> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
> new file mode 100644
> index 0000000000..42ce0ca383
> --- /dev/null
> +++ b/include/hw/remote/iommu.h
> @@ -0,0 +1,24 @@
> +/*
> + * IOMMU for remote device
> + *
> + * Copyright © 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef REMOTE_IOMMU_H
> +#define REMOTE_IOMMU_H
> +
> +#include "hw/pci/pci_bus.h"
> +
> +void remote_iommu_free(PCIDevice *pci_dev);
> +
> +void remote_iommu_init(void);
> +
> +void remote_iommu_set(PCIBus *bus);
> +
> +MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev);
> +
> +#endif
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 4a84e478ce..57d561cc03 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -95,7 +95,7 @@ static const VMStateDescription vmstate_pcibus = {
>      }
>  };
>  
> -static void pci_init_bus_master(PCIDevice *pci_dev)
> +void pci_init_bus_master(PCIDevice *pci_dev)
>  {
>      AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
>  
> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
> new file mode 100644
> index 0000000000..30c866badb
> --- /dev/null
> +++ b/hw/remote/iommu.c
> @@ -0,0 +1,117 @@
> +/*
> + * Remote IOMMU
> + *
> + * Copyright © 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "hw/remote/iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "exec/memory.h"
> +#include "exec/address-spaces.h"
> +#include "trace.h"
> +
> +struct VFUIOMMU {
> +    AddressSpace  as;
> +    MemoryRegion  mr;

I guess this is the root MemoryRegion container? Calling it "root" or
"root_mr" instead of "mr" would make that clearer.

> +};
> +
> +typedef struct VFUPciBus {

There is no uppercase/lowercase consistency between VfuObject vs
VFUIOMMU vs VFUPciBus. Although the coding standard doesn't dictate ABC
vs Abc, please be consistent. I suggest following the VfuObject
convention started in the previous patches. The names would be VfuIommu
and VfuPciBus.

> +    PCIBus           *bus;
> +    struct VFUIOMMU  *iommu[];
> +} VFUPciBus;
> +
> +GHashTable *remote_as_table;
> +
> +static AddressSpace *remote_iommu_get_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    VFUPciBus *vfu_pci_bus = NULL;
> +    struct VFUIOMMU *iommu = NULL;
> +
> +    if (!remote_as_table) {
> +        return &address_space_memory;
> +    }
> +
> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
> +
> +    if (!vfu_pci_bus) {
> +        vfu_pci_bus = g_malloc0(sizeof(VFUPciBus));
> +        vfu_pci_bus->bus = bus;
> +        g_hash_table_insert(remote_as_table, bus, vfu_pci_bus);
> +    }
> +
> +    iommu = vfu_pci_bus->iommu[devfn];
> +
> +    if (!iommu) {
> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", devfn);
> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", devfn);
> +
> +        iommu = g_malloc0(sizeof(struct VFUIOMMU));
> +
> +        memory_region_init(&iommu->mr, NULL, mr_name, UINT64_MAX);
> +        address_space_init(&iommu->as, &iommu->mr, as_name);
> +
> +        vfu_pci_bus->iommu[devfn] = iommu;
> +    }
> +
> +    return &iommu->as;
> +}
> +
> +void remote_iommu_free(PCIDevice *pci_dev)
> +{
> +    VFUPciBus *vfu_pci_bus = NULL;
> +    struct VFUIOMMU *iommu = NULL;
> +
> +    if (!remote_as_table) {
> +        return;
> +    }
> +
> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, pci_get_bus(pci_dev));
> +
> +    if (!vfu_pci_bus) {
> +        return;
> +    }
> +
> +    iommu = vfu_pci_bus->iommu[pci_dev->devfn];
> +
> +    vfu_pci_bus->iommu[pci_dev->devfn] = NULL;
> +
> +    if (iommu) {
> +        memory_region_unref(&iommu->mr);
> +        address_space_destroy(&iommu->as);
> +        g_free(iommu);
> +    }
> +}
> +
> +void remote_iommu_init(void)
> +{
> +    remote_as_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
> +}
> +
> +void remote_iommu_set(PCIBus *bus)
> +{
> +    pci_setup_iommu(bus, remote_iommu_get_as, NULL);
> +}
> +
> +MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev)
> +{
> +    PCIBus *bus = pci_get_bus(pci_dev);
> +    VFUPciBus *vfu_pci_bus;
> +
> +    if (!remote_as_table) {
> +        return get_system_memory();
> +    }
> +
> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
> +    if (!vfu_pci_bus) {
> +        return get_system_memory();
> +    }
> +
> +    return &vfu_pci_bus->iommu[pci_dev->devfn]->mr;
> +}
> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
> index 952105eab5..023be0491e 100644
> --- a/hw/remote/machine.c
> +++ b/hw/remote/machine.c
> @@ -21,6 +21,7 @@
>  #include "qapi/error.h"
>  #include "hw/pci/pci_host.h"
>  #include "hw/remote/iohub.h"
> +#include "hw/remote/iommu.h"
>  
>  static void remote_machine_init(MachineState *machine)
>  {
> @@ -52,6 +53,10 @@ static void remote_machine_init(MachineState *machine)
>  
>      remote_iohub_init(&s->iohub);
>  
> +    remote_iommu_init();
> +
> +    remote_iommu_set(pci_host->bus);
> +
>      pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>                   &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>  }
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 9399e87cbe..ae375e69b9 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -49,6 +49,7 @@
>  #include "hw/qdev-core.h"
>  #include "hw/pci/pci.h"
>  #include "qemu/timer.h"
> +#include "hw/remote/iommu.h"
>  
>  #define TYPE_VFU_OBJECT "x-vfio-user-server"
>  OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
> @@ -210,6 +211,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
>  
>  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  {
> +    VfuObject *o = vfu_get_private(vfu_ctx);
>      MemoryRegion *subregion = NULL;
>      g_autofree char *name = NULL;
>      static unsigned int suffix;
> @@ -226,14 +228,15 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>      memory_region_init_ram_ptr(subregion, NULL, name,
>                                 iov->iov_len, info->vaddr);
>  
> -    memory_region_add_subregion(get_system_memory(), (hwaddr)iov->iov_base,
> -                                subregion);
> +    memory_region_add_subregion(remote_iommu_get_ram(o->pci_dev),
> +                                (hwaddr)iov->iov_base, subregion);
>  
>      trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
>  }
>  
>  static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  {
> +    VfuObject *o = vfu_get_private(vfu_ctx);
>      MemoryRegion *mr = NULL;
>      ram_addr_t offset;
>  
> @@ -242,7 +245,7 @@ static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>          return;
>      }
>  
> -    memory_region_del_subregion(get_system_memory(), mr);
> +    memory_region_del_subregion(remote_iommu_get_ram(o->pci_dev), mr);
>  
>      object_unparent((OBJECT(mr)));
>  
> @@ -320,6 +323,7 @@ static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
>   */
>  static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>  {
> +    VfuObject *o = vfu_get_private(vfu_ctx);
>      int i;
>  
>      for (i = 0; i < PCI_NUM_REGIONS; i++) {
> @@ -332,6 +336,12 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>                           vfu_object_bar_handlers[i],
>                           VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
>  
> +        if ((o->pci_dev->io_regions[i].type & PCI_BASE_ADDRESS_SPACE) == 0) {
> +            memory_region_unref(o->pci_dev->io_regions[i].address_space);
> +            o->pci_dev->io_regions[i].address_space =
> +                remote_iommu_get_ram(o->pci_dev);
> +        }

This looks hacky. If you create a separate PCIHost for each device
instead then the BARs will be created in the MemoryRegion (confusingly
named "address_space" in the PCI code) of your choosing.

Also, why is PCI Memory Space isolated via VFUIOMMU but PCI IO Space is
not?
Jag Raman Dec. 17, 2021, 8 p.m. UTC | #2
> On Dec 16, 2021, at 9:40 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Wed, Dec 15, 2021 at 10:35:35AM -0500, Jagannathan Raman wrote:
>> Assign separate address space for each device in the remote processes.
> 
> If I understand correctly this isn't really an IOMMU. It's abusing the
> IOMMU APIs to create isolated address spaces for each device. This way
> memory regions added by the vfio-user client do not conflict when there
> are multiple vfio-user servers.

Like you already figured out, having an isolated DMA address space alone is not
sufficient for this application; we also needed to isolate the sysmem/RAM address
space. As such, the available IOMMU APIs alone were not sufficient, so we had
to improvise.

> 
> Calling pci_root_bus_new() and keeping one PCI bus per VfuObject might
> be a cleaner approach:
> - Lets you isolate both PCI Memory Space and IO Space.
> - Isolates the PCIDevices and their addresses on the bus.
> - Isolates irqs.
> - No more need to abuse the IOMMU API.

I believe we would still need to have an IOMMU. It’s because, devices use the
pci_dma_read()/_write() functions. These functions look up the address in DMA
address space (via pci_get_address_space() -> PCIDevice->bus_master_as ->
PCIDevice->bus_master_enable_region -> PCIDevice->bus_master_container_region).
 bus_master_enable_region and bus_master_container_region are effectively aliases
to the DMA address space - without an IOMMU, the dma_as would be the shared
global sysmem/RAM space (address_space_mem, please see pci_init_bus_master())
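
Roughly, paraphrased from hw/pci for illustration (the exact signatures in the
tree may differ slightly):

  static inline AddressSpace *pci_get_address_space(PCIDevice *dev)
  {
      return &dev->bus_master_as;   /* the per-device DMA address space */
  }

  /* pci_dma_read()/pci_dma_write() funnel through something like: */
  dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);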

> 
> I might be missing something because I haven't investigated how to do
> this myself.
> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/pci/pci.h      |   2 +
>> include/hw/remote/iommu.h |  24 ++++++++
>> hw/pci/pci.c              |   2 +-
>> hw/remote/iommu.c         | 117 ++++++++++++++++++++++++++++++++++++++
>> hw/remote/machine.c       |   5 ++
>> hw/remote/vfio-user-obj.c |  20 ++++++-
>> MAINTAINERS               |   2 +
>> hw/remote/meson.build     |   1 +
>> 8 files changed, 169 insertions(+), 4 deletions(-)
>> create mode 100644 include/hw/remote/iommu.h
>> create mode 100644 hw/remote/iommu.c
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 5c4016b995..f2fc2d5375 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -734,6 +734,8 @@ void lsi53c8xx_handle_legacy_cmdline(DeviceState *lsi_dev);
>> qemu_irq pci_allocate_irq(PCIDevice *pci_dev);
>> void pci_set_irq(PCIDevice *pci_dev, int level);
>> 
>> +void pci_init_bus_master(PCIDevice *pci_dev);
> 
> This function isn't used in this patch. Why make it public?

We were investigating updating the bus’s address space before the PCI device
initialized, but we dropped it as that would be an invasive change. This is cruft
from that effort, sorry - will remove.

> 
>> +
>> static inline void pci_irq_assert(PCIDevice *pci_dev)
>> {
>>     pci_set_irq(pci_dev, 1);
>> diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
>> new file mode 100644
>> index 0000000000..42ce0ca383
>> --- /dev/null
>> +++ b/include/hw/remote/iommu.h
>> @@ -0,0 +1,24 @@
>> +/*
>> + * IOMMU for remote device
>> + *
>> + * Copyright © 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef REMOTE_IOMMU_H
>> +#define REMOTE_IOMMU_H
>> +
>> +#include "hw/pci/pci_bus.h"
>> +
>> +void remote_iommu_free(PCIDevice *pci_dev);
>> +
>> +void remote_iommu_init(void);
>> +
>> +void remote_iommu_set(PCIBus *bus);
>> +
>> +MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev);
>> +
>> +#endif
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index 4a84e478ce..57d561cc03 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -95,7 +95,7 @@ static const VMStateDescription vmstate_pcibus = {
>>     }
>> };
>> 
>> -static void pci_init_bus_master(PCIDevice *pci_dev)
>> +void pci_init_bus_master(PCIDevice *pci_dev)
>> {
>>     AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
>> 
>> diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
>> new file mode 100644
>> index 0000000000..30c866badb
>> --- /dev/null
>> +++ b/hw/remote/iommu.c
>> @@ -0,0 +1,117 @@
>> +/*
>> + * Remote IOMMU
>> + *
>> + * Copyright © 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +
>> +#include "hw/remote/iommu.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "exec/memory.h"
>> +#include "exec/address-spaces.h"
>> +#include "trace.h"
>> +
>> +struct VFUIOMMU {
>> +    AddressSpace  as;
>> +    MemoryRegion  mr;
> 
> I guess this is the root MemoryRegion container? Calling it "root" or
> "root_mr" instead of "mr" would make that clearer.

OK

> 
>> +};
>> +
>> +typedef struct VFUPciBus {
> 
> There is no uppercase/lowercase consistency between VfuObject vs
> VFUIOMMU vs VFUPciBus. Although the coding standard doesn't dictate ABC
> vs Abc, please be consistent. I suggest following the VfuObject
> convention started in the previous patches. The names would be VfuIommu
> and VfuPciBus.

Sounds good, thank you!

> 
>> +    PCIBus           *bus;
>> +    struct VFUIOMMU  *iommu[];
>> +} VFUPciBus;
>> +
>> +GHashTable *remote_as_table;
>> +
>> +static AddressSpace *remote_iommu_get_as(PCIBus *bus, void *opaque, int devfn)
>> +{
>> +    VFUPciBus *vfu_pci_bus = NULL;
>> +    struct VFUIOMMU *iommu = NULL;
>> +
>> +    if (!remote_as_table) {
>> +        return &address_space_memory;
>> +    }
>> +
>> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
>> +
>> +    if (!vfu_pci_bus) {
>> +        vfu_pci_bus = g_malloc0(sizeof(VFUPciBus));
>> +        vfu_pci_bus->bus = bus;
>> +        g_hash_table_insert(remote_as_table, bus, vfu_pci_bus);
>> +    }
>> +
>> +    iommu = vfu_pci_bus->iommu[devfn];
>> +
>> +    if (!iommu) {
>> +        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", devfn);
>> +        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", devfn);
>> +
>> +        iommu = g_malloc0(sizeof(struct VFUIOMMU));
>> +
>> +        memory_region_init(&iommu->mr, NULL, mr_name, UINT64_MAX);
>> +        address_space_init(&iommu->as, &iommu->mr, as_name);
>> +
>> +        vfu_pci_bus->iommu[devfn] = iommu;
>> +    }
>> +
>> +    return &iommu->as;
>> +}
>> +
>> +void remote_iommu_free(PCIDevice *pci_dev)
>> +{
>> +    VFUPciBus *vfu_pci_bus = NULL;
>> +    struct VFUIOMMU *iommu = NULL;
>> +
>> +    if (!remote_as_table) {
>> +        return;
>> +    }
>> +
>> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, pci_get_bus(pci_dev));
>> +
>> +    if (!vfu_pci_bus) {
>> +        return;
>> +    }
>> +
>> +    iommu = vfu_pci_bus->iommu[pci_dev->devfn];
>> +
>> +    vfu_pci_bus->iommu[pci_dev->devfn] = NULL;
>> +
>> +    if (iommu) {
>> +        memory_region_unref(&iommu->mr);
>> +        address_space_destroy(&iommu->as);
>> +        g_free(iommu);
>> +    }
>> +}
>> +
>> +void remote_iommu_init(void)
>> +{
>> +    remote_as_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
>> +}
>> +
>> +void remote_iommu_set(PCIBus *bus)
>> +{
>> +    pci_setup_iommu(bus, remote_iommu_get_as, NULL);
>> +}
>> +
>> +MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev)
>> +{
>> +    PCIBus *bus = pci_get_bus(pci_dev);
>> +    VFUPciBus *vfu_pci_bus;
>> +
>> +    if (!remote_as_table) {
>> +        return get_system_memory();
>> +    }
>> +
>> +    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
>> +    if (!vfu_pci_bus) {
>> +        return get_system_memory();
>> +    }
>> +
>> +    return &vfu_pci_bus->iommu[pci_dev->devfn]->mr;
>> +}
>> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
>> index 952105eab5..023be0491e 100644
>> --- a/hw/remote/machine.c
>> +++ b/hw/remote/machine.c
>> @@ -21,6 +21,7 @@
>> #include "qapi/error.h"
>> #include "hw/pci/pci_host.h"
>> #include "hw/remote/iohub.h"
>> +#include "hw/remote/iommu.h"
>> 
>> static void remote_machine_init(MachineState *machine)
>> {
>> @@ -52,6 +53,10 @@ static void remote_machine_init(MachineState *machine)
>> 
>>     remote_iohub_init(&s->iohub);
>> 
>> +    remote_iommu_init();
>> +
>> +    remote_iommu_set(pci_host->bus);
>> +
>>     pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>>                  &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>> }
>> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
>> index 9399e87cbe..ae375e69b9 100644
>> --- a/hw/remote/vfio-user-obj.c
>> +++ b/hw/remote/vfio-user-obj.c
>> @@ -49,6 +49,7 @@
>> #include "hw/qdev-core.h"
>> #include "hw/pci/pci.h"
>> #include "qemu/timer.h"
>> +#include "hw/remote/iommu.h"
>> 
>> #define TYPE_VFU_OBJECT "x-vfio-user-server"
>> OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
>> @@ -210,6 +211,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
>> 
>> static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>> {
>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>>     MemoryRegion *subregion = NULL;
>>     g_autofree char *name = NULL;
>>     static unsigned int suffix;
>> @@ -226,14 +228,15 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>>     memory_region_init_ram_ptr(subregion, NULL, name,
>>                                iov->iov_len, info->vaddr);
>> 
>> -    memory_region_add_subregion(get_system_memory(), (hwaddr)iov->iov_base,
>> -                                subregion);
>> +    memory_region_add_subregion(remote_iommu_get_ram(o->pci_dev),
>> +                                (hwaddr)iov->iov_base, subregion);
>> 
>>     trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
>> }
>> 
>> static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>> {
>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>>     MemoryRegion *mr = NULL;
>>     ram_addr_t offset;
>> 
>> @@ -242,7 +245,7 @@ static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>>         return;
>>     }
>> 
>> -    memory_region_del_subregion(get_system_memory(), mr);
>> +    memory_region_del_subregion(remote_iommu_get_ram(o->pci_dev), mr);
>> 
>>     object_unparent((OBJECT(mr)));
>> 
>> @@ -320,6 +323,7 @@ static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
>>  */
>> static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>> {
>> +    VfuObject *o = vfu_get_private(vfu_ctx);
>>     int i;
>> 
>>     for (i = 0; i < PCI_NUM_REGIONS; i++) {
>> @@ -332,6 +336,12 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>>                          vfu_object_bar_handlers[i],
>>                          VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
>> 
>> +        if ((o->pci_dev->io_regions[i].type & PCI_BASE_ADDRESS_SPACE) == 0) {
>> +            memory_region_unref(o->pci_dev->io_regions[i].address_space);
>> +            o->pci_dev->io_regions[i].address_space =
>> +                remote_iommu_get_ram(o->pci_dev);
>> +        }
> 
> This looks hacky. If you create a separate PCIHost for each device
> instead then the BARs will be created in the MemoryRegion (confusingly
> named "address_space" in the PCI code) of your choosing.

I was also not very comfortable with this - added it very grudgingly out of
necessity. Thank god this can go away with a separate bus for each device.

> 
> Also, why is PCI Memory Space isolated via VFUIOMMU but PCI IO Space is
> not?

If I understand correctly, the IO address space translates sysmem address to
direct device access (such as I2C). Once we are inside a device, we already
have access to all parts of the device (unlike RAM which sits outside the device).
So I didn’t think the device would go via the IOMMU to access IO. Also, I didn’t
see any other IOMMU translating IO address space accesses.

Thank you!
--
Jag
Stefan Hajnoczi Dec. 20, 2021, 2:36 p.m. UTC | #3
On Fri, Dec 17, 2021 at 08:00:35PM +0000, Jag Raman wrote:
> > On Dec 16, 2021, at 9:40 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Wed, Dec 15, 2021 at 10:35:35AM -0500, Jagannathan Raman wrote:
> >> Assign separate address space for each device in the remote processes.
> > 
> > If I understand correctly this isn't really an IOMMU. It's abusing the
> > IOMMU APIs to create isolated address spaces for each device. This way
> > memory regions added by the vfio-user client do not conflict when there
> > are multiple vfio-user servers.
> 
> Like you already figured out, having an isolated DMA address space alone is not
> sufficient for this application; we also needed to isolate the sysmem/RAM address
> space. As such, the available IOMMU APIs alone were not sufficient, so we had
> to improvise.
> 
> > 
> > Calling pci_root_bus_new() and keeping one PCI bus per VfuObject might
> > be a cleaner approach:
> > - Lets you isolate both PCI Memory Space and IO Space.
> > - Isolates the PCIDevices and their addresses on the bus.
> > - Isolates irqs.
> > - No more need to abuse the IOMMU API.
> 
> I believe we would still need to have an IOMMU. It’s because, devices use the
> pci_dma_read()/_write() functions. These functions look up the address in DMA
> address space (via pci_get_address_space() -> PCIDevice->bus_master_as ->
> PCIDevice->bus_master_enable_region -> PCIDevice->bus_master_container_region).
>  bus_master_enable_region and bus_master_container_region are effectively aliases
> to the DMA address space - without an IOMMU, the dma_as would be the shared
> global sysmem/RAM space (address_space_mem, please see pci_init_bus_master())

Good point, that code assumes there is a global address space. Creating
a fake IOMMU works around that assumption but it seems cleaner to
eliminate it:

  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
  {
      ...
      if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
          return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
      }
      return &address_space_memory;
              ^^^^^^^^^^^^^^^^^^^^

When creating a PCI root bus an AddressSpace argument could be provided,
just like it already does for the address_space_memory and
address_space_io MemoryRegions. Then the hardcoded return can be
changed to something like:

  return bus->dma_address_space;
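
Sketching that out (the dma_address_space field and parameter below are
hypothetical, not existing QEMU API):

  PCIBus *pci_root_bus_new(DeviceState *parent, const char *name,
                           MemoryRegion *address_space_mem,
                           MemoryRegion *address_space_io,
                           AddressSpace *dma_address_space, /* new */
                           uint8_t devfn_min, const char *typename);

  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
  {
      ...
      /* fall back to the bus's own DMA space instead of the global one */
      return bus->dma_address_space ? bus->dma_address_space
                                    : &address_space_memory;
  }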

> >> @@ -332,6 +336,12 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
> >>                          vfu_object_bar_handlers[i],
> >>                          VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
> >> 
> >> +        if ((o->pci_dev->io_regions[i].type & PCI_BASE_ADDRESS_SPACE) == 0) {
> >> +            memory_region_unref(o->pci_dev->io_regions[i].address_space);
> >> +            o->pci_dev->io_regions[i].address_space =
> >> +                remote_iommu_get_ram(o->pci_dev);
> >> +        }
> > 
> > This looks hacky. If you create a separate PCIHost for each device
> > instead then the BARs will be created in the MemoryRegion (confusingly
> > named "address_space" in the PCI code) of your choosing.
> 
> I was also not very comfortable with this - added it very grudgingly out of
> necessity. Thank god this can go away with a separate bus for each device.

I talked to Kevin Wolf about having separate busses. qdev currently
requires each DeviceState to have a parent bus and each bus must have a
parent DeviceState. There is only one exception: a special check that
allows the global system bus (sysbus_get_default()) to be created
without a parent DeviceState.

This restriction probably needs to be loosened in order to support an
isolated PCIHost for each vfio-user server. The challenge is that
qdev_find_recursive() and monitor commands like device_add currently
only search the global system bus. Maybe new syntax is needed for the
multiple root bus case or the behavior of existing monitor commands
needs to be understood and extended without breaking anything.

> > 
> > Also, why is PCI Memory Space isolated via VFUIOMMU but PCI IO Space is
> > not?
> 
> If I understand correctly, the IO address space translates sysmem address to
> direct device access (such as I2C). Once we are inside a device, we already
> have access to all parts of the device (unlike RAM which sits outside the device).
>> So I didn’t think the device would go via the IOMMU to access IO. Also, I didn’t
>> see any other IOMMU translating IO address space accesses.

I reviewed how BARs are configured with VFIO:

1. When the guest writes to the vfio-pci PCIDevice's Configuration Space
   the write is forwarded to the VFIO device (i.e. vfio-user or VFIO
   kernel ioctl).

2. The vfio-user server receives the Configuration Space write and
   forwards it to pci_dev (the PCIDevice we're serving up). BAR mappings
   are updated in the vfio-user server so the BAR MemoryRegions are
   mapped/unmapped at the locations given by the guest.

This applies for both Memory and IO Space accesses.

Because this patch series does not isolate IO Space between VfuObject
instances the MemoryRegions will collide when two guests map IO Space
BARs of different devices at the same IO Space address. In other words,
vfu_object_bar_rw() uses the global address_space_io and that means
collisions can occur.
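
A contrived illustration (device names and the port number are assumed) of
how two servers' IO BARs end up in the one global IO Space:

  /* server A: its client maps an IO BAR of dev_a at IO port 0xc000 */
  memory_region_add_subregion_overlap(pci_address_space_io(dev_a),
                                      0xc000, dev_a_io_bar, 1);

  /* server B: a different client independently maps dev_b's IO BAR at the
   * same port; both land in the single global address_space_io, so the
   * two regions collide */
  memory_region_add_subregion_overlap(pci_address_space_io(dev_b),
                                      0xc000, dev_b_io_bar, 1);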

Stefan
Jag Raman Dec. 21, 2021, 4:32 a.m. UTC | #4
> On Dec 20, 2021, at 9:36 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Fri, Dec 17, 2021 at 08:00:35PM +0000, Jag Raman wrote:
>>> On Dec 16, 2021, at 9:40 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Wed, Dec 15, 2021 at 10:35:35AM -0500, Jagannathan Raman wrote:
>>>> Assign separate address space for each device in the remote processes.
>>> 
>>> If I understand correctly this isn't really an IOMMU. It's abusing the
>>> IOMMU APIs to create isolated address spaces for each device. This way
>>> memory regions added by the vfio-user client do not conflict when there
>>> are multiple vfio-user servers.
>> 
>> Like you already figured out, having an isolated DMA address space alone is not
>> sufficient for this application; we also needed to isolate the sysmem/RAM address
>> space. As such, the available IOMMU APIs alone were not sufficient, so we had
>> to improvise.
>> 
>>> 
>>> Calling pci_root_bus_new() and keeping one PCI bus per VfuObject might
>>> be a cleaner approach:
>>> - Lets you isolate both PCI Memory Space and IO Space.
>>> - Isolates the PCIDevices and their addresses on the bus.
>>> - Isolates irqs.
>>> - No more need to abuse the IOMMU API.
>> 
>> I believe we would still need to have an IOMMU. It’s because, devices use the
>> pci_dma_read()/_write() functions. These functions look up the address in DMA
>> address space (via pci_get_address_space() -> PCIDevice->bus_master_as ->
>> PCIDevice->bus_master_enable_region -> PCIDevice->bus_master_container_region).
>> bus_master_enable_region and bus_master_container_region are effectively aliases
>> to the DMA address space - without an IOMMU, the dma_as would be the shared
>> global sysmem/RAM space (address_space_mem, please see pci_init_bus_master())
> 
> Good point, that code assumes there is a global address space. Creating
> a fake IOMMU works around that assumption but it seems cleaner to
> eliminate it:
> 
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>  {
>      ...
>      if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
>          return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
>      }
>      return &address_space_memory;
>              ^^^^^^^^^^^^^^^^^^^^
> 
> When creating a PCI root bus an AddressSpace argument could be provided,
> just like it already does for the address_space_memory and
> address_space_io MemoryRegions. Then the hardcoded return can be
> changed to something like:
> 
>  return bus->dma_address_space;

This approach should work when we are using separate PCIBus for each PCIDevice.

> 
>>>> @@ -332,6 +336,12 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>>>>                         vfu_object_bar_handlers[i],
>>>>                         VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
>>>> 
>>>> +        if ((o->pci_dev->io_regions[i].type & PCI_BASE_ADDRESS_SPACE) == 0) {
>>>> +            memory_region_unref(o->pci_dev->io_regions[i].address_space);
>>>> +            o->pci_dev->io_regions[i].address_space =
>>>> +                remote_iommu_get_ram(o->pci_dev);
>>>> +        }
>>> 
>>> This looks hacky. If you create a separate PCIHost for each device
>>> instead then the BARs will be created in the MemoryRegion (confusingly
>>> named "address_space" in the PCI code) of your choosing.
>> 
>> I was also not very comfortable with this - added it very grudgingly out of
>> necessity. Thank god this can go away with a separate bus for each device.
> 
> I talked to Kevin Wolf about having separate busses. qdev currently
> requires each DeviceState to have a parent bus and each bus must have a
> parent DeviceState. There is only one exception: a special check that
> allows the global system bus (sysbus_get_default()) to be created
> without a parent DeviceState.
> 
> This restriction probably needs to be loosened in order to support an
> isolated PCIHost for each vfio-user server. The challenge is that
> qdev_find_recursive() and monitor commands like device_add currently
> only search the global system bus. Maybe new syntax is needed for the
> multiple root bus case or the behavior of existing monitor commands
> needs to be understood and extended without breaking anything.

Lemme check if it’s possible to create multiple PCIBuses within the global
system bus, something similar to what PCI expansion cards are doing. That
would help avoid the complexities you just mentioned.
> 
>>> 
>>> Also, why is PCI Memory Space isolated via VFUIOMMU but PCI IO Space is
>>> not?
>> 
>> If I understand correctly, the IO address space translates sysmem address to
>> direct device access (such as I2C). Once we are inside a device, we already
>> have access to all parts of the device (unlike RAM which sits outside the device).
>> So I didn’t think the device would go via the IOMMU to access IO. Also, I didn’t
>> see any other IOMMU translating IO address space accesses.
> 
> I reviewed how BARs are configured with VFIO:
> 
> 1. When the guest writes to the vfio-pci PCIDevice's Configuration Space
>   the write is forwarded to the VFIO device (i.e. vfio-user or VFIO
>   kernel ioctl).
> 
> 2. The vfio-user server receives the Configuration Space write and
>   forwards it to pci_dev (the PCIDevice we're serving up). BAR mappings
>   are updated in the vfio-user server so the BAR MemoryRegions are
>   mapped/unmapped at the locations given by the guest.
> 
> This applies for both Memory and IO Space accesses.
> 
> Because this patch series does not isolate IO Space between VfuObject
> instances the MemoryRegions will collide when two guests map IO Space
> BARs of different devices at the same IO Space address. In other words,
> vfu_object_bar_rw() uses the global address_space_io and that means
> collisions can occur.

I agree that a collision could occur from the CPU end. But I'm not sure the IOMMU
needs to translate IO space.

Thank you!
--
Jag

> 
> Stefan
Stefan Hajnoczi Jan. 6, 2022, 1:10 p.m. UTC | #5
On Tue, Dec 21, 2021 at 04:32:05AM +0000, Jag Raman wrote:
> > On Dec 20, 2021, at 9:36 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Fri, Dec 17, 2021 at 08:00:35PM +0000, Jag Raman wrote:
> >>> On Dec 16, 2021, at 9:40 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> On Wed, Dec 15, 2021 at 10:35:35AM -0500, Jagannathan Raman wrote:
> >>> Also, why is PCI Memory Space isolated via VFUIOMMU but PCI IO Space is
> >>> not?
> >> 
> >> If I understand correctly, the IO address space translates sysmem address to
> >> direct device access (such as I2C). Once we are inside a device, we already
> >> have access to all parts of the device (unlike RAM which sits outside the device).
> >> So I didn’t think the device would go via the IOMMU to access IO. Also, I didn’t
> >> see any other IOMMU translating IO address space accesses.
> > 
> > I reviewed how BARs are configured with VFIO:
> > 
> > 1. When the guest writes to the vfio-pci PCIDevice's Configuration Space
> >   the write is forwarded to the VFIO device (i.e. vfio-user or VFIO
> >   kernel ioctl).
> > 
> > 2. The vfio-user server receives the Configuration Space write and
> >   forwards it to pci_dev (the PCIDevice we're serving up). BAR mappings
> >   are updated in the vfio-user server so the BAR MemoryRegions are
> >   mapped/unmapped at the locations given by the guest.
> > 
> > This applies for both Memory and IO Space accesses.
> > 
> > Because this patch series does not isolate IO Space between VfuObject
> > instances the MemoryRegions will collide when two guests map IO Space
> > BARs of different devices at the same IO Space address. In other words,
> > vfu_object_bar_rw() uses the global address_space_io and that means
> > collisions can occur.
> 
> I agree that a collision could occur from the CPU end. But I'm not sure the IOMMU
> needs to translate IO space.

QEMU's IOMMUs do not translate IO Space addresses AFAIK.

IO Space just needs to be isolated between vfio-user server instances so
there is no collision when one client maps an IO Space BAR to the same
address as another client.

I think the cleanest way of achieving that is by creating a
per-vfio-user server PCI bus with an address_space_io MemoryRegion.

Stefan

Patch

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 5c4016b995..f2fc2d5375 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -734,6 +734,8 @@  void lsi53c8xx_handle_legacy_cmdline(DeviceState *lsi_dev);
 qemu_irq pci_allocate_irq(PCIDevice *pci_dev);
 void pci_set_irq(PCIDevice *pci_dev, int level);
 
+void pci_init_bus_master(PCIDevice *pci_dev);
+
 static inline void pci_irq_assert(PCIDevice *pci_dev)
 {
     pci_set_irq(pci_dev, 1);
diff --git a/include/hw/remote/iommu.h b/include/hw/remote/iommu.h
new file mode 100644
index 0000000000..42ce0ca383
--- /dev/null
+++ b/include/hw/remote/iommu.h
@@ -0,0 +1,24 @@ 
+/*
+ * IOMMU for remote device
+ *
+ * Copyright © 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOMMU_H
+#define REMOTE_IOMMU_H
+
+#include "hw/pci/pci_bus.h"
+
+void remote_iommu_free(PCIDevice *pci_dev);
+
+void remote_iommu_init(void);
+
+void remote_iommu_set(PCIBus *bus);
+
+MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev);
+
+#endif
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 4a84e478ce..57d561cc03 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -95,7 +95,7 @@  static const VMStateDescription vmstate_pcibus = {
     }
 };
 
-static void pci_init_bus_master(PCIDevice *pci_dev)
+void pci_init_bus_master(PCIDevice *pci_dev)
 {
     AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
 
diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
new file mode 100644
index 0000000000..30c866badb
--- /dev/null
+++ b/hw/remote/iommu.c
@@ -0,0 +1,117 @@ 
+/*
+ * Remote IOMMU
+ *
+ * Copyright © 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+struct VFUIOMMU {
+    AddressSpace  as;
+    MemoryRegion  mr;
+};
+
+typedef struct VFUPciBus {
+    PCIBus           *bus;
+    struct VFUIOMMU  *iommu[];
+} VFUPciBus;
+
+GHashTable *remote_as_table;
+
+static AddressSpace *remote_iommu_get_as(PCIBus *bus, void *opaque, int devfn)
+{
+    VFUPciBus *vfu_pci_bus = NULL;
+    struct VFUIOMMU *iommu = NULL;
+
+    if (!remote_as_table) {
+        return &address_space_memory;
+    }
+
+    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
+
+    if (!vfu_pci_bus) {
+        vfu_pci_bus = g_malloc0(sizeof(VFUPciBus));
+        vfu_pci_bus->bus = bus;
+        g_hash_table_insert(remote_as_table, bus, vfu_pci_bus);
+    }
+
+    iommu = vfu_pci_bus->iommu[devfn];
+
+    if (!iommu) {
+        g_autofree char *mr_name = g_strdup_printf("vfu-ram-%d", devfn);
+        g_autofree char *as_name = g_strdup_printf("vfu-as-%d", devfn);
+
+        iommu = g_malloc0(sizeof(struct VFUIOMMU));
+
+        memory_region_init(&iommu->mr, NULL, mr_name, UINT64_MAX);
+        address_space_init(&iommu->as, &iommu->mr, as_name);
+
+        vfu_pci_bus->iommu[devfn] = iommu;
+    }
+
+    return &iommu->as;
+}
+
+void remote_iommu_free(PCIDevice *pci_dev)
+{
+    VFUPciBus *vfu_pci_bus = NULL;
+    struct VFUIOMMU *iommu = NULL;
+
+    if (!remote_as_table) {
+        return;
+    }
+
+    vfu_pci_bus = g_hash_table_lookup(remote_as_table, pci_get_bus(pci_dev));
+
+    if (!vfu_pci_bus) {
+        return;
+    }
+
+    iommu = vfu_pci_bus->iommu[pci_dev->devfn];
+
+    vfu_pci_bus->iommu[pci_dev->devfn] = NULL;
+
+    if (iommu) {
+        memory_region_unref(&iommu->mr);
+        address_space_destroy(&iommu->as);
+        g_free(iommu);
+    }
+}
+
+void remote_iommu_init(void)
+{
+    remote_as_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
+}
+
+void remote_iommu_set(PCIBus *bus)
+{
+    pci_setup_iommu(bus, remote_iommu_get_as, NULL);
+}
+
+MemoryRegion *remote_iommu_get_ram(PCIDevice *pci_dev)
+{
+    PCIBus *bus = pci_get_bus(pci_dev);
+    VFUPciBus *vfu_pci_bus;
+
+    if (!remote_as_table) {
+        return get_system_memory();
+    }
+
+    vfu_pci_bus = g_hash_table_lookup(remote_as_table, bus);
+    if (!vfu_pci_bus) {
+        return get_system_memory();
+    }
+
+    return &vfu_pci_bus->iommu[pci_dev->devfn]->mr;
+}
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index 952105eab5..023be0491e 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -21,6 +21,7 @@ 
 #include "qapi/error.h"
 #include "hw/pci/pci_host.h"
 #include "hw/remote/iohub.h"
+#include "hw/remote/iommu.h"
 
 static void remote_machine_init(MachineState *machine)
 {
@@ -52,6 +53,10 @@  static void remote_machine_init(MachineState *machine)
 
     remote_iohub_init(&s->iohub);
 
+    remote_iommu_init();
+
+    remote_iommu_set(pci_host->bus);
+
     pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
                  &s->iohub, REMOTE_IOHUB_NB_PIRQS);
 }
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 9399e87cbe..ae375e69b9 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -49,6 +49,7 @@ 
 #include "hw/qdev-core.h"
 #include "hw/pci/pci.h"
 #include "qemu/timer.h"
+#include "hw/remote/iommu.h"
 
 #define TYPE_VFU_OBJECT "x-vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -210,6 +211,7 @@  static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
 
 static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 {
+    VfuObject *o = vfu_get_private(vfu_ctx);
     MemoryRegion *subregion = NULL;
     g_autofree char *name = NULL;
     static unsigned int suffix;
@@ -226,14 +228,15 @@  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
     memory_region_init_ram_ptr(subregion, NULL, name,
                                iov->iov_len, info->vaddr);
 
-    memory_region_add_subregion(get_system_memory(), (hwaddr)iov->iov_base,
-                                subregion);
+    memory_region_add_subregion(remote_iommu_get_ram(o->pci_dev),
+                                (hwaddr)iov->iov_base, subregion);
 
     trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
 }
 
 static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 {
+    VfuObject *o = vfu_get_private(vfu_ctx);
     MemoryRegion *mr = NULL;
     ram_addr_t offset;
 
@@ -242,7 +245,7 @@  static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
         return;
     }
 
-    memory_region_del_subregion(get_system_memory(), mr);
+    memory_region_del_subregion(remote_iommu_get_ram(o->pci_dev), mr);
 
     object_unparent((OBJECT(mr)));
 
@@ -320,6 +323,7 @@  static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
  */
 static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
 {
+    VfuObject *o = vfu_get_private(vfu_ctx);
     int i;
 
     for (i = 0; i < PCI_NUM_REGIONS; i++) {
@@ -332,6 +336,12 @@  static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
                          vfu_object_bar_handlers[i],
                          VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
 
+        if ((o->pci_dev->io_regions[i].type & PCI_BASE_ADDRESS_SPACE) == 0) {
+            memory_region_unref(o->pci_dev->io_regions[i].address_space);
+            o->pci_dev->io_regions[i].address_space =
+                remote_iommu_get_ram(o->pci_dev);
+        }
+
         trace_vfu_bar_register(i, pdev->io_regions[i].addr,
                                pdev->io_regions[i].size);
     }
@@ -490,6 +500,10 @@  static void vfu_object_finalize(Object *obj)
 
     o->device = NULL;
 
+    if (o->pci_dev) {
+        remote_iommu_free(o->pci_dev);
+    }
+
     o->pci_dev = NULL;
 
     if (!k->nr_devs && !k->daemon) {
diff --git a/MAINTAINERS b/MAINTAINERS
index b5eb306662..5dc67d79a1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3466,6 +3466,8 @@  F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
 F: subprojects/libvfio-user
 F: hw/remote/vfio-user-obj.c
+F: include/hw/remote/iommu.h
+F: hw/remote/iommu.c
 
 EBPF:
 M: Jason Wang <jasowang@redhat.com>
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index 534ac5df79..bcef83c8cc 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -6,6 +6,7 @@  remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c'))
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c'))
 
 remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: vfiouser)