diff mbox series

[RFC,15/18] vfio/iommufd: Implement iommufd backend

Message ID 20220414104710.28534-16-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio: Adopt iommufd | expand

Commit Message

Yi Liu April 14, 2022, 10:47 a.m. UTC
Add the iommufd backend. The IOMMUFD container class is implemented
based on the new /dev/iommu user API. This backend obviously depends
on CONFIG_IOMMUFD.

So far, the iommufd backend does not support live migration or
cache coherency because the host kernel lacks that support, meaning
that only a subset of the container class callbacks is implemented.

Co-authored-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 hw/vfio/as.c                         |   2 +-
 hw/vfio/iommufd.c                    | 545 +++++++++++++++++++++++++++
 hw/vfio/meson.build                  |   3 +
 hw/vfio/pci.c                        |  10 +
 hw/vfio/trace-events                 |  11 +
 include/hw/vfio/vfio-common.h        |  18 +
 include/hw/vfio/vfio-container-obj.h |   1 +
 7 files changed, 589 insertions(+), 1 deletion(-)
 create mode 100644 hw/vfio/iommufd.c

Comments

Jason Gunthorpe April 22, 2022, 2:58 p.m. UTC | #1
On Thu, Apr 14, 2022 at 03:47:07AM -0700, Yi Liu wrote:

> +static int vfio_get_devicefd(const char *sysfs_path, Error **errp)
> +{
> +    long int vfio_id = -1, ret = -ENOTTY;
> +    char *path, *tmp = NULL;
> +    DIR *dir;
> +    struct dirent *dent;
> +    struct stat st;
> +    gchar *contents;
> +    gsize length;
> +    int major, minor;
> +    dev_t vfio_devt;
> +
> +    path = g_strdup_printf("%s/vfio-device", sysfs_path);
> +    if (stat(path, &st) < 0) {
> +        error_setg_errno(errp, errno, "no such host device");
> +        goto out;
> +    }
> +
> +    dir = opendir(path);
> +    if (!dir) {
> +        error_setg_errno(errp, errno, "couldn't open dirrectory %s", path);
> +        goto out;
> +    }
> +
> +    while ((dent = readdir(dir))) {
> +        const char *end_name;
> +
> +        if (!strncmp(dent->d_name, "vfio", 4)) {
> +            ret = qemu_strtol(dent->d_name + 4, &end_name, 10, &vfio_id);
> +            if (ret) {
> +                error_setg(errp, "suspicious vfio* file in %s", path);
> +                goto out;
> +            }

Userspace shouldn't explode if there are different files here down the
road. Just search for the first match of vfio\d+ and there is no need
to parse out the vfio_id from the string. Only fail if no match is
found.

> +    tmp = g_strdup_printf("/dev/vfio/devices/vfio%ld", vfio_id);
> +    if (stat(tmp, &st) < 0) {
> +        error_setg_errno(errp, errno, "no such vfio device");
> +        goto out;
> +    }

And simply pass the string directly here, no need to parse out
vfio_id.

I also suggest falling back to using "/dev/char/%u:%u" if the above
does not exist which prevents "vfio/devices/vfio" from turning into
ABI.

It would be a good idea to make a general open_cdev function that does
all this work once the sysfs is found and cdev read out of it, all the
other vfio places can use it too.

> +static int iommufd_attach_device(VFIODevice *vbasedev, AddressSpace *as,
> +                                 Error **errp)
> +{
> +    VFIOContainer *bcontainer;
> +    VFIOIOMMUFDContainer *container;
> +    VFIOAddressSpace *space;
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    int ret, devfd, iommufd;
> +    uint32_t ioas_id;
> +    Error *err = NULL;
> +
> +    devfd = vfio_get_devicefd(vbasedev->sysfsdev, errp);
> +    if (devfd < 0) {
> +        return devfd;
> +    }
> +    vbasedev->fd = devfd;
> +
> +    space = vfio_get_address_space(as);
> +
> +    /* try to attach to an existing container in this space */
> +    QLIST_FOREACH(bcontainer, &space->containers, next) {
> +        if (!object_dynamic_cast(OBJECT(bcontainer),
> +                                 TYPE_VFIO_IOMMUFD_CONTAINER)) {
> +            continue;
> +        }
> +        container = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
> +        if (vfio_device_attach_container(vbasedev, container, &err)) {
> +            const char *msg = error_get_pretty(err);
> +
> +            trace_vfio_iommufd_fail_attach_existing_container(msg);
> +            error_free(err);
> +            err = NULL;
> +        } else {
> +            ret = vfio_ram_block_discard_disable(true);
> +            if (ret) {
> +                vfio_device_detach_container(vbasedev, container, &err);
> +                error_propagate(errp, err);
> +                vfio_put_address_space(space);
> +                close(vbasedev->fd);
> +                error_prepend(errp,
> +                              "Cannot set discarding of RAM broken (%d)", ret);
> +                return ret;
> +            }
> +            goto out;
> +        }
> +    }

?? this logic shouldn't be necessary, a single ioas always supports
all devices, userspace should never need to juggle multiple ioas's
unless it wants to have different address maps.

Something I would like to see confirmed here in qemu is that qemu can
track the hw pagetable id for each device it binds because we will
need that later to do dirty tracking and other things.

> +    /*
> +     * TODO: for now iommufd BE is on par with vfio iommu type1, so it's
> +     * fine to add the whole range as window. For SPAPR, below code
> +     * should be updated.
> +     */
> +    vfio_host_win_add(bcontainer, 0, (hwaddr)-1, 4096);

? Not sure what this is, but I don't expect any changes for SPAPR;
someday IOMMU_IOAS_IOVA_RANGES should be able to accurately report its
configuration.

I don't see IOMMU_IOAS_IOVA_RANGES called at all, that seems like a
problem..

(and note that IOVA_RANGES changes with every device attached to the IOAS)

Jason
Alex Williamson April 22, 2022, 9:33 p.m. UTC | #2
On Fri, 22 Apr 2022 11:58:15 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> I don't see IOMMU_IOAS_IOVA_RANGES called at all, that seems like a
> problem..

Not as much as you might think.  Note that you also won't find QEMU
testing VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE in the QEMU vfio-pci
driver either.  The vfio-nvme driver does because it has control of the
address space it chooses to use, but for vfio-pci the address space is
dictated by the VM and there's not a lot of difference between knowing
in advance that a mapping conflicts with a reserved range or just
trying to add the mapping and taking appropriate action if it fails.
Thanks,

Alex
Yi Liu April 26, 2022, 9:55 a.m. UTC | #3
Hi Jason,

On 2022/4/22 22:58, Jason Gunthorpe wrote:
> On Thu, Apr 14, 2022 at 03:47:07AM -0700, Yi Liu wrote:
> 
>> +static int vfio_get_devicefd(const char *sysfs_path, Error **errp)
>> +{
>> +    long int vfio_id = -1, ret = -ENOTTY;
>> +    char *path, *tmp = NULL;
>> +    DIR *dir;
>> +    struct dirent *dent;
>> +    struct stat st;
>> +    gchar *contents;
>> +    gsize length;
>> +    int major, minor;
>> +    dev_t vfio_devt;
>> +
>> +    path = g_strdup_printf("%s/vfio-device", sysfs_path);
>> +    if (stat(path, &st) < 0) {
>> +        error_setg_errno(errp, errno, "no such host device");
>> +        goto out;
>> +    }
>> +
>> +    dir = opendir(path);
>> +    if (!dir) {
>> +        error_setg_errno(errp, errno, "couldn't open dirrectory %s", path);
>> +        goto out;
>> +    }
>> +
>> +    while ((dent = readdir(dir))) {
>> +        const char *end_name;
>> +
>> +        if (!strncmp(dent->d_name, "vfio", 4)) {
>> +            ret = qemu_strtol(dent->d_name + 4, &end_name, 10, &vfio_id);
>> +            if (ret) {
>> +                error_setg(errp, "suspicious vfio* file in %s", path);
>> +                goto out;
>> +            }
> 
> Userspace shouldn't explode if there are different files here down the
> road. Just search for the first match of vfio\d+ and there is no need
> to parse out the vfio_id from the string. Only fail if no match is
> found.
> 
>> +    tmp = g_strdup_printf("/dev/vfio/devices/vfio%ld", vfio_id);
>> +    if (stat(tmp, &st) < 0) {
>> +        error_setg_errno(errp, errno, "no such vfio device");
>> +        goto out;
>> +    }
> 
> And simply pass the string directly here, no need to parse out
> vfio_id.

got above suggestion.

> I also suggest falling back to using "/dev/char/%u:%u" if the above
> does not exist which prevents "vfio/devices/vfio" from turning into
> ABI.

Do you mean the case where there is no matching file under
/dev/vfio/devices/? Is this possible?

> 
> It would be a good idea to make a general open_cdev function that does
> all this work once the sysfs is found and cdev read out of it, all the
> other vfio places can use it too.

hmmm, it's good to have a general open_cdev() function. But I guess this
is the only place in VFIO to open the device cdev. Do you mean the vdpa
stuff?

>> +static int iommufd_attach_device(VFIODevice *vbasedev, AddressSpace *as,
>> +                                 Error **errp)
>> +{
>> +    VFIOContainer *bcontainer;
>> +    VFIOIOMMUFDContainer *container;
>> +    VFIOAddressSpace *space;
>> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
>> +    int ret, devfd, iommufd;
>> +    uint32_t ioas_id;
>> +    Error *err = NULL;
>> +
>> +    devfd = vfio_get_devicefd(vbasedev->sysfsdev, errp);
>> +    if (devfd < 0) {
>> +        return devfd;
>> +    }
>> +    vbasedev->fd = devfd;
>> +
>> +    space = vfio_get_address_space(as);
>> +
>> +    /* try to attach to an existing container in this space */
>> +    QLIST_FOREACH(bcontainer, &space->containers, next) {
>> +        if (!object_dynamic_cast(OBJECT(bcontainer),
>> +                                 TYPE_VFIO_IOMMUFD_CONTAINER)) {
>> +            continue;
>> +        }
>> +        container = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
>> +        if (vfio_device_attach_container(vbasedev, container, &err)) {
>> +            const char *msg = error_get_pretty(err);
>> +
>> +            trace_vfio_iommufd_fail_attach_existing_container(msg);
>> +            error_free(err);
>> +            err = NULL;
>> +        } else {
>> +            ret = vfio_ram_block_discard_disable(true);
>> +            if (ret) {
>> +                vfio_device_detach_container(vbasedev, container, &err);
>> +                error_propagate(errp, err);
>> +                vfio_put_address_space(space);
>> +                close(vbasedev->fd);
>> +                error_prepend(errp,
>> +                              "Cannot set discarding of RAM broken (%d)", ret);
>> +                return ret;
>> +            }
>> +            goto out;
>> +        }
>> +    }
> 
> ?? this logic shouldn't be necessary, a single ioas always supports
> all devices, userspace should never need to juggle multiple ioas's
> unless it wants to have different address maps.

The legacy vfio container needs to allocate multiple containers in some cases.
Say a device is attached to a container and some IOVAs were mapped in this
container. When trying to attach another device to this container, the attach
will fail if the existing DMA mappings conflict with the reserved IOVAs of
the other device. In such a case, legacy vfio chooses to create a new
container and attach the group to this new container. Hotplug is a typical
example of such a scenario.

I think the current iommufd also needs such a choice. The reserved_iova and
mapped iova areas are tracked in io_pagetable, and this structure is
per-IOAS. So if there is a conflict between the mapped iova areas of an IOAS
and the reserved_iova of a device that is going to be attached to that IOAS,
the attachment will fail. To make this work, QEMU needs to create another
IOAS and attach the device to the new IOAS instead.

struct io_pagetable {
          struct rw_semaphore domains_rwsem;
          struct xarray domains;
          unsigned int next_domain_id;

          struct rw_semaphore iova_rwsem;
          struct rb_root_cached area_itree;
          struct rb_root_cached reserved_iova_itree;
          unsigned long iova_alignment;
};

struct iommufd_ioas {
          struct iommufd_object obj;
          struct io_pagetable iopt;
          struct mutex mutex;
          struct list_head auto_domains;
};

> Something I would like to see confirmed here in qemu is that qemu can
> track the hw pagetable id for each device it binds because we will
> need that later to do dirty tracking and other things.

we have tracked the hwpt_id. :-)

>> +    /*
>> +     * TODO: for now iommufd BE is on par with vfio iommu type1, so it's
>> +     * fine to add the whole range as window. For SPAPR, below code
>> +     * should be updated.
>> +     */
>> +    vfio_host_win_add(bcontainer, 0, (hwaddr)-1, 4096);
> 
> ? Not sure what this is, but I don't expect any changes for SPAPR
> someday IOMMU_IOAS_IOVA_RANGES should be able to accurately report its
> configuration.
> 
> I don't see IOMMU_IOAS_IOVA_RANGES called at all, that seems like a
> problem..
> 
> (and note that IOVA_RANGES changes with every device attached to the IOAS)
> 
> Jason
Tian, Kevin April 26, 2022, 10:41 a.m. UTC | #4
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Tuesday, April 26, 2022 5:55 PM
> On 2022/4/22 22:58, Jason Gunthorpe wrote:
> > On Thu, Apr 14, 2022 at 03:47:07AM -0700, Yi Liu wrote:
> >
> >> +
> >> +    /* try to attach to an existing container in this space */
> >> +    QLIST_FOREACH(bcontainer, &space->containers, next) {
> >> +        if (!object_dynamic_cast(OBJECT(bcontainer),
> >> +                                 TYPE_VFIO_IOMMUFD_CONTAINER)) {
> >> +            continue;
> >> +        }
> >> +        container = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
> >> +        if (vfio_device_attach_container(vbasedev, container, &err)) {
> >> +            const char *msg = error_get_pretty(err);
> >> +
> >> +            trace_vfio_iommufd_fail_attach_existing_container(msg);
> >> +            error_free(err);
> >> +            err = NULL;
> >> +        } else {
> >> +            ret = vfio_ram_block_discard_disable(true);
> >> +            if (ret) {
> >> +                vfio_device_detach_container(vbasedev, container, &err);
> >> +                error_propagate(errp, err);
> >> +                vfio_put_address_space(space);
> >> +                close(vbasedev->fd);
> >> +                error_prepend(errp,
> >> +                              "Cannot set discarding of RAM broken (%d)", ret);
> >> +                return ret;
> >> +            }
> >> +            goto out;
> >> +        }
> >> +    }
> >
> > ?? this logic shouldn't be necessary, a single ioas always supports
> > all devices, userspace should never need to juggle multiple ioas's
> > unless it wants to have different address maps.
> 
> legacy vfio container needs to allocate multiple containers in some cases.
> Say a device is attached to a container and some iova were mapped on this
> container. When trying to attach another device to this container, it will
> be failed in case of conflicts between the mapped DMA mappings and the
> reserved iovas of the another device. For such case, legacy vfio chooses to
> create a new container and attach the group to this new container. Hotlplug
> is a typical case of such scenario.
> 

Alex provided a clear rationale when we chatted with him on the
same topic. I simply copy it here instead of trying to translate it
further (Alex, please chime in if you want to add more):
Jason Gunthorpe April 26, 2022, 1:41 p.m. UTC | #5
On Tue, Apr 26, 2022 at 10:41:01AM +0000, Tian, Kevin wrote:

> That's one case of incompatibility, but the IOMMU attach group callback
> can fail in a variety of ways.  One that we've seen that is not
> uncommon is that we might have an mdev container with various  mappings  
> to other devices.  None of those mappings are validated until the mdev
> driver tries to pin something, where it's generally unlikely that
> they'd pin those particular mappings.  Then QEMU hot-adds a regular
> IOMMU backed device, we allocate a domain for the device and replay the
> mappings from the container, but now they get validated and potentially
> fail.  The kernel returns a failure for the SET_IOMMU ioctl, QEMU
> creates a new container and fills it from the same AddressSpace, where
> now QEMU can determine which mappings can be safely skipped.  

I think it is strange that the allowed DMA a guest can do depends on
the order in which devices are plugged into the guest, and varies from
device to device?

IMHO it would be nicer if qemu would be able to read the new reserved
regions and unmap the conflicts before hot plugging the new device. We
don't have a kernel API to do this, maybe we should have one?

> A: 
> QEMU sets up a MemoryListener for the device AddressSpace and attempts
> to map anything that triggers that listener, which includes not only VM
> RAM which is our primary mapping goal, but also miscellaneous devices,
> unaligned regions, and other device regions, ex. BARs.  Some of these
> we filter out in QEMU with broad generalizations that unaligned ranges
> aren't anything we can deal with, but other device regions covers
> anything that's mmap'd in QEMU, ie. it has an associated KVM memory
> slot.  IIRC, in the case I'm thinking of, the mapping that triggered
> the replay failure was the BAR for an mdev device.  No attempt was made
> to use gup or PFNMAP to resolve the mapping when only the mdev device
> was present and the mdev host driver didn't attempt to pin pages within
> its own BAR, but neither of these methods worked for the replay (I
> don't recall further specifics). 

This feels sort of like a bug in iommufd, or perhaps qemu..

With iommufd only normal GUP'able memory should be passed to
map. Special memory will have to go through some other API. This is
different from vfio containers.

We could possibly check the VMAs in iommufd during map to enforce
normal memory.. However I'm also a bit surprised that qemu can't ID
the underlying memory source and avoid this?

eg currently I see log messages showing that it is passing P2P BAR memory
into iommufd map; this should be prevented inside qemu because right now
iommufd cannot be relied on to reject it correctly.

IMHO multi-container should be avoided because it does force creating
multiple iommu_domains which does have a memory/performance cost.

Though, it is not so important that it is urgent (and copy makes it
work better anyhow), qemu can stay as it is.

Jason
Jason Gunthorpe April 26, 2022, 1:53 p.m. UTC | #6
On Tue, Apr 26, 2022 at 05:55:29PM +0800, Yi Liu wrote:
> > I also suggest falling back to using "/dev/char/%u:%u" if the above
> > does not exist which prevents "vfio/devices/vfio" from turning into
> > ABI.
> 
> do you mean there is no matched file under /dev/vfio/devices/? Is this
> possible?

The layout of /dev/ depends on udev rules, so it is possible. I only
suggested it to avoid creating ABI here.

> > It would be a good idea to make a general open_cdev function that does
> > all this work once the sysfs is found and cdev read out of it, all the
> > other vfio places can use it too.
> 
> hmmm, it's good to have a general open_cdev() function. But I guess this
> is the only place in VFIO to open the device cdev. Do you mean the vdpa
> stuffes?

Any place that starts from a sysfs name would be interested - I don't
know what vdpa does

Jason
Yi Liu April 26, 2022, 2:08 p.m. UTC | #7
On 2022/4/26 21:41, Jason Gunthorpe wrote:
> On Tue, Apr 26, 2022 at 10:41:01AM +0000, Tian, Kevin wrote:
> 
>> That's one case of incompatibility, but the IOMMU attach group callback
>> can fail in a variety of ways.  One that we've seen that is not
>> uncommon is that we might have an mdev container with various  mappings
>> to other devices.  None of those mappings are validated until the mdev
>> driver tries to pin something, where it's generally unlikely that
>> they'd pin those particular mappings.  Then QEMU hot-adds a regular
>> IOMMU backed device, we allocate a domain for the device and replay the
>> mappings from the container, but now they get validated and potentially
>> fail.  The kernel returns a failure for the SET_IOMMU ioctl, QEMU
>> creates a new container and fills it from the same AddressSpace, where
>> now QEMU can determine which mappings can be safely skipped.
> 
> I think it is strange that the allowed DMA a guest can do depends on
> the order how devices are plugged into the guest, and varys from
> device to device?
> 
> IMHO it would be nicer if qemu would be able to read the new reserved
> regions and unmap the conflicts before hot plugging the new device. We
> don't have a kernel API to do this, maybe we should have one?

For userspace drivers, it is fine to do that. For QEMU, it's not so easy,
since the IOVA is the GPA, which is determined by the e820 table.

>> A:
>> QEMU sets up a MemoryListener for the device AddressSpace and attempts
>> to map anything that triggers that listener, which includes not only VM
>> RAM which is our primary mapping goal, but also miscellaneous devices,
>> unaligned regions, and other device regions, ex. BARs.  Some of these
>> we filter out in QEMU with broad generalizations that unaligned ranges
>> aren't anything we can deal with, but other device regions covers
>> anything that's mmap'd in QEMU, ie. it has an associated KVM memory
>> slot.  IIRC, in the case I'm thinking of, the mapping that triggered
>> the replay failure was the BAR for an mdev device.  No attempt was made
>> to use gup or PFNMAP to resolve the mapping when only the mdev device
>> was present and the mdev host driver didn't attempt to pin pages within
>> its own BAR, but neither of these methods worked for the replay (I
>> don't recall further specifics).
> 
> This feels sort of like a bug in iommufd, or perhaps qemu..
> 
> With iommufd only normal GUP'able memory should be passed to
> map. Special memory will have to go through some other API. This is
> different from vfio containers.
> 
> We could possibly check the VMAs in iommufd during map to enforce
> normal memory.. However I'm also a bit surprised that qemu can't ID
> the underlying memory source and avoid this?
> 
> eg currently I see the log messages that it is passing P2P BAR memory
> into iommufd map, this should be prevented inside qemu because it is
> not reliable right now if iommufd will correctly reject it.

yeah. qemu can filter out the P2P BAR mappings and just stop them in qemu. We
haven't added that in this RFC since it is something you will add in the
future. :-) Please let me know if you would prefer that we filter them
starting today.

> IMHO multi-container should be avoided because it does force creating
> multiple iommu_domains which does have a memory/performance cost.

yes. for multi-hw_pgtable, there is no choice as it is mostly due to
compatibility. But multi-container seems to be solvable if the kernel
and qemu have some extra support like you mentioned. But I'd like to
echo the quote below. It seems there may be other possible reasons for
the attach to fail.

 >> That's one case of incompatibility, but the IOMMU attach group callback
 >> can fail in a variety of ways.

> Though, it is not so important that it is urgent (and copy makes it
> work better anyhow), qemu can stay as it is.

yes. as a start, keeping it as is would be simpler.

> Jason
Jason Gunthorpe April 26, 2022, 2:11 p.m. UTC | #8
On Tue, Apr 26, 2022 at 10:08:30PM +0800, Yi Liu wrote:

> > I think it is strange that the allowed DMA a guest can do depends on
> > the order how devices are plugged into the guest, and varys from
> > device to device?
> > 
> > IMHO it would be nicer if qemu would be able to read the new reserved
> > regions and unmap the conflicts before hot plugging the new device. We
> > don't have a kernel API to do this, maybe we should have one?
> 
> For userspace drivers, it is fine to do it. For QEMU, it's not quite easy
> since the IOVA is GPA which is determined per the e820 table.

Sure, that is why I said we may need a new API to get this data back
so userspace can fix the address map before attempting to attach the
new device. Currently that is not possible at all, the device attach
fails and userspace has no way to learn what addresses are causing
problems.

> > eg currently I see the log messages that it is passing P2P BAR memory
> > into iommufd map, this should be prevented inside qemu because it is
> > not reliable right now if iommufd will correctly reject it.
> 
> yeah. qemu can filter the P2P BAR mapping and just stop it in qemu. We
> haven't added it as it is something you will add in future. so didn't
> add it in this RFC. :-) Please let me know if it feels better to filter
> it from today.

I currently hope it will use a different map API entirely and not rely
on discovering the P2P via the VMA. eg using a DMABUF FD or something.

So blocking it in qemu feels like the right thing to do.

Jason
Alex Williamson April 26, 2022, 6:45 p.m. UTC | #9
On Tue, 26 Apr 2022 11:11:56 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Apr 26, 2022 at 10:08:30PM +0800, Yi Liu wrote:
> 
> > > I think it is strange that the allowed DMA a guest can do depends on
> > > the order how devices are plugged into the guest, and varys from
> > > device to device?
> > > 
> > > IMHO it would be nicer if qemu would be able to read the new reserved
> > > regions and unmap the conflicts before hot plugging the new device. We
> > > don't have a kernel API to do this, maybe we should have one?  
> > 
> > For userspace drivers, it is fine to do it. For QEMU, it's not quite easy
> > since the IOVA is GPA which is determined per the e820 table.  
> 
> Sure, that is why I said we may need a new API to get this data back
> so userspace can fix the address map before attempting to attach the
> new device. Currently that is not possible at all, the device attach
> fails and userspace has no way to learn what addresses are causing
> problems.

We have APIs to get the IOVA ranges, both with legacy vfio and the
iommufd RFC, QEMU could compare these, but deciding to remove an
existing mapping is not something to be done lightly.  We must be
absolutely certain that there is no DMA to that range before doing so.
 
> > > eg currently I see the log messages that it is passing P2P BAR memory
> > > into iommufd map, this should be prevented inside qemu because it is
> > > not reliable right now if iommufd will correctly reject it.  
> > 
> > yeah. qemu can filter the P2P BAR mapping and just stop it in qemu. We
> > haven't added it as it is something you will add in future. so didn't
> > add it in this RFC. :-) Please let me know if it feels better to filter
> > it from today.  
> 
> I currently hope it will use a different map API entirely and not rely
> on discovering the P2P via the VMA. eg using a DMABUF FD or something.
> 
> So blocking it in qemu feels like the right thing to do.

Wait a sec, so legacy vfio supports p2p between devices, which has at
least a couple known use cases, primarily involving GPUs for at least
one of the peers, and we're not going to make equivalent support a
feature requirement for iommufd?  This would entirely fracture the
notion that iommufd is a direct replacement and upgrade from legacy
vfio and make a transparent transition for libvirt managed VMs
impossible.  Let's reconsider.  Thanks,

Alex
Jason Gunthorpe April 26, 2022, 7:27 p.m. UTC | #10
On Tue, Apr 26, 2022 at 12:45:41PM -0600, Alex Williamson wrote:
> On Tue, 26 Apr 2022 11:11:56 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Tue, Apr 26, 2022 at 10:08:30PM +0800, Yi Liu wrote:
> > 
> > > > I think it is strange that the allowed DMA a guest can do depends on
> > > > the order how devices are plugged into the guest, and varys from
> > > > device to device?
> > > > 
> > > > IMHO it would be nicer if qemu would be able to read the new reserved
> > > > regions and unmap the conflicts before hot plugging the new device. We
> > > > don't have a kernel API to do this, maybe we should have one?  
> > > 
> > > For userspace drivers, it is fine to do it. For QEMU, it's not quite easy
> > > since the IOVA is GPA which is determined per the e820 table.  
> > 
> > Sure, that is why I said we may need a new API to get this data back
> > so userspace can fix the address map before attempting to attach the
> > new device. Currently that is not possible at all, the device attach
> > fails and userspace has no way to learn what addresses are causing
> > problems.
> 
> We have APIs to get the IOVA ranges, both with legacy vfio and the
> iommufd RFC, QEMU could compare these, but deciding to remove an
> existing mapping is not something to be done lightly. 

Not quite, you can get the IOVA ranges after you attach the device,
but device attach will fail if the new range restrictions intersect
with the existing mappings. So we don't have an easy way to learn the
new range restriction in a way that lets userspace ensure an attach
will not fail due to reserved ranges overlapping with mappings.

The best you could do is make a dummy IOAS then attach the device,
read the mappings, detach, and then do your unmaps.

I'm imagining something like IOMMUFD_DEVICE_GET_RANGES that can be
called prior to attaching on the device ID.

> We must be absolutely certain that there is no DMA to that range
> before doing so.

Yes, but at the same time if the VM thinks it can DMA to that memory
then it is quite likely to DMA to it with the new device that doesn't
have it mapped in the first place.

It is also a bit odd that the behavior depends on the order in which the
devices are installed: if you plug the narrower device first then
the next device will happily use the narrower ranges, but vice versa
you will get a different result.

This is why I find it a bit strange that qemu doesn't check the
ranges. eg I would expect that anything declared as memory in the E820
map has to be mappable to the iommu_domain or the device should not
attach at all.

The P2P is a bit trickier, and I know we don't have a good story
because we lack ACPI description, but I would have expected the same
kind of thing. Anything P2Pable should be in the iommu_domain or the
device should not attach. As with system memory there are only certain
parts of the E820 map that an OS would use for P2P.

(ideally ACPI would indicate exactly what combinations of devices are
P2Pable and then qemu would use that to drive the mandatory address
ranges in the IOAS)

> > > yeah. qemu can filter the P2P BAR mapping and just stop it in qemu. We
> > > haven't added it as it is something you will add in future. so didn't
> > > add it in this RFC. :-) Please let me know if it feels better to filter
> > > it from today.  
> > 
> > I currently hope it will use a different map API entirely and not rely
> > on discovering the P2P via the VMA. eg using a DMABUF FD or something.
> > 
> > So blocking it in qemu feels like the right thing to do.
> 
> Wait a sec, so legacy vfio supports p2p between devices, which has a
> least a couple known use cases, primarily involving GPUs for at least
> one of the peers, and we're not going to make equivalent support a
> feature requirement for iommufd?  

I said "different map API" - something like IOMMU_FD_MAP_DMABUF
perhaps.

The trouble with taking in a user pointer to MMIO memory is that it
becomes quite annoying to go from a VMA back to the actual owner
object so we can establish proper refcounting and lifetime of struct-page-less
memory. Requiring userspace to make that connection via a FD
simplifies and generalizes this.

So, qemu would say 'oh this memory is exported by VFIO, I will do
VFIO_EXPORT_DMA_BUF, then do IOMMU_FD_MAP_DMABUF, then close the FD'

For vfio_compat we'd have to build some hacky compat approach to
discover the dmabuf for vfio-pci from the VMA.

But if qemu is going this way with a new implementation I would prefer
the new implementation use the new way, when we decide what it should
be.

As I mentioned before I would like to use DMABUF since I already have
a use-case to expose DMABUF from vfio-pci to connect to RDMA. I will
post the vfio DMABUF patch I have already.

Jason
Alex Williamson April 26, 2022, 8:59 p.m. UTC | #11
On Tue, 26 Apr 2022 16:27:03 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Tue, Apr 26, 2022 at 12:45:41PM -0600, Alex Williamson wrote:
> > On Tue, 26 Apr 2022 11:11:56 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Tue, Apr 26, 2022 at 10:08:30PM +0800, Yi Liu wrote:
> > >   
> > > > > I think it is strange that the allowed DMA a guest can do depends on
> > > > > the order how devices are plugged into the guest, and varys from
> > > > > device to device?
> > > > > 
> > > > > IMHO it would be nicer if qemu would be able to read the new reserved
> > > > > regions and unmap the conflicts before hot plugging the new device. We
> > > > > don't have a kernel API to do this, maybe we should have one?    
> > > > 
> > > > For userspace drivers, it is fine to do it. For QEMU, it's not quite easy
> > > > since the IOVA is GPA which is determined per the e820 table.    
> > > 
> > > Sure, that is why I said we may need a new API to get this data back
> > > so userspace can fix the address map before attempting to attach the
> > > new device. Currently that is not possible at all, the device attach
> > > fails and userspace has no way to learn what addresses are causing
> > > problems.  
> > 
> > We have APIs to get the IOVA ranges, both with legacy vfio and the
> > iommufd RFC, QEMU could compare these, but deciding to remove an
> > existing mapping is not something to be done lightly.   
> 
> Not quite, you can get the IOVA ranges after you attach the device,
> but device attach will fail if the new range restrictions intersect
> with the existing mappings. So we don't have an easy way to learn the
> new range restriction in a way that lets userspace ensure an attach
> will not fail due to reserved ranged overlapping with mappings.
> 
> The best you could do is make a dummy IOAS then attach the device,
> read the mappings, detatch, and then do your unmaps.

Right, the same thing the kernel does currently.

> I'm imagining something like IOMMUFD_DEVICE_GET_RANGES that can be
> called prior to attaching on the device ID.

Something like /sys/kernel/iommu_groups/$GROUP/reserved_regions?

> > We must be absolutely certain that there is no DMA to that range
> > before doing so.  
> 
> Yes, but at the same time if the VM thinks it can DMA to that memory
> then it is quite likely to DMA to it with the new device that doesn't
> have it mapped in the first place.

Sorry, this assertion doesn't make sense to me.  We can't assume a
vIOMMU on x86, so QEMU typically maps the entire VM address space (ie.
device address space == system memory).  Some of those mappings are
likely DMA targets (RAM), but only a tiny fraction of the address space
may actually be used for DMA.  Some of those mappings are exceedingly
unlikely P2P DMA targets (device memory), so we don't consider mapping
failures to be fatal to attaching the device.

If we have a case where a range failed for one device but worked for a
previous, we're in the latter scenario, because we should have failed
the device attach otherwise.  Your assertion would require that there
are existing devices (plural) making use of this mapping and that the
new device is also likely to make use of this mapping.  I have a hard
time believing that evidence exists to support that statement.
 
> It is also a bit odd that the behavior depends on the order the
> devices are installed as if you plug the narrower device first then
> the next device will happily use the narrower ranges, but viceversa
> will get a different result.

P2P use cases are sufficiently rare that this hasn't been an issue.  I
think there's also still a sufficiently healthy dose of FUD about whether a
system supports P2P that drivers do some validation before relying on
it.
 
> This is why I find it bit strange that qemu doesn't check the
> ranges. eg I would expect that anything declared as memory in the E820
> map has to be mappable to the iommu_domain or the device should not
> attach at all.

You have some interesting assumptions around associating
MemoryRegionSegments from the device AddressSpace to something like an
x86 specific E820 table.  The currently used rule of thumb is that if
we think it's memory, mapping failure is fatal to the device, otherwise
it's not.  If we want each device to have the most complete mapping
possible, then we'd use a container per device, but that implies a lot
of extra overhead.  Instead we try to attach the device to an existing
container within the address space and assume if it was good enough
there, it's good enough here.

> The P2P is a bit trickier, and I know we don't have a good story
> because we lack ACPI description, but I would have expected the same
> kind of thing. Anything P2Pable should be in the iommu_domain or the
> device should not attach. As with system memory there are only certain
> parts of the E820 map that an OS would use for P2P.
> 
> (ideally ACPI would indicate exactly what combinations of devices are
> P2Pable and then qemu would use that drive the mandatory address
> ranges in the IOAS)

How exactly does ACPI indicate that devices can do P2P?  How can we
rely on ACPI for a problem that's not unique to platforms that
implement ACPI?

> > > > yeah. qemu can filter the P2P BAR mapping and just stop it in qemu. We
> > > > haven't added it as it is something you will add in future. so didn't
> > > > add it in this RFC. :-) Please let me know if it feels better to filter
> > > > it from today.    
> > > 
> > > I currently hope it will use a different map API entirely and not rely
> > > on discovering the P2P via the VMA. eg using a DMABUF FD or something.
> > > 
> > > So blocking it in qemu feels like the right thing to do.  
> > 
> > Wait a sec, so legacy vfio supports p2p between devices, which has a
> > least a couple known use cases, primarily involving GPUs for at least
> > one of the peers, and we're not going to make equivalent support a
> > feature requirement for iommufd?    
> 
> I said "different map API" - something like IOMMU_FD_MAP_DMABUF
> perhaps.

For future support, yes, but your last sentence above states to
outright block it for now, which would be a visible feature regression
vs legacy vfio.

> The trouble with taking in a user pointer to MMIO memory is that it
> becomes quite annoying to go from a VMA back to the actual owner
> object so we can establish proper refcounting and lifetime of struct-page-less
> memory. Requiring userspace to make that connection via a FD
> simplifies and generalizes this.
> 
> So, qemu would say 'oh this memory is exported by VFIO, I will do
> VFIO_EXPORT_DMA_BUF, then do IOMMU_FD_MAP_DMABUF, then close the FD'
> 
> For vfio_compat we'd have to build some hacky compat approach to
> discover the dmabuf for vfio-pci from the VMA.
> 
> But if qemu is going this way with a new implementation I would prefer
> the new implementation use the new way, when we decide what it should
> be.
> 
> As I mentioned before I would like to use DMABUF since I already have
> a use-case to expose DMABUF from vfio-pci to connect to RDMA. I will
> post the vfio DMABUF patch I have already.

I'm not suggesting there aren't issues with P2P mappings, we all know
that legacy vfio has various issues currently.  I'm only stating that
there are use cases for it and if we cannot support those use cases
then we can't do a transparent switch to iommufd when it's available.
Switching would depend not only on kernel/QEMU support, but the
necessary features for the VM, where we have no means to
programmatically determine the latter.  Thanks,

Alex
Jason Gunthorpe April 26, 2022, 11:08 p.m. UTC | #12
On Tue, Apr 26, 2022 at 02:59:31PM -0600, Alex Williamson wrote:

> > The best you could do is make a dummy IOAS then attach the device,
> > read the mappings, detatch, and then do your unmaps.
> 
> Right, the same thing the kernel does currently.
> 
> > I'm imagining something like IOMMUFD_DEVICE_GET_RANGES that can be
> > called prior to attaching on the device ID.
> 
> Something like /sys/kernel/iommu_groups/$GROUP/reserved_regions?

If we do the above ioctl with iommufd I would want to include the domain
aperture too, but yes.

> > > We must be absolutely certain that there is no DMA to that range
> > > before doing so.  
> > 
> > Yes, but at the same time if the VM thinks it can DMA to that memory
> > then it is quite likely to DMA to it with the new device that doesn't
> > have it mapped in the first place.
> 
> Sorry, this assertion doesn't make sense to me.  We can't assume a
> vIOMMU on x86, so QEMU typically maps the entire VM address space (ie.
> device address space == system memory).  Some of those mappings are
> likely DMA targets (RAM), but only a tiny fraction of the address space
> may actually be used for DMA.  Some of those mappings are exceedingly
> unlikely P2P DMA targets (device memory), so we don't consider mapping
> failures to be fatal to attaching the device.

> If we have a case where a range failed for one device but worked for a
> previous, we're in the latter scenario, because we should have failed
> the device attach otherwise.  Your assertion would require that there
> are existing devices (plural) making use of this mapping and that the
> new device is also likely to make use of this mapping.  I have a hard
> time believing that evidence exists to support that statement.

This is quite normal, we often have multiple NICs and GPUs in the same
system/VM and the expectation is that P2P between the MMIO regions of
all the NICs and all the GPUs will work. Hotplugging in a NIC or GPU
and having it be excluded from P2P maps would be fatal to the VM.

So, while I think it is vanishingly unlikely that a reserved region
conflict would cause a problem, my preference is that this stuff is
deterministic. Either hotplug fails or hotplug configures it to the
same state it would be if the VM was started with this configuration.

Perhaps this just suggests that qemu should be told by the operator
what kind of P2P to export from a device 'never/auto/always' with auto
being today's behavior.

> P2P use cases are sufficiently rare that this hasn't been an issue.  I
> think there's also still a sufficient healthy dose of FUD whether a
> system supports P2P that drivers do some validation before relying on
> it.

I'm not sure what you mean here, the P2P capability discovery is a
complete mess and never did get standardized. Linux has the
expectation that drivers will use pci_p2pdma_distance() before doing
P2P which weeds out only some of the worst non-working cases.

> > This is why I find it bit strange that qemu doesn't check the
> > ranges. eg I would expect that anything declared as memory in the E820
> > map has to be mappable to the iommu_domain or the device should not
> > attach at all.
> 
> You have some interesting assumptions around associating
> MemoryRegionSegments from the device AddressSpace to something like an
> x86 specific E820 table.  

I'm thinking about it from an OS perspective in the VM, not from qemu
internals. OS's do not randomly DMA everywhere, the firmware tables/etc
do make it predictable where DMA will happen.

> > The P2P is a bit trickier, and I know we don't have a good story
> > because we lack ACPI description, but I would have expected the same
> > kind of thing. Anything P2Pable should be in the iommu_domain or the
> > device should not attach. As with system memory there are only certain
> > parts of the E820 map that an OS would use for P2P.
> > 
> > (ideally ACPI would indicate exactly what combinations of devices are
> > P2Pable and then qemu would use that drive the mandatory address
> > ranges in the IOAS)
> 
> How exactly does ACPI indicate that devices can do P2P?  How can we
> rely on ACPI for a problem that's not unique to platforms that
> implement ACPI?

I am trying to say this never did get standardized. It was talked about
when the pci_p2pdma_distance() was merged and I thought some folks
were going to go off and take care of an ACPI query for it to use. It
would be useful here at least.
 
> > > > > yeah. qemu can filter the P2P BAR mapping and just stop it in qemu. We
> > > > > haven't added it as it is something you will add in future. so didn't
> > > > > add it in this RFC. :-) Please let me know if it feels better to filter
> > > > > it from today.    
> > > > 
> > > > I currently hope it will use a different map API entirely and not rely
> > > > on discovering the P2P via the VMA. eg using a DMABUF FD or something.
> > > > 
> > > > So blocking it in qemu feels like the right thing to do.  
> > > 
> > > Wait a sec, so legacy vfio supports p2p between devices, which has a
> > > least a couple known use cases, primarily involving GPUs for at least
> > > one of the peers, and we're not going to make equivalent support a
> > > feature requirement for iommufd?    
> > 
> > I said "different map API" - something like IOMMU_FD_MAP_DMABUF
> > perhaps.
> 
> For future support, yes, but your last sentence above states to
> outright block it for now, which would be a visible feature regression
> vs legacy vfio.

I'm not sure I understand. Today iommufd does not support MMIO vmas in
IOMMUFD_MAP, and if we do the DMABUF stuff, it never will. So the
correct thing is to block it in qemu and when we decide exactly the
correct interface we will update qemu to use it. Surely this would be
completed before we declare iommufd "ready". Hopefully this happens
not long after we merge the basic iommufd kernel stuff.

> that legacy vfio has various issues currently.  I'm only stating that
> there are use cases for it and if we cannot support those use cases
> then we can't do a transparent switch to iommufd when it's
> available.

P2P is very important to me, I will get it supported, but I can't
tackle every problem at once.

If we can't agree on a secure implementation after a lot of trying
then we can implement follow_pfn like VFIO did.

> Switching would depend not only on kernel/QEMU support, but the
> necessary features for the VM, where we have no means to
> programmatically determine the latter.  Thanks,

I'm not sure what "features for the VM" means?

Jason
diff mbox series

Patch

diff --git a/hw/vfio/as.c b/hw/vfio/as.c
index 4abaa4068f..94618efd1f 100644
--- a/hw/vfio/as.c
+++ b/hw/vfio/as.c
@@ -41,7 +41,7 @@ 
 #include "qapi/error.h"
 #include "migration/migration.h"
 
-static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
+VFIOAddressSpaceList vfio_address_spaces =
     QLIST_HEAD_INITIALIZER(vfio_address_spaces);
 
 void vfio_host_win_add(VFIOContainer *container,
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
new file mode 100644
index 0000000000..f8375f1672
--- /dev/null
+++ b/hw/vfio/iommufd.c
@@ -0,0 +1,545 @@ 
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2022 Intel Corporation.
+ * Copyright Red Hat, Inc. 2022
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ *          Eric Auger <eric.auger@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "hw/iommufd/iommufd.h"
+#include "hw/qdev-core.h"
+#include "sysemu/reset.h"
+#include "qemu/cutils.h"
+
+/*
+ * Container check_extension callback: report whether the iommufd backend
+ * supports container feature @feat.  No feature has a case label yet, so
+ * every query is answered with false.
+ */
+static bool iommufd_check_extension(VFIOContainer *bcontainer,
+                                    VFIOContainerFeature feat)
+{
+    switch (feat) {
+    default:
+        /* nothing is supported so far */
+        break;
+    }
+
+    return false;
+}
+
+/*
+ * Container dma_map callback: forward the request to iommufd_map_dma()
+ * using the iommufd and ioas_id stored in the enclosing
+ * VFIOIOMMUFDContainer.  The helper's return value is passed through
+ * unchanged.
+ */
+static int iommufd_map(VFIOContainer *bcontainer, hwaddr iova,
+                       ram_addr_t size, void *vaddr, bool readonly)
+{
+    VFIOIOMMUFDContainer *fdc;
+
+    fdc = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
+
+    return iommufd_map_dma(fdc->iommufd, fdc->ioas_id, iova, size, vaddr,
+                           readonly);
+}
+
+/*
+ * Container dma_unmap callback: forward the request to iommufd_unmap_dma()
+ * using the iommufd and ioas_id stored in the enclosing
+ * VFIOIOMMUFDContainer.  @iotlb is not consulted yet (see the TODO below).
+ */
+static int iommufd_unmap(VFIOContainer *bcontainer,
+                         hwaddr iova, ram_addr_t size,
+                         IOMMUTLBEntry *iotlb)
+{
+    VFIOIOMMUFDContainer *fdc;
+
+    fdc = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
+
+    /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
+    return iommufd_unmap_dma(fdc->iommufd, fdc->ioas_id, iova, size);
+}
+
+/*
+ * Locate and open the vfio device node that belongs to the host device
+ * found at @sysfs_path.
+ *
+ * The first "vfio*" entry under <sysfs_path>/vfio-device supplies the
+ * device index, and its "dev" attribute supplies the expected major:minor.
+ * /dev/vfio/devices/vfio<index> is opened only if its st_rdev matches that
+ * pair.
+ *
+ * Returns the value of qemu_open_old() once the node has been validated,
+ * or a negative value with @errp set if any earlier step fails.
+ */
+static int vfio_get_devicefd(const char *sysfs_path, Error **errp)
+{
+    ERRP_GUARD();   /* *errp is inspected at "out", so it must be non-NULL */
+    long int vfio_id = -1;
+    int ret = -ENOTTY;
+    char *path, *tmp = NULL;
+    DIR *dir = NULL;
+    struct dirent *dent;
+    struct stat st;
+    gchar *contents = NULL;
+    int major, minor;
+    dev_t vfio_devt;
+
+    path = g_strdup_printf("%s/vfio-device", sysfs_path);
+    if (stat(path, &st) < 0) {
+        error_setg_errno(errp, errno, "no such host device");
+        goto out;
+    }
+
+    dir = opendir(path);
+    if (!dir) {
+        error_setg_errno(errp, errno, "couldn't open dirrectory %s", path);
+        goto out;
+    }
+
+    while ((dent = readdir(dir))) {
+        const char *end_name;
+
+        if (!strncmp(dent->d_name, "vfio", 4)) {
+            /*
+             * Do not store the conversion status in ret: a successful
+             * parse would leave ret == 0 and make later failures look
+             * like a valid file descriptor.
+             */
+            if (qemu_strtol(dent->d_name + 4, &end_name, 10, &vfio_id)) {
+                error_setg(errp, "suspicious vfio* file in %s", path);
+                goto out;
+            }
+            break;
+        }
+    }
+
+    /* readdir() ran off the end: there is no vfio* entry to inspect */
+    if (!dent) {
+        error_setg(errp, "no vfio* entry found in %s", path);
+        goto out;
+    }
+
+    /* check if the major:minor matches */
+    tmp = g_strdup_printf("%s/%s/dev", path, dent->d_name);
+    if (!g_file_get_contents(tmp, &contents, NULL, NULL)) {
+        error_setg(errp, "failed to load \"%s\"", tmp);
+        goto out;
+    }
+
+    if (sscanf(contents, "%d:%d", &major, &minor) != 2) {
+        error_setg(errp, "failed to load \"%s\"", tmp);
+        goto out;
+    }
+    g_free(tmp);
+
+    tmp = g_strdup_printf("/dev/vfio/devices/vfio%ld", vfio_id);
+    if (stat(tmp, &st) < 0) {
+        error_setg_errno(errp, errno, "no such vfio device");
+        goto out;
+    }
+    vfio_devt = makedev(major, minor);
+    if (st.st_rdev != vfio_devt) {
+        error_setg(errp, "minor do not match: %lu, %lu",
+                   (unsigned long)vfio_devt, (unsigned long)st.st_rdev);
+        goto out;
+    }
+
+    ret = qemu_open_old(tmp, O_RDWR);
+    if (ret < 0) {
+        error_setg(errp, "Failed to open %s", tmp);
+    }
+    trace_vfio_iommufd_get_devicefd(tmp, ret);
+out:
+    /* path is still needed for the prefix, so prepend before freeing it */
+    if (*errp) {
+        error_prepend(errp, VFIO_MSG_PREFIX, path);
+    }
+    /* dent points into dir's storage, so dir is only closed here */
+    if (dir) {
+        closedir(dir);
+    }
+    g_free(contents);
+    g_free(tmp);
+    g_free(path);
+
+    return ret;
+}
+
+/*
+ * Return the tracking entry for @hwpt_id on @container's hwpt_list.
+ * When no entry carries that id, a zero-initialised one is allocated,
+ * given an empty device list, inserted at the head of the list and
+ * returned.
+ */
+static VFIOIOASHwpt *vfio_container_get_hwpt(VFIOIOMMUFDContainer *container,
+                                             uint32_t hwpt_id)
+{
+    VFIOIOASHwpt *entry;
+
+    QLIST_FOREACH(entry, &container->hwpt_list, next) {
+        if (entry->hwpt_id != hwpt_id) {
+            continue;
+        }
+        return entry;
+    }
+
+    /* first user of this hwpt id: start tracking it */
+    entry = g_new0(VFIOIOASHwpt, 1);
+    QLIST_INIT(&entry->device_list);
+    entry->hwpt_id = hwpt_id;
+    QLIST_INSERT_HEAD(&container->hwpt_list, entry, next);
+
+    return entry;
+}
+
+/*
+ * Unlink @hwpt from the list it sits on and free it.  The entry must no
+ * longer have any device linked to it; otherwise this is treated as an
+ * unreachable state.
+ */
+static void vfio_container_put_hwpt(VFIOIOASHwpt *hwpt)
+{
+    bool still_in_use = !QLIST_EMPTY(&hwpt->device_list);
+
+    if (still_in_use) {
+        g_assert_not_reached();
+    }
+
+    QLIST_REMOVE(hwpt, next);
+    g_free(hwpt);
+}
+
+/*
+ * Walk every hwpt entry of @container and return the one whose device
+ * list contains @vbasedev, or NULL when the device is on none of them.
+ */
+static VFIOIOASHwpt *vfio_find_hwpt_for_dev(VFIOIOMMUFDContainer *container,
+                                            VFIODevice *vbasedev)
+{
+    VFIOIOASHwpt *candidate;
+    VFIODevice *member;
+
+    QLIST_FOREACH(candidate, &container->hwpt_list, next) {
+        QLIST_FOREACH(member, &candidate->device_list, hwpt_next) {
+            if (member != vbasedev) {
+                continue;
+            }
+            return candidate;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Issue VFIO_DEVICE_DETACH_IOAS on @vbasedev's fd for @container's
+ * iommufd/ioas_id pair.  A failing ioctl is reported through @errp; the
+ * trace event is emitted whether or not the ioctl succeeded.  No list
+ * bookkeeping is done here.
+ */
+static void
+__vfio_device_detach_container(VFIODevice *vbasedev,
+                               VFIOIOMMUFDContainer *container, Error **errp)
+{
+    struct vfio_device_detach_ioas args = {
+        .argsz = sizeof(args),
+        .flags = 0,
+        .iommufd = container->iommufd,
+        .ioas_id = container->ioas_id,
+    };
+    int rc;
+
+    rc = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOAS, &args);
+    if (rc) {
+        error_setg_errno(errp, errno, "detach %s from ioas id=%d failed",
+                         vbasedev->name, container->ioas_id);
+    }
+
+    trace_vfio_iommufd_detach_device(container->iommufd, vbasedev->name,
+                                     container->ioas_id);
+
+    /* iommufd unbind is done per device fd close */
+}
+
+/*
+ * Undo vfio_device_attach_container(): take @vbasedev off the hwpt entry
+ * that lists it (releasing the entry once its device list is empty), then
+ * detach the device from the container's IOAS.  The detach ioctl is issued
+ * even when no hwpt entry lists the device; its failure is reported
+ * through @errp.
+ */
+static void vfio_device_detach_container(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container,
+                                         Error **errp)
+{
+    VFIOIOASHwpt *owner = vfio_find_hwpt_for_dev(container, vbasedev);
+
+    if (owner != NULL) {
+        QLIST_REMOVE(vbasedev, hwpt_next);
+
+        if (QLIST_EMPTY(&owner->device_list)) {
+            vfio_container_put_hwpt(owner);
+        }
+    }
+
+    __vfio_device_detach_container(vbasedev, container, errp);
+}
+
+/*
+ * Bind @vbasedev's device fd to @container's iommufd, then attach it to
+ * the container's IOAS.
+ *
+ * On success the device id reported by the bind ioctl is stored in
+ * vbasedev->devid, and the device is linked on the device list of the
+ * hwpt entry matching the hwpt id reported by the attach ioctl.
+ *
+ * Returns 0 on success.  On failure returns the failing ioctl()'s return
+ * value with @errp set.
+ *
+ * NOTE(review): when the attach ioctl fails the device has already been
+ * bound; nothing here reverts the bind.
+ */
+static int vfio_device_attach_container(VFIODevice *vbasedev,
+                                        VFIOIOMMUFDContainer *container,
+                                        Error **errp)
+{
+    struct vfio_device_bind_iommufd bind = {
+        .argsz = sizeof(bind),
+        .flags = 0,
+        .iommufd = container->iommufd,
+        /*
+         * Go through uintptr_t: casting a pointer straight to uint64_t is
+         * a size-mismatched pointer-to-integer cast on 32-bit hosts.
+         */
+        .dev_cookie = (uintptr_t)vbasedev,
+    };
+    struct vfio_device_attach_ioas attach_data = {
+        .argsz = sizeof(attach_data),
+        .flags = 0,
+        .iommufd = container->iommufd,
+        .ioas_id = container->ioas_id,
+    };
+    VFIOIOASHwpt *hwpt;
+    int ret;
+
+    /* Bind device to iommufd */
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
+    if (ret) {
+        error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d",
+                         vbasedev->fd, bind.iommufd);
+        return ret;
+    }
+
+    vbasedev->devid = bind.out_devid;
+    trace_vfio_iommufd_bind_device(bind.iommufd, vbasedev->name,
+                                   vbasedev->fd, vbasedev->devid);
+
+    /* Attach device to an ioas within iommufd */
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOAS, &attach_data);
+    if (ret) {
+        error_setg_errno(errp, errno,
+                         "[iommufd=%d] error attach %s (%d) to ioasid=%d",
+                         container->iommufd, vbasedev->name, vbasedev->fd,
+                         attach_data.ioas_id);
+        return ret;
+    }
+    trace_vfio_iommufd_attach_device(bind.iommufd, vbasedev->name,
+                                     vbasedev->fd, container->ioas_id,
+                                     attach_data.out_hwpt_id);
+
+    /* Track the device under the hwpt the kernel reported for it */
+    hwpt = vfio_container_get_hwpt(container, attach_data.out_hwpt_id);
+
+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+    return 0;
+}
+
+/*
+ * Reset a single device if it is realized and its
+ * vfio_compute_needs_reset() hook leaves needs_reset set.  Returns the
+ * result of the vfio_hot_reset_multi() hook in that case, 0 otherwise.
+ */
+static int vfio_device_reset(VFIODevice *vbasedev)
+{
+    if (!vbasedev->dev->realized) {
+        return 0;
+    }
+
+    vbasedev->ops->vfio_compute_needs_reset(vbasedev);
+    if (!vbasedev->needs_reset) {
+        return 0;
+    }
+
+    return vbasedev->ops->vfio_hot_reset_multi(vbasedev);
+}
+
+/*
+ * Container reset callback: run vfio_device_reset() on every device of
+ * every hwpt entry of the container.  A failure is reported and does not
+ * stop the walk.  Returns 0 when every reset succeeded, otherwise the
+ * return value of the last reset that failed.
+ */
+static int vfio_iommufd_container_reset(VFIOContainer *bcontainer)
+{
+    VFIOIOMMUFDContainer *container =
+        container_of(bcontainer, VFIOIOMMUFDContainer, obj);
+    VFIOIOASHwpt *hwpt;
+    VFIODevice *dev;
+    int last_err = 0;
+
+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
+        QLIST_FOREACH(dev, &hwpt->device_list, hwpt_next) {
+            int rc = vfio_device_reset(dev);
+
+            if (!rc) {
+                trace_vfio_iommufd_container_reset(dev->name);
+                continue;
+            }
+
+            error_report("failed to reset %s (%d)", dev->name, rc);
+            last_err = rc;
+        }
+    }
+
+    return last_err;
+}
+
+/*
+ * Tear down the embedded base container via vfio_container_destroy(),
+ * then free the VFIOIOMMUFDContainer itself.
+ */
+static void vfio_iommufd_container_destroy(VFIOIOMMUFDContainer *container)
+{
+    VFIOContainer *base = &container->obj;
+
+    vfio_container_destroy(base);
+    g_free(container);
+}
+
+/*
+ * Thin wrapper that passes @state on to
+ * ram_block_uncoordinated_discard_disable() and returns its result.
+ */
+static int vfio_ram_block_discard_disable(bool state)
+{
+    int rc;
+
+    /*
+     * We support coordinated discarding of RAM via the RamDiscardManager.
+     */
+    rc = ram_block_uncoordinated_discard_disable(state);
+
+    return rc;
+}
+
+static void iommufd_detach_device(VFIODevice *vbasedev);
+
+/*
+ * Container attach_device callback for the iommufd backend.
+ *
+ * Opens the device fd for @vbasedev, then tries to attach the device to
+ * each iommufd container already present in the address space of @as.
+ * If none accepts it, a new IOAS and container are allocated, the device
+ * is attached to it, and the container is registered with the address
+ * space and its memory listener.  Finally the device info is queried and
+ * cached in @vbasedev.
+ *
+ * Returns 0 on success.  On failure returns a non-zero value with @errp
+ * set, after releasing the resources acquired so far (including the
+ * device fd).
+ */
+static int iommufd_attach_device(VFIODevice *vbasedev, AddressSpace *as,
+                                 Error **errp)
+{
+    VFIOContainer *bcontainer;
+    VFIOIOMMUFDContainer *container;
+    VFIOAddressSpace *space;
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    int ret, devfd, iommufd;
+    uint32_t ioas_id;
+    Error *err = NULL;
+
+    devfd = vfio_get_devicefd(vbasedev->sysfsdev, errp);
+    if (devfd < 0) {
+        return devfd;
+    }
+    vbasedev->fd = devfd;
+
+    space = vfio_get_address_space(as);
+
+    /* try to attach to an existing container in this space */
+    QLIST_FOREACH(bcontainer, &space->containers, next) {
+        if (!object_dynamic_cast(OBJECT(bcontainer),
+                                 TYPE_VFIO_IOMMUFD_CONTAINER)) {
+            continue;
+        }
+        container = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
+        if (vfio_device_attach_container(vbasedev, container, &err)) {
+            const char *msg = error_get_pretty(err);
+
+            /* not fatal: trace it and try the next container */
+            trace_vfio_iommufd_fail_attach_existing_container(msg);
+            error_free(err);
+            err = NULL;
+        } else {
+            ret = vfio_ram_block_discard_disable(true);
+            if (ret) {
+                /*
+                 * Set the primary error directly.  error_prepend() must
+                 * not be used here because *errp is unset whenever the
+                 * detach below succeeds.
+                 */
+                error_setg_errno(errp, -ret,
+                                 "Cannot set discarding of RAM broken");
+                vfio_device_detach_container(vbasedev, container, &err);
+                if (err) {
+                    /* secondary failure: report it, keep the primary one */
+                    error_report_err(err);
+                }
+                vfio_put_address_space(space);
+                close(vbasedev->fd);
+                return ret;
+            }
+            goto out;
+        }
+    }
+
+    /* Need to allocate a new dedicated container */
+    ret = iommufd_get_ioas(&iommufd, &ioas_id);
+    if (ret < 0) {
+        /* capture errno before the cleanup calls can overwrite it */
+        error_setg_errno(errp, errno, "Failed to alloc ioas");
+        vfio_put_address_space(space);
+        close(vbasedev->fd);
+        return ret;
+    }
+
+    trace_vfio_iommufd_alloc_ioas(iommufd, ioas_id);
+
+    container = g_malloc0(sizeof(*container));
+    container->iommufd = iommufd;
+    container->ioas_id = ioas_id;
+    QLIST_INIT(&container->hwpt_list);
+
+    bcontainer = &container->obj;
+    vfio_container_init(bcontainer, sizeof(*bcontainer),
+                        TYPE_VFIO_IOMMUFD_CONTAINER, space);
+
+    ret = vfio_device_attach_container(vbasedev, container, &err);
+    if (ret) {
+        /* todo check if any other thing to do */
+        error_propagate(errp, err);
+        vfio_iommufd_container_destroy(container);
+        iommufd_put_ioas(iommufd, ioas_id);
+        vfio_put_address_space(space);
+        close(vbasedev->fd);
+        return ret;
+    }
+
+    ret = vfio_ram_block_discard_disable(true);
+    if (ret) {
+        /* same reasoning as in the existing-container path above */
+        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
+        vfio_device_detach_container(vbasedev, container, &err);
+        if (err) {
+            error_report_err(err);
+        }
+        vfio_iommufd_container_destroy(container);
+        iommufd_put_ioas(iommufd, ioas_id);
+        vfio_put_address_space(space);
+        close(vbasedev->fd);
+        return ret;
+    }
+
+    /*
+     * TODO: for now iommufd BE is on par with vfio iommu type1, so it's
+     * fine to add the whole range as window. For SPAPR, below code
+     * should be updated.
+     */
+    vfio_host_win_add(bcontainer, 0, (hwaddr)-1, 4096);
+
+    /*
+     * TODO: kvmgroup, unable to do it before the protocol done
+     * between iommufd and kvm.
+     */
+
+    QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
+
+    bcontainer->listener = vfio_memory_listener;
+
+    memory_listener_register(&bcontainer->listener, bcontainer->space->as);
+
+    bcontainer->initialized = true;
+
+out:
+    vbasedev->container = bcontainer;
+
+    /*
+     * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
+     * for discarding incompatibility check as well?
+     */
+    if (vbasedev->ram_block_discard_allowed) {
+        vfio_ram_block_discard_disable(false);
+    }
+
+    ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_setg_errno(errp, errno, "error getting device info");
+        /*
+         * Needs to use iommufd_detach_device() as this may have failed
+         * after attaching a new device to an existing container.  It also
+         * closes vbasedev->fd, so the fd must not be closed again here.
+         */
+        iommufd_detach_device(vbasedev);
+        return ret;
+    }
+
+    vbasedev->group = 0;
+    vbasedev->num_irqs = dev_info.num_irqs;
+    vbasedev->num_regions = dev_info.num_regions;
+    vbasedev->flags = dev_info.flags;
+    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+
+    trace_vfio_iommufd_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
+                                   vbasedev->num_regions, vbasedev->flags);
+    return 0;
+}
+
+/*
+ * Container detach_device callback for the iommufd backend.
+ *
+ * When the device is attached to a container: take it off its hwpt entry,
+ * detach it from the IOAS, and, once the container tracks no hwpt any
+ * more, release the IOAS, the container and the address space reference.
+ * In every case the device fd is closed and vbasedev->name is freed.
+ */
+static void iommufd_detach_device(VFIODevice *vbasedev)
+{
+    VFIOContainer *bcontainer = vbasedev->container;
+    VFIOIOMMUFDContainer *container;
+    VFIOIOASHwpt *hwpt;
+    /* must start out NULL: it is passed as &err and tested afterwards */
+    Error *err = NULL;
+
+    if (!bcontainer) {
+        goto out;
+    }
+
+    if (!vbasedev->ram_block_discard_allowed) {
+        vfio_ram_block_discard_disable(false);
+    }
+
+    container = container_of(bcontainer, VFIOIOMMUFDContainer, obj);
+
+    /* an attached device must be listed on exactly one hwpt entry */
+    hwpt = vfio_find_hwpt_for_dev(container, vbasedev);
+    if (!hwpt) {
+        g_assert_not_reached();
+    }
+
+    QLIST_REMOVE(vbasedev, hwpt_next);
+    if (QLIST_EMPTY(&hwpt->device_list)) {
+        vfio_container_put_hwpt(hwpt);
+    }
+
+    __vfio_device_detach_container(vbasedev, container, &err);
+    if (err) {
+        error_report_err(err);
+    }
+    if (QLIST_EMPTY(&container->hwpt_list)) {
+        /* read space first: the container is freed by the destroy call */
+        VFIOAddressSpace *space = bcontainer->space;
+
+        iommufd_put_ioas(container->iommufd, container->ioas_id);
+        vfio_iommufd_container_destroy(container);
+        vfio_put_address_space(space);
+    }
+    vbasedev->container = NULL;
+out:
+    close(vbasedev->fd);
+    g_free(vbasedev->name);
+}
+
+/*
+ * Class initialiser: install the iommufd implementations of the
+ * container class callbacks.  @data is unused.
+ */
+static void vfio_iommufd_class_init(ObjectClass *klass,
+                                    void *data)
+{
+    VFIOContainerClass *container_class = VFIO_CONTAINER_OBJ_CLASS(klass);
+
+    /* feature query and DMA mapping */
+    container_class->check_extension = iommufd_check_extension;
+    container_class->dma_map = iommufd_map;
+    container_class->dma_unmap = iommufd_unmap;
+
+    /* device lifecycle */
+    container_class->attach_device = iommufd_attach_device;
+    container_class->detach_device = iommufd_detach_device;
+    container_class->reset = vfio_iommufd_container_reset;
+}
+
+/* Type description of the iommufd container, a child of the base container. */
+static const TypeInfo vfio_iommufd_info = {
+    .name       = TYPE_VFIO_IOMMUFD_CONTAINER,
+    .parent     = TYPE_VFIO_CONTAINER_OBJ,
+    .class_init = vfio_iommufd_class_init,
+};
+
+/* Register the type described by vfio_iommufd_info. */
+static void vfio_iommufd_register_types(void)
+{
+    type_register_static(&vfio_iommufd_info);
+}
+
+/* Hand the registration function above to type_init(). */
+type_init(vfio_iommufd_register_types)
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index df4fa2b695..3c53c87200 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -7,6 +7,9 @@  vfio_ss.add(files(
   'spapr.c',
   'migration.c',
 ))
+vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
+  'iommufd.c',
+))
 vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
   'display.c',
   'pci-quirks.c',
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e1ab6d339d..cf5703f94b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3148,6 +3148,16 @@  static void vfio_pci_reset(DeviceState *dev)
         goto post_reset;
     }
 
+    /*
+     * This is a temporary check, long term iommufd should
+     * support hot reset as well
+     */
+    if (vdev->vbasedev.be == VFIO_IOMMU_BACKEND_TYPE_IOMMUFD) {
+        error_report("Dangerous: iommufd BE doesn't support hot "
+                     "reset, please stop the VM");
+        goto post_reset;
+    }
+
     /* See if we can do our own bus reset */
     if (!vfio_pci_hot_reset_one(vdev)) {
         goto post_reset;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 0ef1b5f4a6..51f04b0b80 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -165,3 +165,14 @@  vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t dat
 vfio_load_cleanup(const char *name) " (%s)"
 vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+
+# iommufd.c
+
+vfio_iommufd_get_devicefd(const char *dev, int devfd) " %s (fd=%d)"
+vfio_iommufd_bind_device(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d"
+vfio_iommufd_attach_device(int iommufd, const char *name, int devfd, int ioasid, int hwptid) " [iommufd=%d] Successfully attached device %s (%d) to ioasid=%d: output hwptid=%d"
+vfio_iommufd_detach_device(int iommufd, const char *name, int ioasid) " [iommufd=%d] Detached %s from ioasid=%d"
+vfio_iommufd_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d"
+vfio_iommufd_device_info(const char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
+vfio_iommufd_fail_attach_existing_container(const char *msg) " %s"
+vfio_iommufd_container_reset(const char *name) " Successfully reset %s"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 2040c27cda..19731ea685 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -81,6 +81,22 @@  typedef struct VFIOLegacyContainer {
     QLIST_HEAD(, VFIOGroup) group_list;
 } VFIOLegacyContainer;
 
+typedef struct VFIOIOASHwpt {
+    uint32_t hwpt_id; /* presumably the hw page table id; TODO confirm */
+    QLIST_HEAD(, VFIODevice) device_list; /* devices, linked via hwpt_next */
+    QLIST_ENTRY(VFIOIOASHwpt) next; /* link in the container's hwpt_list */
+} VFIOIOASHwpt;
+
+typedef struct VFIOIOMMUFDContainer {
+    VFIOContainer obj; /* base container, embedded for container_of() */
+    int iommufd; /* iommufd fd (/dev/iommu user API) backing this container */
+    uint32_t ioas_id; /* IOAS id, released via iommufd_put_ioas() */
+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; /* VFIOIOASHwpt entries (next) */
+} VFIOIOMMUFDContainer;
+/* Address-space list type and the shared list head, defined elsewhere. */
+typedef QLIST_HEAD(VFIOAddressSpaceList, VFIOAddressSpace) VFIOAddressSpaceList;
+extern VFIOAddressSpaceList vfio_address_spaces;
+
 typedef struct VFIODeviceOps VFIODeviceOps;
 
 typedef enum VFIOIOMMUBackendType {
@@ -90,6 +106,7 @@  typedef enum VFIOIOMMUBackendType {
 
 typedef struct VFIODevice {
     QLIST_ENTRY(VFIODevice) next;
+    QLIST_ENTRY(VFIODevice) hwpt_next;
     struct VFIOGroup *group;
     VFIOContainer *container;
     char *sysfsdev;
@@ -97,6 +114,7 @@  typedef struct VFIODevice {
     DeviceState *dev;
     int fd;
     int type;
+    int devid;
     bool reset_works;
     bool needs_reset;
     bool no_mmap;
diff --git a/include/hw/vfio/vfio-container-obj.h b/include/hw/vfio/vfio-container-obj.h
index ffd8590ff8..b5ef2160d8 100644
--- a/include/hw/vfio/vfio-container-obj.h
+++ b/include/hw/vfio/vfio-container-obj.h
@@ -43,6 +43,7 @@ 
                          TYPE_VFIO_CONTAINER_OBJ)
 
 #define TYPE_VFIO_LEGACY_CONTAINER "qemu:vfio-legacy-container"
+#define TYPE_VFIO_IOMMUFD_CONTAINER "qemu:vfio-iommufd-container"
 
 typedef enum VFIOContainerFeature {
     VFIO_FEAT_LIVE_MIGRATION,