diff mbox series

[RFC,v3,04/19] Add device IO ops vector

Message ID c8f636ad86d8b89f4610e95089ad49816c70a7a6.1636057885.git.john.g.johnson@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user client | expand

Commit Message

John Johnson Nov. 9, 2021, 12:46 a.m. UTC
Used for communication with VFIO driver
(prep work for vfio-user, which will communicate over a socket)

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
---
 include/hw/vfio/vfio-common.h |  28 ++++++++
 hw/vfio/common.c              | 159 ++++++++++++++++++++++++++++++++++++++----
 hw/vfio/pci.c                 | 146 ++++++++++++++++++++++++--------------
 3 files changed, 265 insertions(+), 68 deletions(-)

Comments

Alex Williamson Nov. 19, 2021, 10:42 p.m. UTC | #1
On Mon,  8 Nov 2021 16:46:32 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Used for communication with VFIO driver
> (prep work for vfio-user, which will communicate over a socket)
> 
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> ---
>  include/hw/vfio/vfio-common.h |  28 ++++++++
>  hw/vfio/common.c              | 159 ++++++++++++++++++++++++++++++++++++++----
>  hw/vfio/pci.c                 | 146 ++++++++++++++++++++++++--------------
>  3 files changed, 265 insertions(+), 68 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 9b3c5e5..43fa948 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -124,6 +124,7 @@ typedef struct VFIOHostDMAWindow {
>  } VFIOHostDMAWindow;
>  
>  typedef struct VFIODeviceOps VFIODeviceOps;
> +typedef struct VFIODevIO VFIODevIO;
>  
>  typedef struct VFIODevice {
>      QLIST_ENTRY(VFIODevice) next;
> @@ -139,12 +140,14 @@ typedef struct VFIODevice {
>      bool ram_block_discard_allowed;
>      bool enable_migration;
>      VFIODeviceOps *ops;
> +    VFIODevIO *io_ops;
>      unsigned int num_irqs;
>      unsigned int num_regions;
>      unsigned int flags;
>      VFIOMigration *migration;
>      Error *migration_blocker;
>      OnOffAuto pre_copy_dirty_page_tracking;
> +    struct vfio_region_info **regions;
>  } VFIODevice;
>  
>  struct VFIODeviceOps {
> @@ -164,6 +167,30 @@ struct VFIODeviceOps {
>   * through ioctl() to the kernel VFIO driver, but vfio-user
>   * can use a socket to a remote process.
>   */
> +struct VFIODevIO {
> +    int (*get_info)(VFIODevice *vdev, struct vfio_device_info *info);
> +    int (*get_region_info)(VFIODevice *vdev,
> +                           struct vfio_region_info *info, int *fd);
> +    int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
> +    int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
> +    int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> +                       void *data);
> +    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> +                        void *data, bool post);

The @post arg is not really developed in this patch, it would make
review easier to add the arg when it becomes implemented and used.

> +};
> +
> +#define VDEV_GET_INFO(vdev, info) \
> +    ((vdev)->io_ops->get_info((vdev), (info)))
> +#define VDEV_GET_REGION_INFO(vdev, info, fd) \
> +    ((vdev)->io_ops->get_region_info((vdev), (info), (fd)))
> +#define VDEV_GET_IRQ_INFO(vdev, irq) \
> +    ((vdev)->io_ops->get_irq_info((vdev), (irq)))
> +#define VDEV_SET_IRQS(vdev, irqs) \
> +    ((vdev)->io_ops->set_irqs((vdev), (irqs)))
> +#define VDEV_REGION_READ(vdev, nr, off, size, data) \
> +    ((vdev)->io_ops->region_read((vdev), (nr), (off), (size), (data)))
> +#define VDEV_REGION_WRITE(vdev, nr, off, size, data, post) \
> +    ((vdev)->io_ops->region_write((vdev), (nr), (off), (size), (data), (post)))
>  
>  struct VFIOContIO {
>      int (*dma_map)(VFIOContainer *container,
> @@ -184,6 +211,7 @@ struct VFIOContIO {
>  #define CONT_DIRTY_BITMAP(cont, bitmap, range) \
>      ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range)))
>  
> +extern VFIODevIO vfio_dev_io_ioctl;
>  extern VFIOContIO vfio_cont_io_ioctl;
>  
>  #endif /* CONFIG_LINUX */
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 50748be..41fdd78 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -70,7 +70,7 @@ void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
>          .count = 0,
>      };
>  
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    VDEV_SET_IRQS(vbasedev, &irq_set);
>  }
>  
>  void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
> @@ -83,7 +83,7 @@ void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
>          .count = 1,
>      };
>  
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    VDEV_SET_IRQS(vbasedev, &irq_set);
>  }
>  
>  void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
> @@ -96,7 +96,7 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
>          .count = 1,
>      };
>  
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    VDEV_SET_IRQS(vbasedev, &irq_set);
>  }
>  
>  static inline const char *action_to_str(int action)
> @@ -177,9 +177,7 @@ int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
>      pfd = (int32_t *)&irq_set->data;
>      *pfd = fd;
>  
> -    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
> -        ret = -errno;
> -    }
> +    ret = VDEV_SET_IRQS(vbasedev, irq_set);
>      g_free(irq_set);
>  
>      if (!ret) {
> @@ -214,6 +212,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          uint32_t dword;
>          uint64_t qword;
>      } buf;
> +    int ret;
>  
>      switch (size) {
>      case 1:
> @@ -233,13 +232,15 @@ void vfio_region_write(void *opaque, hwaddr addr,
>          break;
>      }
>  
> -    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    ret = VDEV_REGION_WRITE(vbasedev, region->nr, addr, size, &buf, false);
> +    if (ret != size) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
>          error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> -                     ",%d) failed: %m",
> +                     ",%d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, data, size);
> +                     addr, data, size, err);
>      }
> -
>      trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
>  
>      /*
> @@ -265,13 +266,18 @@ uint64_t vfio_region_read(void *opaque,
>          uint64_t qword;
>      } buf;
>      uint64_t data = 0;
> +    int ret;
>  
> -    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> +    ret = VDEV_REGION_READ(vbasedev, region->nr, addr, size, &buf);
> +    if (ret != size) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>                       __func__, vbasedev->name, region->nr,
> -                     addr, size);
> +                     addr, size, err);
>          return (uint64_t)-1;
>      }
> +
>      switch (size) {
>      case 1:
>          data = buf.byte;
> @@ -2369,6 +2375,16 @@ int vfio_get_device(VFIOGroup *group, const char *name,
>  
>  void vfio_put_base_device(VFIODevice *vbasedev)
>  {
> +    if (vbasedev->regions != NULL) {
> +        int i;
> +
> +        for (i = 0; i < vbasedev->num_regions; i++) {
> +            g_free(vbasedev->regions[i]);
> +        }
> +        g_free(vbasedev->regions);
> +        vbasedev->regions = NULL;
> +    }
> +
>      if (!vbasedev->group) {
>          return;
>      }
> @@ -2382,6 +2398,21 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
>                           struct vfio_region_info **info)
>  {
>      size_t argsz = sizeof(struct vfio_region_info);
> +    int fd = -1;
> +    int ret;
> +
> +    /* create region cache */
> +    if (vbasedev->regions == NULL) {
> +        vbasedev->regions = g_new0(struct vfio_region_info *,
> +                                   vbasedev->num_regions);
> +    }

Seems like this should have been part of vfio_get_device() since that's
where we set num_regions.

> +    /* check cache */
> +    if (vbasedev->regions[index] != NULL) {
> +        *info = g_malloc0(vbasedev->regions[index]->argsz);
> +        memcpy(*info, vbasedev->regions[index],
> +               vbasedev->regions[index]->argsz);
> +        return 0;
> +    }

Why are we calling vfio_get_region_info() enough that we need a cache?
This looks like an optimization for something that doesn't exist yet.
And if we have a cache, why aren't callers using it directly rather
than creating a copy for each caller?

>  
>      *info = g_malloc0(argsz);
>  
> @@ -2389,19 +2420,28 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
>  retry:
>      (*info)->argsz = argsz;
>  
> -    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
> +    ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd);
> +    if (ret != 0) {
>          g_free(*info);
>          *info = NULL;
> -        return -errno;
> +        return ret;
>      }
>  
>      if ((*info)->argsz > argsz) {
>          argsz = (*info)->argsz;
>          *info = g_realloc(*info, argsz);
> +        if (fd != -1) {
> +            close(fd);
> +            fd = -1;
> +        }

Use of fd here is hard to review, it's clearly for some future use case.

>  
>          goto retry;
>      }
>  
> +    /* fill cache */
> +    vbasedev->regions[index] = g_malloc0(argsz);
> +    memcpy(vbasedev->regions[index], *info, argsz);
> +
>      return 0;
>  }
>  
> @@ -2554,6 +2594,95 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
>   * Traditional ioctl() based io_ops
>   */
>  
> +static int vfio_io_get_info(VFIODevice *vbasedev, struct vfio_device_info *info)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, info);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_io_get_region_info(VFIODevice *vbasedev,
> +                                   struct vfio_region_info *info,
> +                                   int *fd)
> +{
> +    int ret;
> +
> +    *fd = -1;
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_io_get_irq_info(VFIODevice *vbasedev,
> +                                struct vfio_irq_info *info)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_io_set_irqs(VFIODevice *vbasedev, struct vfio_irq_set *irqs)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
> +                               uint32_t size, void *data)
> +{
> +    struct vfio_region_info *info = vbasedev->regions[index];
> +    int ret;
> +
> +    ret = pread(vbasedev->fd, data, size, info->offset + off);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}
> +
> +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
> +                                uint32_t size, void *data, bool post)
> +{
> +    struct vfio_region_info *info = vbasedev->regions[index];
> +    int ret;
> +
> +    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
> +    if (ret < 0) {
> +        ret = -errno;
> +    }
> +
> +    return ret;
> +}

Simplify all these with ternaries?

    return ret < 0 ? -errno : ret;

> +
> +VFIODevIO vfio_dev_io_ioctl = {
> +    .get_info = vfio_io_get_info,
> +    .get_region_info = vfio_io_get_region_info,
> +    .get_irq_info = vfio_io_get_irq_info,
> +    .set_irqs = vfio_io_set_irqs,
> +    .region_read = vfio_io_region_read,
> +    .region_write = vfio_io_region_write,
> +};
> +
>  static int vfio_io_dma_map(VFIOContainer *container,
>                             struct vfio_iommu_type1_dma_map *map,
>                             int fd, bool will_commit)
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 122edf8..28f21f8 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -402,7 +402,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>          fds[i] = fd;
>      }
>  
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    ret = VDEV_SET_IRQS(&vdev->vbasedev, irq_set);
>  
>      g_free(irq_set);
>  
> @@ -772,14 +772,16 @@ static void vfio_update_msi(VFIOPCIDevice *vdev)
>  
>  static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>  {
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      struct vfio_region_info *reg_info;
>      uint64_t size;
>      off_t off = 0;
>      ssize_t bytes;
> +    int ret;
>  
> -    if (vfio_get_region_info(&vdev->vbasedev,
> -                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
> -        error_report("vfio: Error getting ROM info: %m");
> +    ret = vfio_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, &reg_info);
> +    if (ret < 0) {
> +        error_report("vfio: Error getting ROM info: %s", strerror(-ret));
>          return;
>      }
>  
> @@ -806,18 +808,19 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>      memset(vdev->rom, 0xff, size);
>  
>      while (size) {
> -        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
> -                      size, vdev->rom_offset + off);
> +        bytes = VDEV_REGION_READ(vbasedev, VFIO_PCI_ROM_REGION_INDEX, off,
> +                                 size, vdev->rom + off);
>          if (bytes == 0) {
>              break;
>          } else if (bytes > 0) {
>              off += bytes;
>              size -= bytes;
>          } else {
> -            if (errno == EINTR || errno == EAGAIN) {
> +            if (bytes == -EINTR || bytes == -EAGAIN) {
>                  continue;
>              }
> -            error_report("vfio: Error reading device ROM: %m");
> +            error_report("vfio: Error reading device ROM: %s",
> +                         strerror(-bytes));
>              break;
>          }
>      }
> @@ -905,11 +908,10 @@ static const MemoryRegionOps vfio_rom_ops = {
>  
>  static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
>  {
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
> -    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
>      DeviceState *dev = DEVICE(vdev);
>      char *name;
> -    int fd = vdev->vbasedev.fd;
>  
>      if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
>          /* Since pci handles romfile, just print a message and return */
> @@ -926,13 +928,23 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
>       * Use the same size ROM BAR as the physical device.  The contents
>       * will get filled in later when the guest tries to read it.
>       */
> -    if (pread(fd, &orig, 4, offset) != 4 ||
> -        pwrite(fd, &size, 4, offset) != 4 ||
> -        pread(fd, &size, 4, offset) != 4 ||
> -        pwrite(fd, &orig, 4, offset) != 4) {
> -        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
> +#define rom_read(var) VDEV_REGION_READ(vbasedev,  \
> +                                       VFIO_PCI_CONFIG_REGION_INDEX,  \
> +                                       PCI_ROM_ADDRESS, 4, (var))
> +#define rom_write(var) VDEV_REGION_WRITE(vbasedev,  \
> +                                         VFIO_PCI_CONFIG_REGION_INDEX,  \
> +                                         PCI_ROM_ADDRESS, 4, (var), false)
> +
> +    if (rom_read(&orig) != 4 ||
> +        rom_write(&size) != 4 ||
> +        rom_read(&size) != 4 ||
> +        rom_write(&orig) != 4) {
> +
> +        error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
>          return;
>      }
> +#undef rom_read
> +#undef rom_write

Hmm, I think I'd rather see a generic function broken out for this,
maybe:

int vfio_pci_size_bar32(VFIODevice *vbasedev, int offset,
                        uint32_t mask, uint32_t *size)
{
    uint32_t orig, val;
    int index = VFIO_PCI_CONFIG_REGION_INDEX;

    if (VDEV_REGION_READ(vbasedev, index, offset, 4, &orig) != 4 ||
        VDEV_REGION_WRITE(vbasedev, index, offset, 4, &mask, false) != 4 ||
        VDEV_REGION_READ(vbasedev, index, offset, 4, &val) != 4 ||
        VDEV_REGION_WRITE(vbasedev, index, offset, 4, &orig, false) != 4) {

        error_report(...

        return -1;
    }

    *size = ~(le32_to_cpu(val) & PCI_ROM_ADDRESS_MASK) + 1;

    return 0;
}

Really I think it's just trying to hard code
VFIO_PCI_CONFIG_REGION_INDEX and PCI_ROM_ADDRESS into the macro calls
that demand some simplification, if we use variables it looks fine
inline.

We could also re-wrap VDEV_REGION_READ/WRITE in vfio/pci.h to simplify
config space access, presumably we'd never have posted writes to config
space.  Thanks,

Alex

>  
>      size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
>  
> @@ -1110,6 +1122,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
>  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
>  
>      memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
> @@ -1122,12 +1135,14 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
>      if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
>          ssize_t ret;
>  
> -        ret = pread(vdev->vbasedev.fd, &phys_val, len,
> -                    vdev->config_offset + addr);
> +        ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
> +                               len, &phys_val);
>          if (ret != len) {
> -            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
> -                         __func__, vdev->vbasedev.name, addr, len);
> -            return -errno;
> +            const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +            error_report("%s(%s, 0x%x, 0x%x) failed: %s",
> +                         __func__, vbasedev->name, addr, len, err);
> +            return -1;
>          }
>          phys_val = le32_to_cpu(phys_val);
>      }
> @@ -1143,15 +1158,20 @@ void vfio_pci_write_config(PCIDevice *pdev,
>                             uint32_t addr, uint32_t val, int len)
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      uint32_t val_le = cpu_to_le32(val);
> +    int ret;
>  
>      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
>  
>      /* Write everything to VFIO, let it filter out what we can't write */
> -    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
> -                != len) {
> -        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
> -                     __func__, vdev->vbasedev.name, addr, val, len);
> +    ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
> +                            len, &val_le, false);
> +    if (ret != len) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
> +        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
> +                     __func__, vbasedev->name, addr, val, len, err);
>      }
>  
>      /* MSI/MSI-X Enabling/Disabling */
> @@ -1239,10 +1259,13 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>      int ret, entries;
>      Error *err = NULL;
>  
> -    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
> -              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> -        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
> -        return -errno;
> +    ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
> +                           pos + PCI_CAP_FLAGS, sizeof(ctrl), &ctrl);
> +    if (ret != sizeof(ctrl)) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_setg(errp, "failed reading MSI PCI_CAP_FLAGS %s", err);
> +        return ret;
>      }
>      ctrl = le16_to_cpu(ctrl);
>  
> @@ -1444,33 +1467,40 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
>   */
>  static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>  {
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      uint8_t pos;
>      uint16_t ctrl;
>      uint32_t table, pba;
> -    int fd = vdev->vbasedev.fd;
>      VFIOMSIXInfo *msix;
> +    int ret;
>  
>      pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
>      if (!pos) {
>          return;
>      }
>  
> -    if (pread(fd, &ctrl, sizeof(ctrl),
> -              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
> -        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
> -        return;
> +    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
> +                           pos + PCI_MSIX_FLAGS, sizeof(ctrl), &ctrl);
> +    if (ret != sizeof(ctrl)) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_setg(errp, "failed to read PCI MSIX FLAGS %s", err);
>      }
>  
> -    if (pread(fd, &table, sizeof(table),
> -              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> -        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
> -        return;
> +    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
> +                           pos + PCI_MSIX_TABLE, sizeof(table), &table);
> +    if (ret != sizeof(table)) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_setg(errp, "failed to read PCI MSIX TABLE %s", err);
>      }
>  
> -    if (pread(fd, &pba, sizeof(pba),
> -              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> -        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
> -        return;
> +    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
> +                           pos + PCI_MSIX_PBA, sizeof(pba), &pba);
> +    if (ret != sizeof(pba)) {
> +        const char *err = ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_setg(errp, "failed to read PCI MSIX PBA %s", err);
>      }
>  
>      ctrl = le16_to_cpu(ctrl);
> @@ -1608,7 +1638,6 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
>  static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOBAR *bar = &vdev->bars[nr];
> -
>      uint32_t pci_bar;
>      int ret;
>  
> @@ -1618,10 +1647,13 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
>      }
>  
>      /* Determine what type of BAR this is for registration */
> -    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
> -                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> +    ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
> +                           PCI_BASE_ADDRESS_0 + (4 * nr),
> +                           sizeof(pci_bar), &pci_bar);
>      if (ret != sizeof(pci_bar)) {
> -        error_report("vfio: Failed to read BAR %d (%m)", nr);
> +        const char *err =  ret < 0 ? strerror(-ret) : "short read";
> +
> +        error_report("vfio: Failed to read BAR %d (%s)", nr, err);
>          return;
>      }
>  
> @@ -2169,8 +2201,9 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
>  
>  static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>  {
> +    VFIODevice *vbasedev = &vdev->vbasedev;
>      Error *err = NULL;
> -    int nr;
> +    int ret, nr;
>  
>      vfio_intx_enable(vdev, &err);
>      if (err) {
> @@ -2178,13 +2211,17 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>      }
>  
>      for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
> -        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
> +        off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
>          uint32_t val = 0;
>          uint32_t len = sizeof(val);
>  
> -        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
> -            error_report("%s(%s) reset bar %d failed: %m", __func__,
> -                         vdev->vbasedev.name, nr);
> +        ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
> +                                len, &val, false);
> +        if (ret != len) {
> +            const char *err = ret < 0 ? strerror(-ret) : "short write";
> +
> +            error_report("%s(%s) reset bar %d failed: %s", __func__,
> +                         vbasedev->name, nr, err);
>          }
>      }
>  
> @@ -2619,7 +2656,7 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
>  
>      irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
>  
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
> +    ret = VDEV_GET_IRQ_INFO(vbasedev, &irq_info);
>      if (ret) {
>          /* This can fail for an old kernel or legacy PCI dev */
>          trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
> @@ -2738,8 +2775,10 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
>          return;
>      }
>  
> -    if (ioctl(vdev->vbasedev.fd,
> -              VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
> +    if (VDEV_GET_IRQ_INFO(&vdev->vbasedev, &irq_info) < 0) {
> +        return;
> +    }
> +    if (irq_info.count < 1) {
>          return;
>      }
>  
> @@ -2817,6 +2856,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vdev->vbasedev.ops = &vfio_pci_ops;
>      vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
>      vdev->vbasedev.dev = DEVICE(vdev);
> +    vdev->vbasedev.io_ops = &vfio_dev_io_ioctl;
>  
>      tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
>      len = readlink(tmp, group_path, sizeof(group_path));
John Johnson Dec. 7, 2021, 7:48 a.m. UTC | #2
> On Nov 19, 2021, at 2:42 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Mon,  8 Nov 2021 16:46:32 -0800
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>> 
>> +    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
>> +                        void *data, bool post);
> 
> The @post arg is not really developed in this patch, it would make
> review easier to add the arg when it becomes implemented and used.
> 

	ok


>> 
>> @@ -2382,6 +2398,21 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
>>                          struct vfio_region_info **info)
>> {
>>     size_t argsz = sizeof(struct vfio_region_info);
>> +    int fd = -1;
>> +    int ret;
>> +
>> +    /* create region cache */
>> +    if (vbasedev->regions == NULL) {
>> +        vbasedev->regions = g_new0(struct vfio_region_info *,
>> +                                   vbasedev->num_regions);
>> +    }
> 
> Seems like this should have been part of vfio_get_device() since that's
> where we set num_regions.
> 
>> +    /* check cache */
>> +    if (vbasedev->regions[index] != NULL) {
>> +        *info = g_malloc0(vbasedev->regions[index]->argsz);
>> +        memcpy(*info, vbasedev->regions[index],
>> +               vbasedev->regions[index]->argsz);
>> +        return 0;
>> +    }
> 
> Why are we calling vfio_get_region_info() enough that we need a cache?
> This looks like an optimization for something that doesn't exist yet.
> And if we have a cache, why aren't callers using it directly rather
> than creating a copy for each caller?
> 

	The are a couple types of vfio_get_region_info() callers.
One type wants a specific region, which it then uses to fill in
it’s own data structures (e.g., vfio_region_setup()) and the other
type uses vfio_get_dev_region_info() to loop through all regions
looking for a specific one (e.g., for the migration region or for
a specifc device to setup its quirks)  On top of that, each
vfio_get_region_info() will need to call the server twice for regions
with capabilities: once to get the proper ‘argsz’ and once to actually
get the info.

	I thought just having a cache that gets filled once, then
satisfies all later callers was a cleaner idea.  It also solved the
issue I had that the FD returned isn't used until the container is
setup, which would’ve needed another vfio_get_region_info() call.

	I can certainly move the cache fill to vfio_get_device()
and remove the copies if you like that idea better.



>> 
>>     *info = g_malloc0(argsz);
>> 
>> @@ -2389,19 +2420,28 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
>> retry:
>>     (*info)->argsz = argsz;
>> 
>> -    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
>> +    ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd);
>> +    if (ret != 0) {
>>         g_free(*info);
>>         *info = NULL;
>> -        return -errno;
>> +        return ret;
>>     }
>> 
>>     if ((*info)->argsz > argsz) {
>>         argsz = (*info)->argsz;
>>         *info = g_realloc(*info, argsz);
>> +        if (fd != -1) {
>> +            close(fd);
>> +            fd = -1;
>> +        }
> 
> Use of fd here is hard to review, it's clearly for some future use case.
> 

	The FD code can be moved to a patch where it’s used.


>> 
>> +
>> +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
>> +                                uint32_t size, void *data, bool post)
>> +{
>> +    struct vfio_region_info *info = vbasedev->regions[index];
>> +    int ret;
>> +
>> +    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
>> +    if (ret < 0) {
>> +        ret = -errno;
>> +    }
>> +
>> +    return ret;
>> +}
> 
> Simplify all these with ternaries?
> 
>    return ret < 0 ? -errno : ret;
> 

	ok



>> 
>> static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
>> {
>> +    VFIODevice *vbasedev = &vdev->vbasedev;
>>     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
>> -    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
>>     DeviceState *dev = DEVICE(vdev);
>>     char *name;
>> -    int fd = vdev->vbasedev.fd;
>> 
>>     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
>>         /* Since pci handles romfile, just print a message and return */
>> @@ -926,13 +928,23 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
>>      * Use the same size ROM BAR as the physical device.  The contents
>>      * will get filled in later when the guest tries to read it.
>>      */
>> -    if (pread(fd, &orig, 4, offset) != 4 ||
>> -        pwrite(fd, &size, 4, offset) != 4 ||
>> -        pread(fd, &size, 4, offset) != 4 ||
>> -        pwrite(fd, &orig, 4, offset) != 4) {
>> -        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
>> +#define rom_read(var) VDEV_REGION_READ(vbasedev,  \
>> +                                       VFIO_PCI_CONFIG_REGION_INDEX,  \
>> +                                       PCI_ROM_ADDRESS, 4, (var))
>> +#define rom_write(var) VDEV_REGION_WRITE(vbasedev,  \
>> +                                         VFIO_PCI_CONFIG_REGION_INDEX,  \
>> +                                         PCI_ROM_ADDRESS, 4, (var), false)
>> +
>> +    if (rom_read(&orig) != 4 ||
>> +        rom_write(&size) != 4 ||
>> +        rom_read(&size) != 4 ||
>> +        rom_write(&orig) != 4) {
>> +
>> +        error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
>>         return;
>>     }
>> +#undef rom_read
>> +#undef rom_write
> 
> Hmm, I think I'd rather see a generic function broken out for this,
> maybe:
> 
> int vfio_pci_size_bar32(VFIODevice *vbasedev, int offset,
>                        uint32_t mask, uint32_t *size)
> {
>    uint32_t orig, val;
>    int index = VFIO_PCI_CONFIG_REGION_INDEX;
> 
>    if (VDEV_REGION_READ(vbasedev, index, offset, 4, &orig) != 4 ||
>        VDEV_REGION_WRITE(vbasedev, index, offset, 4, &mask, false) != 4 ||
>        VDEV_REGION_READ(vbasedev, index, offset, 4, &val) != 4 ||
>        VDEV_REGION_WRITE(vbasedev, index, offset, 4, &orig, false) != 4) {
> 
>        error_report(...
> 
>        return -1;
>    }
> 
>    *size = ~(le32_to_cpu(val) & PCI_ROM_ADDRESS_MASK) + 1;
> 
>    return 0;
> }
> 

	ok


> Really I think it's just trying to hard code
> VFIO_PCI_CONFIG_REGION_INDEX and PCI_ROM_ADDRESS into the macro calls
> that demand some simplification, if we use variables it looks fine
> inline.
> 
> We could also re-wrap VDEV_REGION_READ/WRITE in vfio/pci.h to simplify
> config space access, presumably we'd never have posted writes to config
> space.  Thanks,
> 


	There are multiple places where config space is read or written
in pci.c, so adding an inline or #define for it is a good idea.

								JJ
diff mbox series

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b3c5e5..43fa948 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -124,6 +124,7 @@  typedef struct VFIOHostDMAWindow {
 } VFIOHostDMAWindow;
 
 typedef struct VFIODeviceOps VFIODeviceOps;
+typedef struct VFIODevIO VFIODevIO;
 
 typedef struct VFIODevice {
     QLIST_ENTRY(VFIODevice) next;
@@ -139,12 +140,14 @@  typedef struct VFIODevice {
     bool ram_block_discard_allowed;
     bool enable_migration;
     VFIODeviceOps *ops;
+    VFIODevIO *io_ops;
     unsigned int num_irqs;
     unsigned int num_regions;
     unsigned int flags;
     VFIOMigration *migration;
     Error *migration_blocker;
     OnOffAuto pre_copy_dirty_page_tracking;
+    struct vfio_region_info **regions;
 } VFIODevice;
 
 struct VFIODeviceOps {
@@ -164,6 +167,30 @@  struct VFIODeviceOps {
  * through ioctl() to the kernel VFIO driver, but vfio-user
  * can use a socket to a remote process.
  */
+struct VFIODevIO {
+    int (*get_info)(VFIODevice *vdev, struct vfio_device_info *info);
+    int (*get_region_info)(VFIODevice *vdev,
+                           struct vfio_region_info *info, int *fd);
+    int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
+    int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
+    int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+                       void *data);
+    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+                        void *data, bool post);
+};
+
+#define VDEV_GET_INFO(vdev, info) \
+    ((vdev)->io_ops->get_info((vdev), (info)))
+#define VDEV_GET_REGION_INFO(vdev, info, fd) \
+    ((vdev)->io_ops->get_region_info((vdev), (info), (fd)))
+#define VDEV_GET_IRQ_INFO(vdev, irq) \
+    ((vdev)->io_ops->get_irq_info((vdev), (irq)))
+#define VDEV_SET_IRQS(vdev, irqs) \
+    ((vdev)->io_ops->set_irqs((vdev), (irqs)))
+#define VDEV_REGION_READ(vdev, nr, off, size, data) \
+    ((vdev)->io_ops->region_read((vdev), (nr), (off), (size), (data)))
+#define VDEV_REGION_WRITE(vdev, nr, off, size, data, post) \
+    ((vdev)->io_ops->region_write((vdev), (nr), (off), (size), (data), (post)))
 
 struct VFIOContIO {
     int (*dma_map)(VFIOContainer *container,
@@ -184,6 +211,7 @@  struct VFIOContIO {
 #define CONT_DIRTY_BITMAP(cont, bitmap, range) \
     ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range)))
 
+extern VFIODevIO vfio_dev_io_ioctl;
 extern VFIOContIO vfio_cont_io_ioctl;
 
 #endif /* CONFIG_LINUX */
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 50748be..41fdd78 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -70,7 +70,7 @@  void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
         .count = 0,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    VDEV_SET_IRQS(vbasedev, &irq_set);
 }
 
 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
@@ -83,7 +83,7 @@  void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
         .count = 1,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    VDEV_SET_IRQS(vbasedev, &irq_set);
 }
 
 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
@@ -96,7 +96,7 @@  void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
         .count = 1,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    VDEV_SET_IRQS(vbasedev, &irq_set);
 }
 
 static inline const char *action_to_str(int action)
@@ -177,9 +177,7 @@  int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
     pfd = (int32_t *)&irq_set->data;
     *pfd = fd;
 
-    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
-        ret = -errno;
-    }
+    ret = VDEV_SET_IRQS(vbasedev, irq_set);
     g_free(irq_set);
 
     if (!ret) {
@@ -214,6 +212,7 @@  void vfio_region_write(void *opaque, hwaddr addr,
         uint32_t dword;
         uint64_t qword;
     } buf;
+    int ret;
 
     switch (size) {
     case 1:
@@ -233,13 +232,15 @@  void vfio_region_write(void *opaque, hwaddr addr,
         break;
     }
 
-    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    ret = VDEV_REGION_WRITE(vbasedev, region->nr, addr, size, &buf, false);
+    if (ret != size) {
+        const char *err = ret < 0 ? strerror(-ret) : "short write";
+
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
-                     ",%d) failed: %m",
+                     ",%d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, data, size);
+                     addr, data, size, err);
     }
-
     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
 
     /*
@@ -265,13 +266,18 @@  uint64_t vfio_region_read(void *opaque,
         uint64_t qword;
     } buf;
     uint64_t data = 0;
+    int ret;
 
-    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
-        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+    ret = VDEV_REGION_READ(vbasedev, region->nr, addr, size, &buf);
+    if (ret != size) {
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, size);
+                     addr, size, err);
         return (uint64_t)-1;
     }
+
     switch (size) {
     case 1:
         data = buf.byte;
@@ -2369,6 +2375,16 @@  int vfio_get_device(VFIOGroup *group, const char *name,
 
 void vfio_put_base_device(VFIODevice *vbasedev)
 {
+    if (vbasedev->regions != NULL) {
+        int i;
+
+        for (i = 0; i < vbasedev->num_regions; i++) {
+            g_free(vbasedev->regions[i]);
+        }
+        g_free(vbasedev->regions);
+        vbasedev->regions = NULL;
+    }
+
     if (!vbasedev->group) {
         return;
     }
@@ -2382,6 +2398,21 @@  int vfio_get_region_info(VFIODevice *vbasedev, int index,
                          struct vfio_region_info **info)
 {
     size_t argsz = sizeof(struct vfio_region_info);
+    int fd = -1;
+    int ret;
+
+    /* create region cache */
+    if (vbasedev->regions == NULL) {
+        vbasedev->regions = g_new0(struct vfio_region_info *,
+                                   vbasedev->num_regions);
+    }
+    /* check cache */
+    if (vbasedev->regions[index] != NULL) {
+        *info = g_malloc0(vbasedev->regions[index]->argsz);
+        memcpy(*info, vbasedev->regions[index],
+               vbasedev->regions[index]->argsz);
+        return 0;
+    }
 
     *info = g_malloc0(argsz);
 
@@ -2389,19 +2420,28 @@  int vfio_get_region_info(VFIODevice *vbasedev, int index,
 retry:
     (*info)->argsz = argsz;
 
-    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
+    ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd);
+    if (ret != 0) {
         g_free(*info);
         *info = NULL;
-        return -errno;
+        return ret;
     }
 
     if ((*info)->argsz > argsz) {
         argsz = (*info)->argsz;
         *info = g_realloc(*info, argsz);
+        if (fd != -1) {
+            close(fd);
+            fd = -1;
+        }
 
         goto retry;
     }
 
+    /* fill cache */
+    vbasedev->regions[index] = g_malloc0(argsz);
+    memcpy(vbasedev->regions[index], *info, argsz);
+
     return 0;
 }
 
@@ -2554,6 +2594,95 @@  int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
  * Traditional ioctl() based io_ops
  */
 
+static int vfio_io_get_info(VFIODevice *vbasedev, struct vfio_device_info *info)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, info);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+static int vfio_io_get_region_info(VFIODevice *vbasedev,
+                                   struct vfio_region_info *info,
+                                   int *fd)
+{
+    int ret;
+
+    *fd = -1;
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+static int vfio_io_get_irq_info(VFIODevice *vbasedev,
+                                struct vfio_irq_info *info)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+static int vfio_io_set_irqs(VFIODevice *vbasedev, struct vfio_irq_set *irqs)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off,
+                               uint32_t size, void *data)
+{
+    struct vfio_region_info *info = vbasedev->regions[index];
+    int ret;
+
+    ret = pread(vbasedev->fd, data, size, info->offset + off);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off,
+                                uint32_t size, void *data, bool post)
+{
+    struct vfio_region_info *info = vbasedev->regions[index];
+    int ret;
+
+    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
+    if (ret < 0) {
+        ret = -errno;
+    }
+
+    return ret;
+}
+
+VFIODevIO vfio_dev_io_ioctl = {
+    .get_info = vfio_io_get_info,
+    .get_region_info = vfio_io_get_region_info,
+    .get_irq_info = vfio_io_get_irq_info,
+    .set_irqs = vfio_io_set_irqs,
+    .region_read = vfio_io_region_read,
+    .region_write = vfio_io_region_write,
+};
+
 static int vfio_io_dma_map(VFIOContainer *container,
                            struct vfio_iommu_type1_dma_map *map,
                            int fd, bool will_commit)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 122edf8..28f21f8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -402,7 +402,7 @@  static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
         fds[i] = fd;
     }
 
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    ret = VDEV_SET_IRQS(&vdev->vbasedev, irq_set);
 
     g_free(irq_set);
 
@@ -772,14 +772,16 @@  static void vfio_update_msi(VFIOPCIDevice *vdev)
 
 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 {
+    VFIODevice *vbasedev = &vdev->vbasedev;
     struct vfio_region_info *reg_info;
     uint64_t size;
     off_t off = 0;
     ssize_t bytes;
+    int ret;
 
-    if (vfio_get_region_info(&vdev->vbasedev,
-                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
-        error_report("vfio: Error getting ROM info: %m");
+    ret = vfio_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, &reg_info);
+    if (ret < 0) {
+        error_report("vfio: Error getting ROM info: %s", strerror(-ret));
         return;
     }
 
@@ -806,18 +808,19 @@  static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
     memset(vdev->rom, 0xff, size);
 
     while (size) {
-        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
-                      size, vdev->rom_offset + off);
+        bytes = VDEV_REGION_READ(vbasedev, VFIO_PCI_ROM_REGION_INDEX, off,
+                                 size, vdev->rom + off);
         if (bytes == 0) {
             break;
         } else if (bytes > 0) {
             off += bytes;
             size -= bytes;
         } else {
-            if (errno == EINTR || errno == EAGAIN) {
+            if (bytes == -EINTR || bytes == -EAGAIN) {
                 continue;
             }
-            error_report("vfio: Error reading device ROM: %m");
+            error_report("vfio: Error reading device ROM: %s",
+                         strerror(-bytes));
             break;
         }
     }
@@ -905,11 +908,10 @@  static const MemoryRegionOps vfio_rom_ops = {
 
 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
 {
+    VFIODevice *vbasedev = &vdev->vbasedev;
     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
-    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
     DeviceState *dev = DEVICE(vdev);
     char *name;
-    int fd = vdev->vbasedev.fd;
 
     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
         /* Since pci handles romfile, just print a message and return */
@@ -926,13 +928,23 @@  static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
      * Use the same size ROM BAR as the physical device.  The contents
      * will get filled in later when the guest tries to read it.
      */
-    if (pread(fd, &orig, 4, offset) != 4 ||
-        pwrite(fd, &size, 4, offset) != 4 ||
-        pread(fd, &size, 4, offset) != 4 ||
-        pwrite(fd, &orig, 4, offset) != 4) {
-        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
+#define rom_read(var) VDEV_REGION_READ(vbasedev,  \
+                                       VFIO_PCI_CONFIG_REGION_INDEX,  \
+                                       PCI_ROM_ADDRESS, 4, (var))
+#define rom_write(var) VDEV_REGION_WRITE(vbasedev,  \
+                                         VFIO_PCI_CONFIG_REGION_INDEX,  \
+                                         PCI_ROM_ADDRESS, 4, (var), false)
+
+    if (rom_read(&orig) != 4 ||
+        rom_write(&size) != 4 ||
+        rom_read(&size) != 4 ||
+        rom_write(&orig) != 4) {
+
+        error_report("%s(%s) ROM access failed", __func__, vbasedev->name);
         return;
     }
+#undef rom_read
+#undef rom_write
 
     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
 
@@ -1110,6 +1122,7 @@  static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
 {
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
 
     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
@@ -1122,12 +1135,14 @@  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
         ssize_t ret;
 
-        ret = pread(vdev->vbasedev.fd, &phys_val, len,
-                    vdev->config_offset + addr);
+        ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
+                               len, &phys_val);
         if (ret != len) {
-            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
-                         __func__, vdev->vbasedev.name, addr, len);
-            return -errno;
+            const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+            error_report("%s(%s, 0x%x, 0x%x) failed: %s",
+                         __func__, vbasedev->name, addr, len, err);
+            return -1;
         }
         phys_val = le32_to_cpu(phys_val);
     }
@@ -1143,15 +1158,20 @@  void vfio_pci_write_config(PCIDevice *pdev,
                            uint32_t addr, uint32_t val, int len)
 {
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
     uint32_t val_le = cpu_to_le32(val);
+    int ret;
 
     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
 
     /* Write everything to VFIO, let it filter out what we can't write */
-    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
-                != len) {
-        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
-                     __func__, vdev->vbasedev.name, addr, val, len);
+    ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
+                            len, &val_le, false);
+    if (ret != len) {
+        const char *err = ret < 0 ? strerror(-ret) : "short write";
+
+        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s",
+                     __func__, vbasedev->name, addr, val, len, err);
     }
 
     /* MSI/MSI-X Enabling/Disabling */
@@ -1239,10 +1259,13 @@  static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
     int ret, entries;
     Error *err = NULL;
 
-    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
-              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
-        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
-        return -errno;
+    ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
+                           pos + PCI_CAP_FLAGS, sizeof(ctrl), &ctrl);
+    if (ret != sizeof(ctrl)) {
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_setg(errp, "failed reading MSI PCI_CAP_FLAGS %s", err);
+        return ret;
     }
     ctrl = le16_to_cpu(ctrl);
 
@@ -1444,33 +1467,40 @@  static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
  */
 static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
 {
+    VFIODevice *vbasedev = &vdev->vbasedev;
     uint8_t pos;
     uint16_t ctrl;
     uint32_t table, pba;
-    int fd = vdev->vbasedev.fd;
     VFIOMSIXInfo *msix;
+    int ret;
 
     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
     if (!pos) {
         return;
     }
 
-    if (pread(fd, &ctrl, sizeof(ctrl),
-              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
-        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
-        return;
+    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
+                           pos + PCI_MSIX_FLAGS, sizeof(ctrl), &ctrl);
+    if (ret != sizeof(ctrl)) {
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_setg(errp, "failed to read PCI MSIX FLAGS %s", err);
     }
 
-    if (pread(fd, &table, sizeof(table),
-              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
-        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
-        return;
+    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
+                           pos + PCI_MSIX_TABLE, sizeof(table), &table);
+    if (ret != sizeof(table)) {
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_setg(errp, "failed to read PCI MSIX TABLE %s", err);
     }
 
-    if (pread(fd, &pba, sizeof(pba),
-              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
-        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
-        return;
+    ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
+                           pos + PCI_MSIX_PBA, sizeof(pba), &pba);
+    if (ret != sizeof(pba)) {
+        const char *err = ret < 0 ? strerror(-ret) : "short read";
+
+        error_setg(errp, "failed to read PCI MSIX PBA %s", err);
     }
 
     ctrl = le16_to_cpu(ctrl);
@@ -1608,7 +1638,6 @@  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
-
     uint32_t pci_bar;
     int ret;
 
@@ -1618,10 +1647,13 @@  static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
     }
 
     /* Determine what type of BAR this is for registration */
-    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
-                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
+    ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
+                           PCI_BASE_ADDRESS_0 + (4 * nr),
+                           sizeof(pci_bar), &pci_bar);
     if (ret != sizeof(pci_bar)) {
-        error_report("vfio: Failed to read BAR %d (%m)", nr);
+        const char *err =  ret < 0 ? strerror(-ret) : "short read";
+
+        error_report("vfio: Failed to read BAR %d (%s)", nr, err);
         return;
     }
 
@@ -2169,8 +2201,9 @@  static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
 
 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 {
+    VFIODevice *vbasedev = &vdev->vbasedev;
     Error *err = NULL;
-    int nr;
+    int ret, nr;
 
     vfio_intx_enable(vdev, &err);
     if (err) {
@@ -2178,13 +2211,17 @@  static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     }
 
     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
-        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
+        off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr);
         uint32_t val = 0;
         uint32_t len = sizeof(val);
 
-        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
-            error_report("%s(%s) reset bar %d failed: %m", __func__,
-                         vdev->vbasedev.name, nr);
+        ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr,
+                                len, &val, false);
+        if (ret != len) {
+            const char *err = ret < 0 ? strerror(-ret) : "short write";
+
+            error_report("%s(%s) reset bar %d failed: %s", __func__,
+                         vbasedev->name, nr, err);
         }
     }
 
@@ -2619,7 +2656,7 @@  static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
 
     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
 
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+    ret = VDEV_GET_IRQ_INFO(vbasedev, &irq_info);
     if (ret) {
         /* This can fail for an old kernel or legacy PCI dev */
         trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
@@ -2738,8 +2775,10 @@  static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
         return;
     }
 
-    if (ioctl(vdev->vbasedev.fd,
-              VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
+    if (VDEV_GET_IRQ_INFO(&vdev->vbasedev, &irq_info) < 0) {
+        return;
+    }
+    if (irq_info.count < 1) {
         return;
     }
 
@@ -2817,6 +2856,7 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
     vdev->vbasedev.ops = &vfio_pci_ops;
     vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
     vdev->vbasedev.dev = DEVICE(vdev);
+    vdev->vbasedev.io_ops = &vfio_dev_io_ioctl;
 
     tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
     len = readlink(tmp, group_path, sizeof(group_path));