Message ID | c8f636ad86d8b89f4610e95089ad49816c70a7a6.1636057885.git.john.g.johnson@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio-user client | expand |
On Mon, 8 Nov 2021 16:46:32 -0800 John Johnson <john.g.johnson@oracle.com> wrote: > Used for communication with VFIO driver > (prep work for vfio-user, which will communicate over a socket) > > Signed-off-by: John G Johnson <john.g.johnson@oracle.com> > --- > include/hw/vfio/vfio-common.h | 28 ++++++++ > hw/vfio/common.c | 159 ++++++++++++++++++++++++++++++++++++++---- > hw/vfio/pci.c | 146 ++++++++++++++++++++++++-------------- > 3 files changed, 265 insertions(+), 68 deletions(-) > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 9b3c5e5..43fa948 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -124,6 +124,7 @@ typedef struct VFIOHostDMAWindow { > } VFIOHostDMAWindow; > > typedef struct VFIODeviceOps VFIODeviceOps; > +typedef struct VFIODevIO VFIODevIO; > > typedef struct VFIODevice { > QLIST_ENTRY(VFIODevice) next; > @@ -139,12 +140,14 @@ typedef struct VFIODevice { > bool ram_block_discard_allowed; > bool enable_migration; > VFIODeviceOps *ops; > + VFIODevIO *io_ops; > unsigned int num_irqs; > unsigned int num_regions; > unsigned int flags; > VFIOMigration *migration; > Error *migration_blocker; > OnOffAuto pre_copy_dirty_page_tracking; > + struct vfio_region_info **regions; > } VFIODevice; > > struct VFIODeviceOps { > @@ -164,6 +167,30 @@ struct VFIODeviceOps { > * through ioctl() to the kernel VFIO driver, but vfio-user > * can use a socket to a remote process. 
> */ > +struct VFIODevIO { > + int (*get_info)(VFIODevice *vdev, struct vfio_device_info *info); > + int (*get_region_info)(VFIODevice *vdev, > + struct vfio_region_info *info, int *fd); > + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); > + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); > + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, > + void *data); > + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, > + void *data, bool post); The @post arg is not really developed in this patch, it would make review easier to add the arg when it becomes implemented and used. > +}; > + > +#define VDEV_GET_INFO(vdev, info) \ > + ((vdev)->io_ops->get_info((vdev), (info))) > +#define VDEV_GET_REGION_INFO(vdev, info, fd) \ > + ((vdev)->io_ops->get_region_info((vdev), (info), (fd))) > +#define VDEV_GET_IRQ_INFO(vdev, irq) \ > + ((vdev)->io_ops->get_irq_info((vdev), (irq))) > +#define VDEV_SET_IRQS(vdev, irqs) \ > + ((vdev)->io_ops->set_irqs((vdev), (irqs))) > +#define VDEV_REGION_READ(vdev, nr, off, size, data) \ > + ((vdev)->io_ops->region_read((vdev), (nr), (off), (size), (data))) > +#define VDEV_REGION_WRITE(vdev, nr, off, size, data, post) \ > + ((vdev)->io_ops->region_write((vdev), (nr), (off), (size), (data), (post))) > > struct VFIOContIO { > int (*dma_map)(VFIOContainer *container, > @@ -184,6 +211,7 @@ struct VFIOContIO { > #define CONT_DIRTY_BITMAP(cont, bitmap, range) \ > ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range))) > > +extern VFIODevIO vfio_dev_io_ioctl; > extern VFIOContIO vfio_cont_io_ioctl; > > #endif /* CONFIG_LINUX */ > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 50748be..41fdd78 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -70,7 +70,7 @@ void vfio_disable_irqindex(VFIODevice *vbasedev, int index) > .count = 0, > }; > > - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > + VDEV_SET_IRQS(vbasedev, &irq_set); > } > > void 
vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) > @@ -83,7 +83,7 @@ void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) > .count = 1, > }; > > - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > + VDEV_SET_IRQS(vbasedev, &irq_set); > } > > void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) > @@ -96,7 +96,7 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) > .count = 1, > }; > > - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); > + VDEV_SET_IRQS(vbasedev, &irq_set); > } > > static inline const char *action_to_str(int action) > @@ -177,9 +177,7 @@ int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, > pfd = (int32_t *)&irq_set->data; > *pfd = fd; > > - if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { > - ret = -errno; > - } > + ret = VDEV_SET_IRQS(vbasedev, irq_set); > g_free(irq_set); > > if (!ret) { > @@ -214,6 +212,7 @@ void vfio_region_write(void *opaque, hwaddr addr, > uint32_t dword; > uint64_t qword; > } buf; > + int ret; > > switch (size) { > case 1: > @@ -233,13 +232,15 @@ void vfio_region_write(void *opaque, hwaddr addr, > break; > } > > - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { > + ret = VDEV_REGION_WRITE(vbasedev, region->nr, addr, size, &buf, false); > + if (ret != size) { > + const char *err = ret < 0 ? 
strerror(-ret) : "short write"; > + > error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 > - ",%d) failed: %m", > + ",%d) failed: %s", > __func__, vbasedev->name, region->nr, > - addr, data, size); > + addr, data, size, err); > } > - > trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); > > /* > @@ -265,13 +266,18 @@ uint64_t vfio_region_read(void *opaque, > uint64_t qword; > } buf; > uint64_t data = 0; > + int ret; > > - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { > - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", > + ret = VDEV_REGION_READ(vbasedev, region->nr, addr, size, &buf); > + if (ret != size) { > + const char *err = ret < 0 ? strerror(-ret) : "short read"; > + > + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", > __func__, vbasedev->name, region->nr, > - addr, size); > + addr, size, err); > return (uint64_t)-1; > } > + > switch (size) { > case 1: > data = buf.byte; > @@ -2369,6 +2375,16 @@ int vfio_get_device(VFIOGroup *group, const char *name, > > void vfio_put_base_device(VFIODevice *vbasedev) > { > + if (vbasedev->regions != NULL) { > + int i; > + > + for (i = 0; i < vbasedev->num_regions; i++) { > + g_free(vbasedev->regions[i]); > + } > + g_free(vbasedev->regions); > + vbasedev->regions = NULL; > + } > + > if (!vbasedev->group) { > return; > } > @@ -2382,6 +2398,21 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, > struct vfio_region_info **info) > { > size_t argsz = sizeof(struct vfio_region_info); > + int fd = -1; > + int ret; > + > + /* create region cache */ > + if (vbasedev->regions == NULL) { > + vbasedev->regions = g_new0(struct vfio_region_info *, > + vbasedev->num_regions); > + } Seems like this should have been part of vfio_get_device() since that's where we set num_regions. 
> + /* check cache */ > + if (vbasedev->regions[index] != NULL) { > + *info = g_malloc0(vbasedev->regions[index]->argsz); > + memcpy(*info, vbasedev->regions[index], > + vbasedev->regions[index]->argsz); > + return 0; > + } Why are we calling vfio_get_region_info() enough that we need a cache? This looks like an optimization for something that doesn't exist yet. And if we have a cache, why aren't callers using it directly rather than creating a copy for each caller? > > *info = g_malloc0(argsz); > > @@ -2389,19 +2420,28 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, > retry: > (*info)->argsz = argsz; > > - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { > + ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd); > + if (ret != 0) { > g_free(*info); > *info = NULL; > - return -errno; > + return ret; > } > > if ((*info)->argsz > argsz) { > argsz = (*info)->argsz; > *info = g_realloc(*info, argsz); > + if (fd != -1) { > + close(fd); > + fd = -1; > + } Use of fd here is hard to review, it's clearly for some future use case. 
> > goto retry; > } > > + /* fill cache */ > + vbasedev->regions[index] = g_malloc0(argsz); > + memcpy(vbasedev->regions[index], *info, argsz); > + > return 0; > } > > @@ -2554,6 +2594,95 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op) > * Traditional ioctl() based io_ops > */ > > +static int vfio_io_get_info(VFIODevice *vbasedev, struct vfio_device_info *info) > +{ > + int ret; > + > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, info); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} > + > +static int vfio_io_get_region_info(VFIODevice *vbasedev, > + struct vfio_region_info *info, > + int *fd) > +{ > + int ret; > + > + *fd = -1; > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} > + > +static int vfio_io_get_irq_info(VFIODevice *vbasedev, > + struct vfio_irq_info *info) > +{ > + int ret; > + > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} > + > +static int vfio_io_set_irqs(VFIODevice *vbasedev, struct vfio_irq_set *irqs) > +{ > + int ret; > + > + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} > + > +static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off, > + uint32_t size, void *data) > +{ > + struct vfio_region_info *info = vbasedev->regions[index]; > + int ret; > + > + ret = pread(vbasedev->fd, data, size, info->offset + off); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} > + > +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off, > + uint32_t size, void *data, bool post) > +{ > + struct vfio_region_info *info = vbasedev->regions[index]; > + int ret; > + > + ret = pwrite(vbasedev->fd, data, size, info->offset + off); > + if (ret < 0) { > + ret = -errno; > + } > + > + return ret; > +} Simplify all these with ternaries? 
return ret < 0 ? -errno : ret; > + > +VFIODevIO vfio_dev_io_ioctl = { > + .get_info = vfio_io_get_info, > + .get_region_info = vfio_io_get_region_info, > + .get_irq_info = vfio_io_get_irq_info, > + .set_irqs = vfio_io_set_irqs, > + .region_read = vfio_io_region_read, > + .region_write = vfio_io_region_write, > +}; > + > static int vfio_io_dma_map(VFIOContainer *container, > struct vfio_iommu_type1_dma_map *map, > int fd, bool will_commit) > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 122edf8..28f21f8 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -402,7 +402,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) > fds[i] = fd; > } > > - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); > + ret = VDEV_SET_IRQS(&vdev->vbasedev, irq_set); > > g_free(irq_set); > > @@ -772,14 +772,16 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) > > static void vfio_pci_load_rom(VFIOPCIDevice *vdev) > { > + VFIODevice *vbasedev = &vdev->vbasedev; > struct vfio_region_info *reg_info; > uint64_t size; > off_t off = 0; > ssize_t bytes; > + int ret; > > - if (vfio_get_region_info(&vdev->vbasedev, > - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) { > - error_report("vfio: Error getting ROM info: %m"); > + ret = vfio_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, &reg_info); > + if (ret < 0) { > + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); > return; > } > > @@ -806,18 +808,19 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) > memset(vdev->rom, 0xff, size); > > while (size) { > - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, > - size, vdev->rom_offset + off); > + bytes = VDEV_REGION_READ(vbasedev, VFIO_PCI_ROM_REGION_INDEX, off, > + size, vdev->rom + off); > if (bytes == 0) { > break; > } else if (bytes > 0) { > off += bytes; > size -= bytes; > } else { > - if (errno == EINTR || errno == EAGAIN) { > + if (bytes == -EINTR || bytes == -EAGAIN) { > continue; > } > - error_report("vfio: Error reading device ROM: %m"); > +
error_report("vfio: Error reading device ROM: %s", > + strerror(-bytes)); > break; > } > } > @@ -905,11 +908,10 @@ static const MemoryRegionOps vfio_rom_ops = { > > static void vfio_pci_size_rom(VFIOPCIDevice *vdev) > { > + VFIODevice *vbasedev = &vdev->vbasedev; > uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); > - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; > DeviceState *dev = DEVICE(vdev); > char *name; > - int fd = vdev->vbasedev.fd; > > if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { > /* Since pci handles romfile, just print a message and return */ > @@ -926,13 +928,23 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) > * Use the same size ROM BAR as the physical device. The contents > * will get filled in later when the guest tries to read it. > */ > - if (pread(fd, &orig, 4, offset) != 4 || > - pwrite(fd, &size, 4, offset) != 4 || > - pread(fd, &size, 4, offset) != 4 || > - pwrite(fd, &orig, 4, offset) != 4) { > - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); > +#define rom_read(var) VDEV_REGION_READ(vbasedev, \ > + VFIO_PCI_CONFIG_REGION_INDEX, \ > + PCI_ROM_ADDRESS, 4, (var)) > +#define rom_write(var) VDEV_REGION_WRITE(vbasedev, \ > + VFIO_PCI_CONFIG_REGION_INDEX, \ > + PCI_ROM_ADDRESS, 4, (var), false) > + > + if (rom_read(&orig) != 4 || > + rom_write(&size) != 4 || > + rom_read(&size) != 4 || > + rom_write(&orig) != 4) { > + > + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); > return; > } > +#undef rom_read > +#undef rom_write Hmm, I think I'd rather see a generic function broken out for this, maybe: int vfio_pci_size_bar32(VFIODevice *vbasedev, int offset, uint32_t mask, uint32_t *size) { uint32_t orig, val; int index = VFIO_PCI_CONFIG_REGION_INDEX; if (VDEV_REGION_READ(vbasedev, index, offset, 4, &orig) != 4 || VDEV_REGION_WRITE(vbasedev, index, offset, 4, &mask, false) != 4 || VDEV_REGION_READ(vbasedev, index, offset, 4, &val) != 4 || VDEV_REGION_WRITE(vbasedev, index, 
offset, 4, &orig, false) != 4) { error_report(... return -1; } *size = ~(le32_to_cpu(val) & PCI_ROM_ADDRESS_MASK) + 1; return 0; } Really I think it's just trying to hard code VFIO_PCI_CONFIG_REGION_INDEX and PCI_ROM_ADDRESS into the macro calls that demand some simplification, if we use variables it looks fine inline. We could also re-wrap VDEV_REGION_READ/WRITE in vfio/pci.h to simplify config space access, presumably we'd never have posted writes to config space. Thanks, Alex > > size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1; > > @@ -1110,6 +1122,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) > uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) > { > VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); > + VFIODevice *vbasedev = &vdev->vbasedev; > uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; > > memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); > @@ -1122,12 +1135,14 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) > if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { > ssize_t ret; > > - ret = pread(vdev->vbasedev.fd, &phys_val, len, > - vdev->config_offset + addr); > + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, > + len, &phys_val); > if (ret != len) { > - error_report("%s(%s, 0x%x, 0x%x) failed: %m", > - __func__, vdev->vbasedev.name, addr, len); > - return -errno; > + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; > + > + error_report("%s(%s, 0x%x, 0x%x) failed: %s", > + __func__, vbasedev->name, addr, len, err); > + return -1; > } > phys_val = le32_to_cpu(phys_val); > } > @@ -1143,15 +1158,20 @@ void vfio_pci_write_config(PCIDevice *pdev, > uint32_t addr, uint32_t val, int len) > { > VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); > + VFIODevice *vbasedev = &vdev->vbasedev; > uint32_t val_le = cpu_to_le32(val); > + int ret; > > trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); > > /* Write everything to VFIO, let it filter out what we can't write */ > - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) > - != len) { > - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", > - __func__, vdev->vbasedev.name, addr, val, len); > + ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, > + len, &val_le, false); > + if (ret != len) { > + const char *err = ret < 0 ? strerror(-ret) : "short write"; > + > + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", > + __func__, vbasedev->name, addr, val, len, err); > } > > /* MSI/MSI-X Enabling/Disabling */ > @@ -1239,10 +1259,13 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) > int ret, entries; > Error *err = NULL; > > - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), > - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { > - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); > - return -errno; > + ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, > + pos + PCI_CAP_FLAGS, sizeof(ctrl), &ctrl); > + if (ret != sizeof(ctrl)) { > + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; > + > + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS %s", err); > + return ret; > } > ctrl = le16_to_cpu(ctrl); > > @@ -1444,33 +1467,40 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) > */ > static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) > { > + VFIODevice *vbasedev = &vdev->vbasedev; > uint8_t pos; > uint16_t ctrl; > uint32_t table, pba; > - int fd = vdev->vbasedev.fd; > VFIOMSIXInfo *msix; > + int ret; > > pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); > if (!pos) { > return; > } > > - if (pread(fd, &ctrl, sizeof(ctrl), > - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { > - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); > - return; > + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, > + pos + PCI_MSIX_FLAGS, sizeof(ctrl), &ctrl); > + if (ret != sizeof(ctrl)) { > + const char *err = ret < 0 ? strerror(-ret) : "short read"; > + > + error_setg(errp, "failed to read PCI MSIX FLAGS %s", err); > } > > - if (pread(fd, &table, sizeof(table), > - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { > - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); > - return; > + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, > + pos + PCI_MSIX_TABLE, sizeof(table), &table); > + if (ret != sizeof(table)) { > + const char *err = ret < 0 ? strerror(-ret) : "short read"; > + > + error_setg(errp, "failed to read PCI MSIX TABLE %s", err); > } > > - if (pread(fd, &pba, sizeof(pba), > - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { > - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); > - return; > + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, > + pos + PCI_MSIX_PBA, sizeof(pba), &pba); > + if (ret != sizeof(pba)) { > + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; > + > + error_setg(errp, "failed to read PCI MSIX PBA %s", err); > } > > ctrl = le16_to_cpu(ctrl); > @@ -1608,7 +1638,6 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) > static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) > { > VFIOBAR *bar = &vdev->bars[nr]; > - > uint32_t pci_bar; > int ret; > > @@ -1618,10 +1647,13 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) > } > > /* Determine what type of BAR this is for registration */ > - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), > - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); > + ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, > + PCI_BASE_ADDRESS_0 + (4 * nr), > + sizeof(pci_bar), &pci_bar); > if (ret != sizeof(pci_bar)) { > - error_report("vfio: Failed to read BAR %d (%m)", nr); > + const char *err = ret < 0 ? strerror(-ret) : "short read"; > + > + error_report("vfio: Failed to read BAR %d (%s)", nr, err); > return; > } > > @@ -2169,8 +2201,9 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) > > static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > { > + VFIODevice *vbasedev = &vdev->vbasedev; > Error *err = NULL; > - int nr; > + int ret, nr; > > vfio_intx_enable(vdev, &err); > if (err) { > @@ -2178,13 +2211,17 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > } > > for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { > - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); > + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); > uint32_t val = 0; > uint32_t len = sizeof(val); > > - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { > - error_report("%s(%s) reset bar %d failed: %m", __func__, > - vdev->vbasedev.name, nr); > + ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, > + len, &val, false); > + if (ret != len) { > + const char *err = ret < 0 ? 
strerror(-ret) : "short write"; > + > + error_report("%s(%s) reset bar %d failed: %s", __func__, > + vbasedev->name, nr, err); > } > } > > @@ -2619,7 +2656,7 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) > > irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; > > - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > + ret = VDEV_GET_IRQ_INFO(vbasedev, &irq_info); > if (ret) { > /* This can fail for an old kernel or legacy PCI dev */ > trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); > @@ -2738,8 +2775,10 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) > return; > } > > - if (ioctl(vdev->vbasedev.fd, > - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { > + if (VDEV_GET_IRQ_INFO(&vdev->vbasedev, &irq_info) < 0) { > + return; > + } > + if (irq_info.count < 1) { > return; > } > > @@ -2817,6 +2856,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > vdev->vbasedev.ops = &vfio_pci_ops; > vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; > vdev->vbasedev.dev = DEVICE(vdev); > + vdev->vbasedev.io_ops = &vfio_dev_io_ioctl; > > tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); > len = readlink(tmp, group_path, sizeof(group_path));
> On Nov 19, 2021, at 2:42 PM, Alex Williamson <alex.williamson@redhat.com> wrote: > > On Mon, 8 Nov 2021 16:46:32 -0800 > John Johnson <john.g.johnson@oracle.com> wrote: > >> >> + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, >> + void *data, bool post); > > The @post arg is not really developed in this patch, it would make > review easier to add the arg when it becomes implemented and used. > ok >> >> @@ -2382,6 +2398,21 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, >> struct vfio_region_info **info) >> { >> size_t argsz = sizeof(struct vfio_region_info); >> + int fd = -1; >> + int ret; >> + >> + /* create region cache */ >> + if (vbasedev->regions == NULL) { >> + vbasedev->regions = g_new0(struct vfio_region_info *, >> + vbasedev->num_regions); >> + } > > Seems like this should have been part of vfio_get_device() since that's > where we set num_regions. > >> + /* check cache */ >> + if (vbasedev->regions[index] != NULL) { >> + *info = g_malloc0(vbasedev->regions[index]->argsz); >> + memcpy(*info, vbasedev->regions[index], >> + vbasedev->regions[index]->argsz); >> + return 0; >> + } > > Why are we calling vfio_get_region_info() enough that we need a cache? > This looks like an optimization for something that doesn't exist yet. > And if we have a cache, why aren't callers using it directly rather > than creating a copy for each caller? > There are a couple of types of vfio_get_region_info() callers. One type wants a specific region, which it then uses to fill in its own data structures (e.g., vfio_region_setup()) and the other type uses vfio_get_dev_region_info() to loop through all regions looking for a specific one (e.g., for the migration region or for a specific device to set up its quirks). On top of that, each vfio_get_region_info() will need to call the server twice for regions with capabilities: once to get the proper ‘argsz’ and once to actually get the info.
I thought just having a cache that gets filled once, then satisfies all later callers was a cleaner idea. It also solved the issue I had that the FD returned isn't used until the container is set up, which would’ve needed another vfio_get_region_info() call. I can certainly move the cache fill to vfio_get_device() and remove the copies if you like that idea better. >> >> *info = g_malloc0(argsz); >> >> @@ -2389,19 +2420,28 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, >> retry: >> (*info)->argsz = argsz; >> >> - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { >> + ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd); >> + if (ret != 0) { >> g_free(*info); >> *info = NULL; >> - return -errno; >> + return ret; >> } >> >> if ((*info)->argsz > argsz) { >> argsz = (*info)->argsz; >> *info = g_realloc(*info, argsz); >> + if (fd != -1) { >> + close(fd); >> + fd = -1; >> + } > > Use of fd here is hard to review, it's clearly for some future use case. > The FD code can be moved to a patch where it’s used. >> >> + >> +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off, >> + uint32_t size, void *data, bool post) >> +{ >> + struct vfio_region_info *info = vbasedev->regions[index]; >> + int ret; >> + >> + ret = pwrite(vbasedev->fd, data, size, info->offset + off); >> + if (ret < 0) { >> + ret = -errno; >> + } >> + >> + return ret; >> +} > > Simplify all these with ternaries? > > return ret < 0 ?
-errno : ret; > ok >> >> static void vfio_pci_size_rom(VFIOPCIDevice *vdev) >> { >> + VFIODevice *vbasedev = &vdev->vbasedev; >> uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); >> - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; >> DeviceState *dev = DEVICE(vdev); >> char *name; >> - int fd = vdev->vbasedev.fd; >> >> if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { >> /* Since pci handles romfile, just print a message and return */ >> @@ -926,13 +928,23 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) >> * Use the same size ROM BAR as the physical device. The contents >> * will get filled in later when the guest tries to read it. >> */ >> - if (pread(fd, &orig, 4, offset) != 4 || >> - pwrite(fd, &size, 4, offset) != 4 || >> - pread(fd, &size, 4, offset) != 4 || >> - pwrite(fd, &orig, 4, offset) != 4) { >> - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); >> +#define rom_read(var) VDEV_REGION_READ(vbasedev, \ >> + VFIO_PCI_CONFIG_REGION_INDEX, \ >> + PCI_ROM_ADDRESS, 4, (var)) >> +#define rom_write(var) VDEV_REGION_WRITE(vbasedev, \ >> + VFIO_PCI_CONFIG_REGION_INDEX, \ >> + PCI_ROM_ADDRESS, 4, (var), false) >> + >> + if (rom_read(&orig) != 4 || >> + rom_write(&size) != 4 || >> + rom_read(&size) != 4 || >> + rom_write(&orig) != 4) { >> + >> + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); >> return; >> } >> +#undef rom_read >> +#undef rom_write > > Hmm, I think I'd rather see a generic function broken out for this, > maybe: > > int vfio_pci_size_bar32(VFIODevice *vbasedev, int offset, > uint32_t mask, uint32_t *size) > { > uint32_t orig, val; > int index = VFIO_PCI_CONFIG_REGION_INDEX; > > if (VDEV_REGION_READ(vbasedev, index, offset, 4, &orig) != 4 || > VDEV_REGION_WRITE(vbasedev, index, offset, 4, &mask, false) != 4 || > VDEV_REGION_READ(vbasedev, index, offset, 4, &val) != 4 || > VDEV_REGION_WRITE(vbasedev, index, offset, 4, &orig, false) != 4) { > > error_report(... 
> > return -1; > } > > *size = ~(le32_to_cpu(val) & PCI_ROM_ADDRESS_MASK) + 1; > > return 0; > } > ok > Really I think it's just trying to hard code > VFIO_PCI_CONFIG_REGION_INDEX and PCI_ROM_ADDRESS into the macro calls > that demand some simplification, if we use variables it looks fine > inline. > > We could also re-wrap VDEV_REGION_READ/WRITE in vfio/pci.h to simplify > config space access, presumably we'd never have posted writes to config > space. Thanks, > There are multiple places where config space is read or written in pci.c, so adding an inline or #define for it is a good idea. JJ
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9b3c5e5..43fa948 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -124,6 +124,7 @@ typedef struct VFIOHostDMAWindow { } VFIOHostDMAWindow; typedef struct VFIODeviceOps VFIODeviceOps; +typedef struct VFIODevIO VFIODevIO; typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) next; @@ -139,12 +140,14 @@ typedef struct VFIODevice { bool ram_block_discard_allowed; bool enable_migration; VFIODeviceOps *ops; + VFIODevIO *io_ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + struct vfio_region_info **regions; } VFIODevice; struct VFIODeviceOps { @@ -164,6 +167,30 @@ struct VFIODeviceOps { * through ioctl() to the kernel VFIO driver, but vfio-user * can use a socket to a remote process. */ +struct VFIODevIO { + int (*get_info)(VFIODevice *vdev, struct vfio_device_info *info); + int (*get_region_info)(VFIODevice *vdev, + struct vfio_region_info *info, int *fd); + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data, bool post); +}; + +#define VDEV_GET_INFO(vdev, info) \ + ((vdev)->io_ops->get_info((vdev), (info))) +#define VDEV_GET_REGION_INFO(vdev, info, fd) \ + ((vdev)->io_ops->get_region_info((vdev), (info), (fd))) +#define VDEV_GET_IRQ_INFO(vdev, irq) \ + ((vdev)->io_ops->get_irq_info((vdev), (irq))) +#define VDEV_SET_IRQS(vdev, irqs) \ + ((vdev)->io_ops->set_irqs((vdev), (irqs))) +#define VDEV_REGION_READ(vdev, nr, off, size, data) \ + ((vdev)->io_ops->region_read((vdev), (nr), (off), (size), (data))) +#define VDEV_REGION_WRITE(vdev, nr, off, size, data, post) \ + 
((vdev)->io_ops->region_write((vdev), (nr), (off), (size), (data), (post))) struct VFIOContIO { int (*dma_map)(VFIOContainer *container, @@ -184,6 +211,7 @@ struct VFIOContIO { #define CONT_DIRTY_BITMAP(cont, bitmap, range) \ ((cont)->io_ops->dirty_bitmap((cont), (bitmap), (range))) +extern VFIODevIO vfio_dev_io_ioctl; extern VFIOContIO vfio_cont_io_ioctl; #endif /* CONFIG_LINUX */ diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 50748be..41fdd78 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -70,7 +70,7 @@ void vfio_disable_irqindex(VFIODevice *vbasedev, int index) .count = 0, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + VDEV_SET_IRQS(vbasedev, &irq_set); } void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) @@ -83,7 +83,7 @@ void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + VDEV_SET_IRQS(vbasedev, &irq_set); } void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) @@ -96,7 +96,7 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + VDEV_SET_IRQS(vbasedev, &irq_set); } static inline const char *action_to_str(int action) @@ -177,9 +177,7 @@ int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, pfd = (int32_t *)&irq_set->data; *pfd = fd; - if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { - ret = -errno; - } + ret = VDEV_SET_IRQS(vbasedev, irq_set); g_free(irq_set); if (!ret) { @@ -214,6 +212,7 @@ void vfio_region_write(void *opaque, hwaddr addr, uint32_t dword; uint64_t qword; } buf; + int ret; switch (size) { case 1: @@ -233,13 +232,15 @@ void vfio_region_write(void *opaque, hwaddr addr, break; } - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { + ret = VDEV_REGION_WRITE(vbasedev, region->nr, addr, size, &buf, false); + if (ret != size) { + const char *err = ret < 0 ? 
strerror(-ret) : "short write"; + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", + ",%d) failed: %s", __func__, vbasedev->name, region->nr, - addr, data, size); + addr, data, size, err); } - trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); /* @@ -265,13 +266,18 @@ uint64_t vfio_region_read(void *opaque, uint64_t qword; } buf; uint64_t data = 0; + int ret; - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", + ret = VDEV_REGION_READ(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", __func__, vbasedev->name, region->nr, - addr, size); + addr, size, err); return (uint64_t)-1; } + switch (size) { case 1: data = buf.byte; @@ -2369,6 +2375,16 @@ int vfio_get_device(VFIOGroup *group, const char *name, void vfio_put_base_device(VFIODevice *vbasedev) { + if (vbasedev->regions != NULL) { + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vbasedev->regions[i]); + } + g_free(vbasedev->regions); + vbasedev->regions = NULL; + } + if (!vbasedev->group) { return; } @@ -2382,6 +2398,21 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; + int ret; + + /* create region cache */ + if (vbasedev->regions == NULL) { + vbasedev->regions = g_new0(struct vfio_region_info *, + vbasedev->num_regions); + } + /* check cache */ + if (vbasedev->regions[index] != NULL) { + *info = g_malloc0(vbasedev->regions[index]->argsz); + memcpy(*info, vbasedev->regions[index], + vbasedev->regions[index]->argsz); + return 0; + } *info = g_malloc0(argsz); @@ -2389,19 +2420,28 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - if (ioctl(vbasedev->fd, 
VFIO_DEVICE_GET_REGION_INFO, *info)) { + ret = VDEV_GET_REGION_INFO(vbasedev, *info, &fd); + if (ret != 0) { g_free(*info); *info = NULL; - return -errno; + return ret; } if ((*info)->argsz > argsz) { argsz = (*info)->argsz; *info = g_realloc(*info, argsz); + if (fd != -1) { + close(fd); + fd = -1; + } goto retry; } + /* fill cache */ + vbasedev->regions[index] = g_malloc0(argsz); + memcpy(vbasedev->regions[index], *info, argsz); + return 0; } @@ -2554,6 +2594,95 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op) * Traditional ioctl() based io_ops */ +static int vfio_io_get_info(VFIODevice *vbasedev, struct vfio_device_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, info); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +static int vfio_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info, + int *fd) +{ + int ret; + + *fd = -1; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +static int vfio_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +static int vfio_io_set_irqs(VFIODevice *vbasedev, struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +static int vfio_io_region_read(VFIODevice *vbasedev, uint8_t index, off_t off, + uint32_t size, void *data) +{ + struct vfio_region_info *info = vbasedev->regions[index]; + int ret; + + ret = pread(vbasedev->fd, data, size, info->offset + off); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t off, + uint32_t size, void *data, bool post) +{ + struct vfio_region_info *info = vbasedev->regions[index]; + int ret; + + ret = pwrite(vbasedev->fd, data, size, 
info->offset + off); + if (ret < 0) { + ret = -errno; + } + + return ret; +} + +VFIODevIO vfio_dev_io_ioctl = { + .get_info = vfio_io_get_info, + .get_region_info = vfio_io_get_region_info, + .get_irq_info = vfio_io_get_irq_info, + .set_irqs = vfio_io_set_irqs, + .region_read = vfio_io_region_read, + .region_write = vfio_io_region_write, +}; + static int vfio_io_dma_map(VFIOContainer *container, struct vfio_iommu_type1_dma_map *map, int fd, bool will_commit) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 122edf8..28f21f8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -402,7 +402,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = VDEV_SET_IRQS(&vdev->vbasedev, irq_set); g_free(irq_set); @@ -772,14 +772,16 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info; uint64_t size; off_t off = 0; ssize_t bytes; + int ret; - if (vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) { - error_report("vfio: Error getting ROM info: %m"); + ret = vfio_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, &reg_info); + if (ret < 0) { + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); return; } @@ -806,18 +808,19 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, - size, vdev->rom_offset + off); + bytes = VDEV_REGION_READ(vbasedev, VFIO_PCI_ROM_REGION_INDEX, off, + size, vdev->rom + off); if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strerror(-bytes)); break; } } @@
-905,11 +908,10 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; DeviceState *dev = DEVICE(vdev); char *name; - int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ @@ -926,13 +928,23 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. */ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); +#define rom_read(var) VDEV_REGION_READ(vbasedev, \ + VFIO_PCI_CONFIG_REGION_INDEX, \ + PCI_ROM_ADDRESS, 4, (var)) +#define rom_write(var) VDEV_REGION_WRITE(vbasedev, \ + VFIO_PCI_CONFIG_REGION_INDEX, \ + PCI_ROM_ADDRESS, 4, (var), false) + + if (rom_read(&orig) != 4 || + rom_write(&size) != 4 || + rom_read(&size) != 4 || + rom_write(&orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } +#undef rom_read +#undef rom_write size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1; @@ -1110,6 +1122,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1122,12 +1135,14 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - 
vdev->config_offset + addr); + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, + len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, err); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1143,15 +1158,20 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, + len, &val_le, false); + if (ret != len) { + const char *err = ret < 0 ? strerror(-ret) : "short write"; + + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, err); } /* MSI/MSI-X Enabling/Disabling */ @@ -1239,10 +1259,13 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); - return -errno; + ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, + pos + PCI_CAP_FLAGS, sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; + + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS %s", err); + return ret; } ctrl = le16_to_cpu(ctrl); @@ -1444,33 +1467,40 @@ static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp) */ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) { + VFIODevice *vbasedev = &vdev->vbasedev; uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int fd = vdev->vbasedev.fd; VFIOMSIXInfo *msix; + int ret; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if (!pos) { return; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); - return; + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, + pos + PCI_MSIX_FLAGS, sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX FLAGS %s", err); } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); - return; + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, + pos + PCI_MSIX_TABLE, sizeof(table), &table); + if (ret != sizeof(table)) { + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX TABLE %s", err); } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); - return; + ret = VDEV_REGION_READ(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, + pos + PCI_MSIX_PBA, sizeof(pba), &pba); + if (ret != sizeof(pba)) { + const char *err = ret < 0 ? 
strerror(-ret) : "short read"; + + error_setg(errp, "failed to read PCI MSIX PBA %s", err); } ctrl = le16_to_cpu(ctrl); @@ -1608,7 +1638,6 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; - uint32_t pci_bar; int ret; @@ -1618,10 +1647,13 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = VDEV_REGION_READ(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, + PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + const char *err = ret < 0 ? strerror(-ret) : "short read"; + + error_report("vfio: Failed to read BAR %d (%s)", nr, err); return; } @@ -2169,8 +2201,9 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) static void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; vfio_intx_enable(vdev, &err); if (err) { @@ -2178,13 +2211,17 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) } for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = VDEV_REGION_WRITE(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, addr, + len, &val, false); + if (ret != len) { + const char *err = ret < 0 ? 
strerror(-ret) : "short write"; + + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, err); } } @@ -2619,7 +2656,7 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = VDEV_GET_IRQ_INFO(vbasedev, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); @@ -2738,8 +2775,10 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + if (VDEV_GET_IRQ_INFO(&vdev->vbasedev, &irq_info) < 0) { + return; + } + if (irq_info.count < 1) { return; } @@ -2817,6 +2856,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vdev->vbasedev.ops = &vfio_pci_ops; vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; vdev->vbasedev.dev = DEVICE(vdev); + vdev->vbasedev.io_ops = &vfio_dev_io_ioctl; tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); len = readlink(tmp, group_path, sizeof(group_path));
Used for communication with VFIO driver (prep work for vfio-user, which will communicate over a socket) Signed-off-by: John G Johnson <john.g.johnson@oracle.com> --- include/hw/vfio/vfio-common.h | 28 ++++++++ hw/vfio/common.c | 159 ++++++++++++++++++++++++++++++++++++++---- hw/vfio/pci.c | 146 ++++++++++++++++++++++++-------------- 3 files changed, 265 insertions(+), 68 deletions(-)