Message ID | 1474991845-27962-21-git-send-email-roger.pau@citrix.com (mailing list archive) |
---|---|
State | New, archived |
> -----Original Message----- > From: Roger Pau Monne [mailto:roger.pau@citrix.com] > Sent: 27 September 2016 16:57 > To: xen-devel@lists.xenproject.org > Cc: konrad.wilk@oracle.com; boris.ostrovsky@oracle.com; Roger Pau Monne > <roger.pau@citrix.com>; Jan Beulich <jbeulich@suse.com>; Andrew Cooper > <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com> > Subject: [PATCH v2 20/30] xen/x86: add the basic infrastructure to import > QEMU passthrough code > > Most of this code has been picked up from QEMU and modified so it can be > plugged into the internal Xen IO handlers. The structure of the handlers has > been keep quite similar to QEMU, so existing handlers can be imported > without a lot of effort. > If you lifted code from QEMU then one assumes there is no problem with license, but do you need to amend copyrights for any of the files where you put the code? > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > --- > Cc: Jan Beulich <jbeulich@suse.com> > Cc: Andrew Cooper <andrew.cooper3@citrix.com> > Cc: Paul Durrant <paul.durrant@citrix.com> > --- > docs/misc/xen-command-line.markdown | 8 + > xen/arch/x86/hvm/hvm.c | 2 + > xen/arch/x86/hvm/io.c | 621 > ++++++++++++++++++++++++++++++++++++ > xen/include/asm-x86/hvm/domain.h | 4 + > xen/include/asm-x86/hvm/io.h | 176 ++++++++++ > xen/include/xen/pci.h | 5 + > 6 files changed, 816 insertions(+) > > diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen- > command-line.markdown > index 59d7210..78130c8 100644 > --- a/docs/misc/xen-command-line.markdown > +++ b/docs/misc/xen-command-line.markdown > @@ -670,6 +670,14 @@ Flag that makes a 64bit dom0 boot in PVH mode. No > 32bit support at present. > > Flag that makes a dom0 boot in PVHv2 mode. > > +### dom0permissive > +> `= <boolean>` > + > +> Default: `true` > + > +Select mode of PCI pass-through when using a PVHv2 Dom0, either > permissive or > +not. > + > ### dtuart (ARM) > > `= path [:options]` > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > index a291f82..bc4f7bc 100644 > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -632,6 +632,8 @@ int hvm_domain_initialise(struct domain *d) > goto fail1; > } > memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE); > + INIT_LIST_HEAD(&d->arch.hvm_domain.pt_devices); > + rwlock_init(&d->arch.hvm_domain.pt_lock); > } > else > d->arch.hvm_domain.io_bitmap = hvm_io_bitmap; > diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c > index 31d54dc..7de1de3 100644 > --- a/xen/arch/x86/hvm/io.c > +++ b/xen/arch/x86/hvm/io.c > @@ -46,6 +46,10 @@ > #include <xen/iocap.h> > #include <public/hvm/ioreq.h> > > +/* Set permissive mode for HVM Dom0 PCI pass-through by default */ > +static bool_t opt_dom0permissive = 1; > +boolean_param("dom0permissive", opt_dom0permissive); > + > void send_timeoffset_req(unsigned long timeoff) > { > ioreq_t p = { > @@ -258,12 +262,403 @@ static bool_t hw_dpci_portio_accept(const struct > hvm_io_handler *handler, > return 0; > } > > +static struct hvm_pt_device *hw_dpci_get_device(struct domain *d) > +{ > + uint8_t bus, slot, func; > + uint32_t addr; > + struct hvm_pt_device *dev; > + > + /* Decode bus, slot and func. 
*/ > + addr = CF8_BDF(d->arch.pci_cf8); > + bus = PCI_BUS(addr); > + slot = PCI_SLOT(addr); > + func = PCI_FUNC(addr); > + > + list_for_each_entry( dev, &d->arch.hvm_domain.pt_devices, entries ) > + { > + if ( dev->pdev->seg != 0 || dev->pdev->bus != bus || > + dev->pdev->devfn != PCI_DEVFN(slot,func) ) > + continue; > + > + return dev; > + } > + > + return NULL; > +} > + > +/* Dispatchers */ > + > +/* Find emulate register group entry */ > +struct hvm_pt_reg_group *hvm_pt_find_reg_grp(struct hvm_pt_device > *d, > + uint32_t address) > +{ > + struct hvm_pt_reg_group *entry = NULL; > + > + /* Find register group entry */ > + list_for_each_entry( entry, &d->register_groups, entries ) > + { > + /* check address */ > + if ( (entry->base_offset <= address) > + && ((entry->base_offset + entry->size) > address) ) > + return entry; > + } > + > + /* Group entry not found */ > + return NULL; > +} > + > +/* Find emulate register entry */ > +struct hvm_pt_reg *hvm_pt_find_reg(struct hvm_pt_reg_group *reg_grp, > + uint32_t address) > +{ > + struct hvm_pt_reg *reg_entry = NULL; > + struct hvm_pt_reg_handler *handler = NULL; > + uint32_t real_offset = 0; > + > + /* Find register entry */ > + list_for_each_entry( reg_entry, ®_grp->registers, entries ) > + { > + handler = reg_entry->handler; > + real_offset = reg_grp->base_offset + handler->offset; > + /* Check address */ > + if ( (real_offset <= address) > + && ((real_offset + handler->size) > address) ) > + return reg_entry; > + } > + > + return NULL; > +} > + > +static int hvm_pt_pci_config_access_check(struct hvm_pt_device *d, > + uint32_t addr, int len) > +{ > + /* Check offset range */ > + if ( addr >= 0xFF ) > + { > + printk_pdev(d->pdev, XENLOG_DEBUG, > + "failed to access register with offset exceeding 0xFF. " > + "(addr: 0x%02x, len: %d)\n", addr, len); > + return -EDOM; > + } > + > + /* Check read size */ > + if ( (len != 1) && (len != 2) && (len != 4) ) > + { > + printk_pdev(d->pdev, XENLOG_DEBUG, > + "failed to access register with invalid access length. " > + "(addr: 0x%02x, len: %d)\n", addr, len); > + return -EINVAL; > + } > + > + /* Check offset alignment */ > + if ( addr & (len - 1) ) > + { > + printk_pdev(d->pdev, XENLOG_DEBUG, > + "failed to access register with invalid access size " > + "alignment. (addr: 0x%02x, len: %d)\n", addr, len); > + return -EINVAL; > + } > + > + return 0; > +} > + > +static int hvm_pt_pci_read_config(struct hvm_pt_device *d, uint32_t addr, > + uint32_t *data, int len) > +{ > + uint32_t val = 0; > + struct hvm_pt_reg_group *reg_grp_entry = NULL; > + struct hvm_pt_reg *reg_entry = NULL; > + int rc = 0; > + int emul_len = 0; > + uint32_t find_addr = addr; > + unsigned int seg = d->pdev->seg; > + unsigned int bus = d->pdev->bus; > + unsigned int slot = PCI_SLOT(d->pdev->devfn); > + unsigned int func = PCI_FUNC(d->pdev->devfn); > + > + /* Sanity checks. */ > + if ( hvm_pt_pci_config_access_check(d, addr, len) ) > + return X86EMUL_UNHANDLEABLE; > + > + /* Find register group entry. */ > + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); > + if ( reg_grp_entry == NULL ) > + return X86EMUL_UNHANDLEABLE; > + > + /* Read I/O device register value. */ > + switch( len ) > + { > + case 1: > + val = pci_conf_read8(seg, bus, slot, func, addr); > + break; > + case 2: > + val = pci_conf_read16(seg, bus, slot, func, addr); > + break; > + case 4: > + val = pci_conf_read32(seg, bus, slot, func, addr); > + break; > + default: > + BUG(); > + } > + > + /* Adjust the read value to appropriate CFC-CFF window. 
*/ > + val <<= (addr & 3) << 3; > + emul_len = len; > + > + /* Loop around the guest requested size. */ > + while ( emul_len > 0 ) > + { > + /* Find register entry to be emulated. */ > + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); > + if ( reg_entry ) > + { > + struct hvm_pt_reg_handler *handler = reg_entry->handler; > + uint32_t real_offset = reg_grp_entry->base_offset + handler- > >offset; > + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); Figuring out whether this makes sense makes my brain hurt. Any chance of some macro or at least comments about this? > + uint8_t *ptr_val = NULL; > + > + valid_mask <<= (find_addr - real_offset) << 3; > + ptr_val = (uint8_t *)&val + (real_offset & 3); > + > + /* Do emulation based on register size. */ > + switch ( handler->size ) > + { > + case 1: > + if ( handler->u.b.read ) > + rc = handler->u.b.read(d, reg_entry, ptr_val, valid_mask); > + break; > + case 2: > + if ( handler->u.w.read ) > + rc = handler->u.w.read(d, reg_entry, (uint16_t *)ptr_val, > + valid_mask); > + break; > + case 4: > + if ( handler->u.dw.read ) > + rc = handler->u.dw.read(d, reg_entry, (uint32_t *)ptr_val, > + valid_mask); > + break; > + } > + > + if ( rc < 0 ) > + { > + gdprintk(XENLOG_WARNING, > + "Invalid read emulation, shutting down domain\n"); > + domain_crash(current->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + /* Calculate next address to find. */ > + emul_len -= handler->size; > + if ( emul_len > 0 ) > + find_addr = real_offset + handler->size; > + } > + else > + { > + /* Nothing to do with passthrough type register */ > + emul_len--; > + find_addr++; > + } > + } > + > + /* Need to shift back before returning them to pci bus emulator */ > + val >>= ((addr & 3) << 3); > + *data = val; > + > + return X86EMUL_OKAY; > +} > + > +static int hvm_pt_pci_write_config(struct hvm_pt_device *d, uint32_t > addr, > + uint32_t val, int len) > +{ > + int index = 0; > + struct hvm_pt_reg_group *reg_grp_entry = NULL; > + int rc = 0; > + uint32_t read_val = 0, wb_mask; > + int emul_len = 0; > + struct hvm_pt_reg *reg_entry = NULL; > + uint32_t find_addr = addr; > + struct hvm_pt_reg_handler *handler = NULL; > + bool wp_flag = false; > + unsigned int seg = d->pdev->seg; > + unsigned int bus = d->pdev->bus; > + unsigned int slot = PCI_SLOT(d->pdev->devfn); > + unsigned int func = PCI_FUNC(d->pdev->devfn); > + > + /* Sanity checks. */ > + if ( hvm_pt_pci_config_access_check(d, addr, len) ) > + return X86EMUL_UNHANDLEABLE; > + > + /* Find register group entry. */ > + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); > + if ( reg_grp_entry == NULL ) > + return X86EMUL_UNHANDLEABLE; > + > + /* Read I/O device register value. 
*/ > + switch( len ) > + { > + case 1: > + read_val = pci_conf_read8(seg, bus, slot, func, addr); > + break; > + case 2: > + read_val = pci_conf_read16(seg, bus, slot, func, addr); > + break; > + case 4: > + read_val = pci_conf_read32(seg, bus, slot, func, addr); > + break; > + default: > + BUG(); > + } > + wb_mask = 0xFFFFFFFF >> ((4 - len) << 3); > + > + /* Adjust the read and write value to appropriate CFC-CFF window */ > + read_val <<= (addr & 3) << 3; > + val <<= (addr & 3) << 3; > + emul_len = len; > + > + /* Loop around the guest requested size */ > + while ( emul_len > 0 ) > + { > + /* Find register entry to be emulated */ > + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); > + if ( reg_entry ) > + { > + handler = reg_entry->handler; > + uint32_t real_offset = reg_grp_entry->base_offset + handler- > >offset; > + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); > + uint8_t *ptr_val = NULL; > + uint32_t wp_mask = handler->emu_mask | handler->ro_mask; > + > + valid_mask <<= (find_addr - real_offset) << 3; > + ptr_val = (uint8_t *)&val + (real_offset & 3); > + if ( !d->permissive ) > + wp_mask |= handler->res_mask; > + if ( wp_mask == (0xFFFFFFFF >> ((4 - handler->size) << 3)) ) > + wb_mask &= ~((wp_mask >> ((find_addr - real_offset) << 3)) > + << ((len - emul_len) << 3)); > + > + /* Do emulation based on register size */ > + switch ( handler->size ) > + { > + case 1: > + if ( handler->u.b.write ) > + rc = handler->u.b.write(d, reg_entry, ptr_val, > + read_val >> ((real_offset & 3) << 3), > + valid_mask); > + break; > + case 2: > + if ( handler->u.w.write ) > + rc = handler->u.w.write(d, reg_entry, (uint16_t *)ptr_val, > + (read_val >> ((real_offset & 3) << 3)), > + valid_mask); > + break; > + case 4: > + if ( handler->u.dw.write ) > + rc = handler->u.dw.write(d, reg_entry, (uint32_t *)ptr_val, > + (read_val >> ((real_offset & 3) << 3)), > + valid_mask); > + break; > + } > + > + if ( rc < 0 ) > + { > + gdprintk(XENLOG_WARNING, > + "Invalid write emulation, shutting down domain\n"); > + domain_crash(current->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + /* Calculate next address to find */ > + emul_len -= handler->size; > + if ( emul_len > 0 ) > + find_addr = real_offset + handler->size; > + } > + else > + { > + /* Nothing to do with passthrough type register */ > + if ( !d->permissive ) > + { > + wb_mask &= ~(0xff << ((len - emul_len) << 3)); > + /* > + * Unused BARs will make it here, but we don't want to issue > + * warnings for writes to them (bogus writes get dealt with > + * above). 
> + */ > + if ( index < 0 ) > + wp_flag = true; > + } > + emul_len--; > + find_addr++; > + } > + } > + > + /* Need to shift back before passing them to xen_host_pci_set_block */ > + val >>= (addr & 3) << 3; > + > + if ( wp_flag && !d->permissive_warned ) > + { > + d->permissive_warned = true; > + gdprintk(XENLOG_WARNING, > + "Write-back to unknown field 0x%02x (partially) inhibited (0x%0*x)\n", > + addr, len * 2, wb_mask); > + gdprintk(XENLOG_WARNING, > + "If the device doesn't work, try enabling permissive mode\n"); > + gdprintk(XENLOG_WARNING, > + "(unsafe) and if it helps report the problem to xen-devel\n"); > + } > + for ( index = 0; wb_mask; index += len ) > + { > + /* Unknown regs are passed through */ > + while ( !(wb_mask & 0xff) ) > + { > + index++; > + wb_mask >>= 8; > + } > + len = 0; > + do { > + len++; > + wb_mask >>= 8; > + } while ( wb_mask & 0xff ); > + > + switch( len ) > + { > + case 1: > + { > + uint8_t value; > + memcpy(&value, (uint8_t *)&val + index, 1); > + pci_conf_write8(seg, bus, slot, func, addr + index, value); > + break; > + } > + case 2: > + { > + uint16_t value; > + memcpy(&value, (uint8_t *)&val + index, 2); > + pci_conf_write16(seg, bus, slot, func, addr + index, value); > + break; > + } > + case 4: > + { > + uint32_t value; > + memcpy(&value, (uint8_t *)&val + index, 4); > + pci_conf_write32(seg, bus, slot, func, addr + index, value); > + break; > + } > + default: > + BUG(); > + } > + } > + return X86EMUL_OKAY; > +} > + > static int hw_dpci_portio_read(const struct hvm_io_handler *handler, > uint64_t addr, > uint32_t size, > uint64_t *data) > { > struct domain *currd = current->domain; > + struct hvm_pt_device *dev; > + uint32_t data32; > + uint8_t reg; > + int rc; > > if ( addr == 0xcf8 ) > { > @@ -276,6 +671,22 @@ static int hw_dpci_portio_read(const struct > hvm_io_handler *handler, > size = min(size, 4 - ((uint32_t)addr & 3)); > if ( size == 3 ) > size = 2; > + > + read_lock(&currd->arch.hvm_domain.pt_lock); > + dev = hw_dpci_get_device(currd); > + if ( dev != NULL ) > + { > + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); > + rc = hvm_pt_pci_read_config(dev, reg, &data32, size); > + if ( rc == X86EMUL_OKAY ) > + { > + read_unlock(&currd->arch.hvm_domain.pt_lock); > + *data = data32; > + return rc; > + } > + } > + read_unlock(&currd->arch.hvm_domain.pt_lock); > + > if ( pci_cfg_ok(currd, addr & 3, size, NULL) ) > *data = pci_conf_read(currd->arch.pci_cf8, addr & 3, size); > > @@ -288,7 +699,10 @@ static int hw_dpci_portio_write(const struct > hvm_io_handler *handler, > uint64_t data) > { > struct domain *currd = current->domain; > + struct hvm_pt_device *dev; > uint32_t data32; > + uint8_t reg; > + int rc; > > if ( addr == 0xcf8 ) > { > @@ -302,12 +716,219 @@ static int hw_dpci_portio_write(const struct > hvm_io_handler *handler, > if ( size == 3 ) > size = 2; > data32 = data; > + > + read_lock(&currd->arch.hvm_domain.pt_lock); > + dev = hw_dpci_get_device(currd); > + if ( dev != NULL ) > + { > + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); > + rc = hvm_pt_pci_write_config(dev, reg, data32, size); > + if ( rc == X86EMUL_OKAY ) > + { > + read_unlock(&currd->arch.hvm_domain.pt_lock); > + return rc; > + } > + } > + read_unlock(&currd->arch.hvm_domain.pt_lock); > + I must be missing something here. Why are you adding passthrough code to the hardware domain's handlers? Surely it sees all devices anyway? 
> if ( pci_cfg_ok(currd, addr & 3, size, &data32) ) > pci_conf_write(currd->arch.pci_cf8, addr & 3, size, data); > > return X86EMUL_OKAY; > } > > +static void hvm_pt_free_device(struct hvm_pt_device *dev) > +{ > + struct hvm_pt_reg_group *group, *g; > + > + list_for_each_entry_safe( group, g, &dev->register_groups, entries ) > + { > + struct hvm_pt_reg *reg, *r; > + > + list_for_each_entry_safe( reg, r, &group->registers, entries ) > + { > + list_del(®->entries); > + xfree(reg); > + } > + > + list_del(&group->entries); > + xfree(group); > + } > + > + xfree(dev); > +} > + > +static int hvm_pt_add_register(struct hvm_pt_device *dev, > + struct hvm_pt_reg_group *group, > + struct hvm_pt_reg_handler *handler) > +{ > + struct pci_dev *pdev = dev->pdev; > + struct hvm_pt_reg *reg; > + > + reg = xmalloc(struct hvm_pt_reg); > + if ( reg == NULL ) > + return -ENOMEM; > + > + memset(reg, 0, sizeof(*reg)); xzalloc()? > + reg->handler = handler; > + if ( handler->init != NULL ) > + { > + uint32_t host_mask, size_mask, data = 0; > + uint8_t seg, bus, slot, func; > + unsigned int offset; > + uint32_t val; > + int rc; > + > + /* Initialize emulate register */ > + rc = handler->init(dev, reg->handler, > + group->base_offset + reg->handler->offset, &data); > + if ( rc < 0 ) > + return rc; > + > + if ( data == HVM_PT_INVALID_REG ) > + { > + xfree(reg); > + return 0; > + } > + > + /* Sync up the data to val */ > + offset = group->base_offset + reg->handler->offset; > + size_mask = 0xFFFFFFFF >> ((4 - reg->handler->size) << 3); > + > + seg = pdev->seg; > + bus = pdev->bus; > + slot = PCI_SLOT(pdev->devfn); > + func = PCI_FUNC(pdev->devfn); > + > + switch ( reg->handler->size ) > + { > + case 1: > + val = pci_conf_read8(seg, bus, slot, func, offset); > + break; > + case 2: > + val = pci_conf_read16(seg, bus, slot, func, offset); > + break; > + case 4: > + val = pci_conf_read32(seg, bus, slot, func, offset); > + break; > + default: > + BUG(); > + } > + > + /* > + * Set bits in emu_mask are the ones we emulate. The reg shall > + * contain the emulated view of the guest - therefore we flip > + * the mask to mask out the host values (which reg initially > + * has). > + */ > + host_mask = size_mask & ~reg->handler->emu_mask; > + > + if ( (data & host_mask) != (val & host_mask) ) > + { > + uint32_t new_val; > + > + /* Mask out host (including past size). */ > + new_val = val & host_mask; > + /* Merge emulated ones (excluding the non-emulated ones). */ > + new_val |= data & host_mask; > + /* > + * Leave intact host and emulated values past the size - > + * even though we do not care as we write per reg->size > + * granularity, but for the logging below lets have the > + * proper value. > + */ > + new_val |= ((val | data)) & ~size_mask; > + printk_pdev(pdev, XENLOG_ERR, > +"offset 0x%04x mismatch! 
Emulated=0x%04x, host=0x%04x, syncing to > 0x%04x.\n", > + offset, data, val, new_val); > + val = new_val; > + } > + else > + val = data; > + > + if ( val & ~size_mask ) > + { > + printk_pdev(pdev, XENLOG_ERR, > + "Offset 0x%04x:0x%04x expands past register size(%d)!\n", > + offset, val, reg->handler->size); > + return -EINVAL; > + } > + > + reg->val.dword = val; > + } > + list_add_tail(®->entries, &group->registers); > + > + return 0; > +} > + > +static struct hvm_pt_handler_init *hwdom_pt_handlers[] = { > +}; > + > +int hwdom_add_device(struct pci_dev *pdev) > +{ > + struct domain *d = pdev->domain; > + struct hvm_pt_device *dev; > + int j, i, rc; > + > + ASSERT( is_hardware_domain(d) ); > + ASSERT( pcidevs_locked() ); > + > + dev = xmalloc(struct hvm_pt_device); > + if ( dev == NULL ) > + return -ENOMEM; > + > + memset(dev, 0 , sizeof(*dev)); xzalloc()? > + > + dev->pdev = pdev; > + INIT_LIST_HEAD(&dev->register_groups); > + > + dev->permissive = opt_dom0permissive; > + > + for ( j = 0; j < ARRAY_SIZE(hwdom_pt_handlers); j++ ) > + { > + struct hvm_pt_handler_init *handler_init = hwdom_pt_handlers[j]; > + struct hvm_pt_reg_group *group; > + > + group = xmalloc(struct hvm_pt_reg_group); > + if ( group == NULL ) > + { > + xfree(dev); > + return -ENOMEM; > + } > + INIT_LIST_HEAD(&group->registers); > + > + rc = handler_init->init(dev, group); > + if ( rc == 0 ) > + { > + for ( i = 0; handler_init->handlers[i].size != 0; i++ ) > + { > + int rc; > + > + rc = hvm_pt_add_register(dev, group, > + &handler_init->handlers[i]); > + if ( rc ) > + { > + printk_pdev(pdev, XENLOG_ERR, "error adding register: %d\n", > + rc); > + hvm_pt_free_device(dev); > + return rc; > + } > + } > + > + list_add_tail(&group->entries, &dev->register_groups); > + } > + else > + xfree(group); > + } > + > + write_lock(&d->arch.hvm_domain.pt_lock); > + list_add_tail(&dev->entries, &d->arch.hvm_domain.pt_devices); > + write_unlock(&d->arch.hvm_domain.pt_lock); > + printk_pdev(pdev, XENLOG_DEBUG, "added for pass-through\n"); > + > + return 0; > +} > + > static const struct hvm_io_ops dpci_portio_ops = { > .accept = dpci_portio_accept, > .read = dpci_portio_read, > diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm- > x86/hvm/domain.h > index f34d784..1b1a52f 100644 > --- a/xen/include/asm-x86/hvm/domain.h > +++ b/xen/include/asm-x86/hvm/domain.h > @@ -152,6 +152,10 @@ struct hvm_domain { > struct vmx_domain vmx; > struct svm_domain svm; > }; > + > + /* List of passed-through devices (hw domain only). */ > + struct list_head pt_devices; > + rwlock_t pt_lock; > }; > > #define hap_enabled(d) ((d)->arch.hvm_domain.hap_enabled) > diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h > index e9b3f83..80f830d 100644 > --- a/xen/include/asm-x86/hvm/io.h > +++ b/xen/include/asm-x86/hvm/io.h > @@ -153,6 +153,182 @@ extern void hvm_dpci_msi_eoi(struct domain *d, > int vector); > > void register_dpci_portio_handler(struct domain *d); > > +/* Structures for pci-passthrough state and handlers. */ > +struct hvm_pt_device; > +struct hvm_pt_reg_handler; > +struct hvm_pt_reg; > +struct hvm_pt_reg_group; > + > +/* Return code when register should be ignored. 
*/ > +#define HVM_PT_INVALID_REG 0xFFFFFFFF > + > +/* function type for config reg */ > +typedef int (*hvm_pt_conf_reg_init) > + (struct hvm_pt_device *, struct hvm_pt_reg_handler *, uint32_t > real_offset, > + uint32_t *data); > + > +typedef int (*hvm_pt_conf_dword_write) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint32_t *val, uint32_t dev_value, uint32_t valid_mask); > +typedef int (*hvm_pt_conf_word_write) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint16_t *val, uint16_t dev_value, uint16_t valid_mask); > +typedef int (*hvm_pt_conf_byte_write) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint8_t *val, uint8_t dev_value, uint8_t valid_mask); > +typedef int (*hvm_pt_conf_dword_read) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint32_t *val, uint32_t valid_mask); > +typedef int (*hvm_pt_conf_word_read) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint16_t *val, uint16_t valid_mask); > +typedef int (*hvm_pt_conf_byte_read) > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > + uint8_t *val, uint8_t valid_mask); > + > +typedef int (*hvm_pt_group_init) > + (struct hvm_pt_device *, struct hvm_pt_reg_group *); > + > +/* > + * Emulated register information. > + * > + * This should be shared between all the consumers that trap on accesses > + * to certain PCI registers. > + */ > +struct hvm_pt_reg_handler { > + uint32_t offset; > + uint32_t size; > + uint32_t init_val; > + /* reg reserved field mask (ON:reserved, OFF:defined) */ > + uint32_t res_mask; > + /* reg read only field mask (ON:RO/ROS, OFF:other) */ > + uint32_t ro_mask; > + /* reg read/write-1-clear field mask (ON:RW1C/RW1CS, OFF:other) */ > + uint32_t rw1c_mask; > + /* reg emulate field mask (ON:emu, OFF:passthrough) */ > + uint32_t emu_mask; > + hvm_pt_conf_reg_init init; > + /* read/write function pointer > + * for double_word/word/byte size */ > + union { > + struct { > + hvm_pt_conf_dword_write write; > + hvm_pt_conf_dword_read read; > + } dw; > + struct { > + hvm_pt_conf_word_write write; > + hvm_pt_conf_word_read read; > + } w; > + struct { > + hvm_pt_conf_byte_write write; > + hvm_pt_conf_byte_read read; > + } b; > + } u; > +}; > + > +struct hvm_pt_handler_init { > + struct hvm_pt_reg_handler *handlers; > + hvm_pt_group_init init; > +}; > + > +/* > + * Emulated register value. > + * > + * This is the representation of each specific emulated register. > + */ > +struct hvm_pt_reg { > + struct list_head entries; > + struct hvm_pt_reg_handler *handler; > + union { > + uint8_t byte; > + uint16_t word; > + uint32_t dword; > + } val; > +}; > + > +/* > + * Emulated register group. > + * > + * In order to speed up (and logically group) emulated registers search, > + * groups are used that represent specific emulated features, like MSI. > + */ > +struct hvm_pt_reg_group { > + struct list_head entries; > + uint32_t base_offset; > + uint8_t size; > + struct list_head registers; > +}; > + > +/* > + * Guest MSI information. > + * > + * MSI values set by the guest. > + */ > +struct hvm_pt_msi { > + uint16_t flags; > + uint32_t addr_lo; /* guest message address */ > + uint32_t addr_hi; /* guest message upper address */ > + uint16_t data; /* guest message data */ > + uint32_t ctrl_offset; /* saved control offset */ > + int pirq; /* guest pirq corresponding */ > + bool_t initialized; /* when guest MSI is initialized */ > + bool_t mapped; /* when pirq is mapped */ > +}; > + > +/* > + * Guest passed-through PCI device. 
> + */ > +struct hvm_pt_device { > + struct list_head entries; > + > + struct pci_dev *pdev; > + > + bool_t permissive; > + bool_t permissive_warned; > + > + /* MSI status. */ > + struct hvm_pt_msi msi; > + > + struct list_head register_groups; > +}; > + > +/* > + * The hierarchy of the above structures is the following: > + * > + * +---------------+ +---------------+ > + * | | entries | | ... > + * | hvm_pt_device +---------+ hvm_pt_device +----+ > + * | | | | > + * +-+-------------+ +---------------+ > + * | > + * | register_groups > + * | > + * +-v----------------+ +------------------+ > + * | | entries | | ... > + * | hvm_pt_reg_group +----------+ hvm_pt_reg_group +----+ > + * | | | | > + * +-+----------------+ +------------------+ > + * | > + * | registers > + * | > + * +-v----------+ +------------+ > + * | | entries | | ... > + * | hvm_pt_reg +------------+ hvm_pt_reg +----+ > + * | | | | > + * +-+----------+ +-+----------+ > + * | | > + * | handler | handler > + * | | > + * +-v------------------+ +-v------------------+ > + * | | | | > + * | hvm_pt_reg_handler | | hvm_pt_reg_handler | > + * | | | | > + * +--------------------+ +--------------------+ > + */ > + > +/* Helper to add passed-through devices to the hardware domain. */ > +int hwdom_add_device(struct pci_dev *pdev); > + > #endif /* __ASM_X86_HVM_IO_H__ */ > > > diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h > index f191773..b21a891 100644 > --- a/xen/include/xen/pci.h > +++ b/xen/include/xen/pci.h > @@ -90,6 +90,11 @@ struct pci_dev { > u64 vf_rlen[6]; > }; > > +/* Helper for printing pci_dev related messages. */ > +#define printk_pdev(pdev, lvl, fmt, ...) \ > + printk(lvl "PCI %04x:%02x:%02x.%u: " fmt, pdev->seg, pdev->bus, \ > + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), ##__VA_ARGS__) > + > #define for_each_pdev(domain, pdev) \ > list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list) > > -- > 2.7.4 (Apple Git-66)
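For readers following the quoted hw_dpci_portio_read()/write() handlers: the decode is the standard PCI type-1 configuration mechanism, where the value last written to port 0xCF8 selects bus/device/function plus a dword-aligned register, and the byte offset within the 0xCFC-0xCFF window supplies the low two bits of the config-space offset. A minimal standalone sketch of that arithmetic follows (not code from the series; the example address register value and port are made up):

    /*
     * Standalone illustration of the 0xCF8/0xCFC decode that
     * hw_dpci_get_device() and the hw_dpci_portio_* handlers rely on.
     * The cf8 value and the accessed port below are invented.
     */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t cf8 = 0x80001808;          /* enable | bus 0, dev 3, fn 0, reg 0x08 */
        unsigned int bus  = (cf8 >> 16) & 0xff;
        unsigned int slot = (cf8 >> 11) & 0x1f;
        unsigned int func = (cf8 >> 8) & 0x7;
        unsigned int port = 0xcfd;          /* guest does a 1-byte read from 0xCFD */

        /* Same arithmetic as "reg = (pci_cf8 & 0xfc) | (addr & 0x3)" in the quoted handlers. */
        unsigned int reg = (cf8 & 0xfc) | (port & 0x3);

        printf("%02x:%02x.%u, config offset 0x%02x\n", bus, slot, func, reg);
        return 0;
    }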
On Mon, Oct 03, 2016 at 10:54:54AM +0100, Paul Durrant wrote: > > -----Original Message----- > > From: Roger Pau Monne [mailto:roger.pau@citrix.com] > > Sent: 27 September 2016 16:57 > > To: xen-devel@lists.xenproject.org > > Cc: konrad.wilk@oracle.com; boris.ostrovsky@oracle.com; Roger Pau Monne > > <roger.pau@citrix.com>; Jan Beulich <jbeulich@suse.com>; Andrew Cooper > > <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com> > > Subject: [PATCH v2 20/30] xen/x86: add the basic infrastructure to import > > QEMU passthrough code > > > > Most of this code has been picked up from QEMU and modified so it can be > > plugged into the internal Xen IO handlers. The structure of the handlers has > > been keep quite similar to QEMU, so existing handlers can be imported > > without a lot of effort. > > > > If you lifted code from QEMU then one assumes there is no problem with license, but do you need to amend copyrights for any of the files where you put the code? License is GPL 2, same as Xen. For copyrights I have to admit I have no idea. The code is not imported as-is for obvious reasons, but the logic is mostly the same. I don't mind adding the copyright holders for all the code I've imported, they are: Copyright (c) 2007, Neocleus Corporation. Copyright (c) 2007, Intel Corporation. With different authors depending on the file. Adding Lars, Ian and George since they have more experience with copyrights. > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > > --- > > Cc: Jan Beulich <jbeulich@suse.com> > > Cc: Andrew Cooper <andrew.cooper3@citrix.com> > > Cc: Paul Durrant <paul.durrant@citrix.com> > > --- > > docs/misc/xen-command-line.markdown | 8 + > > xen/arch/x86/hvm/hvm.c | 2 + > > xen/arch/x86/hvm/io.c | 621 > > ++++++++++++++++++++++++++++++++++++ > > xen/include/asm-x86/hvm/domain.h | 4 + > > xen/include/asm-x86/hvm/io.h | 176 ++++++++++ > > xen/include/xen/pci.h | 5 + > > 6 files changed, 816 insertions(+) > > > > diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen- > > command-line.markdown > > index 59d7210..78130c8 100644 > > --- a/docs/misc/xen-command-line.markdown > > +++ b/docs/misc/xen-command-line.markdown > > @@ -670,6 +670,14 @@ Flag that makes a 64bit dom0 boot in PVH mode. No > > 32bit support at present. > > > > Flag that makes a dom0 boot in PVHv2 mode. > > > > +### dom0permissive > > +> `= <boolean>` > > + > > +> Default: `true` > > + > > +Select mode of PCI pass-through when using a PVHv2 Dom0, either > > permissive or > > +not. 
> > + > > ### dtuart (ARM) > > > `= path [:options]` > > > > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c > > index a291f82..bc4f7bc 100644 > > --- a/xen/arch/x86/hvm/hvm.c > > +++ b/xen/arch/x86/hvm/hvm.c > > @@ -632,6 +632,8 @@ int hvm_domain_initialise(struct domain *d) > > goto fail1; > > } > > memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE); > > + INIT_LIST_HEAD(&d->arch.hvm_domain.pt_devices); > > + rwlock_init(&d->arch.hvm_domain.pt_lock); > > } > > else > > d->arch.hvm_domain.io_bitmap = hvm_io_bitmap; > > diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c > > index 31d54dc..7de1de3 100644 > > --- a/xen/arch/x86/hvm/io.c > > +++ b/xen/arch/x86/hvm/io.c > > @@ -46,6 +46,10 @@ > > #include <xen/iocap.h> > > #include <public/hvm/ioreq.h> > > > > +/* Set permissive mode for HVM Dom0 PCI pass-through by default */ > > +static bool_t opt_dom0permissive = 1; > > +boolean_param("dom0permissive", opt_dom0permissive); > > + > > void send_timeoffset_req(unsigned long timeoff) > > { > > ioreq_t p = { > > @@ -258,12 +262,403 @@ static bool_t hw_dpci_portio_accept(const struct > > hvm_io_handler *handler, > > return 0; > > } > > > > +static struct hvm_pt_device *hw_dpci_get_device(struct domain *d) > > +{ > > + uint8_t bus, slot, func; > > + uint32_t addr; > > + struct hvm_pt_device *dev; > > + > > + /* Decode bus, slot and func. */ > > + addr = CF8_BDF(d->arch.pci_cf8); > > + bus = PCI_BUS(addr); > > + slot = PCI_SLOT(addr); > > + func = PCI_FUNC(addr); > > + > > + list_for_each_entry( dev, &d->arch.hvm_domain.pt_devices, entries ) > > + { > > + if ( dev->pdev->seg != 0 || dev->pdev->bus != bus || > > + dev->pdev->devfn != PCI_DEVFN(slot,func) ) > > + continue; > > + > > + return dev; > > + } > > + > > + return NULL; > > +} > > + > > +/* Dispatchers */ > > + > > +/* Find emulate register group entry */ > > +struct hvm_pt_reg_group *hvm_pt_find_reg_grp(struct hvm_pt_device > > *d, > > + uint32_t address) > > +{ > > + struct hvm_pt_reg_group *entry = NULL; > > + > > + /* Find register group entry */ > > + list_for_each_entry( entry, &d->register_groups, entries ) > > + { > > + /* check address */ > > + if ( (entry->base_offset <= address) > > + && ((entry->base_offset + entry->size) > address) ) > > + return entry; > > + } > > + > > + /* Group entry not found */ > > + return NULL; > > +} > > + > > +/* Find emulate register entry */ > > +struct hvm_pt_reg *hvm_pt_find_reg(struct hvm_pt_reg_group *reg_grp, > > + uint32_t address) > > +{ > > + struct hvm_pt_reg *reg_entry = NULL; > > + struct hvm_pt_reg_handler *handler = NULL; > > + uint32_t real_offset = 0; > > + > > + /* Find register entry */ > > + list_for_each_entry( reg_entry, ®_grp->registers, entries ) > > + { > > + handler = reg_entry->handler; > > + real_offset = reg_grp->base_offset + handler->offset; > > + /* Check address */ > > + if ( (real_offset <= address) > > + && ((real_offset + handler->size) > address) ) > > + return reg_entry; > > + } > > + > > + return NULL; > > +} > > + > > +static int hvm_pt_pci_config_access_check(struct hvm_pt_device *d, > > + uint32_t addr, int len) > > +{ > > + /* Check offset range */ > > + if ( addr >= 0xFF ) > > + { > > + printk_pdev(d->pdev, XENLOG_DEBUG, > > + "failed to access register with offset exceeding 0xFF. 
" > > + "(addr: 0x%02x, len: %d)\n", addr, len); > > + return -EDOM; > > + } > > + > > + /* Check read size */ > > + if ( (len != 1) && (len != 2) && (len != 4) ) > > + { > > + printk_pdev(d->pdev, XENLOG_DEBUG, > > + "failed to access register with invalid access length. " > > + "(addr: 0x%02x, len: %d)\n", addr, len); > > + return -EINVAL; > > + } > > + > > + /* Check offset alignment */ > > + if ( addr & (len - 1) ) > > + { > > + printk_pdev(d->pdev, XENLOG_DEBUG, > > + "failed to access register with invalid access size " > > + "alignment. (addr: 0x%02x, len: %d)\n", addr, len); > > + return -EINVAL; > > + } > > + > > + return 0; > > +} > > + > > +static int hvm_pt_pci_read_config(struct hvm_pt_device *d, uint32_t addr, > > + uint32_t *data, int len) > > +{ > > + uint32_t val = 0; > > + struct hvm_pt_reg_group *reg_grp_entry = NULL; > > + struct hvm_pt_reg *reg_entry = NULL; > > + int rc = 0; > > + int emul_len = 0; > > + uint32_t find_addr = addr; > > + unsigned int seg = d->pdev->seg; > > + unsigned int bus = d->pdev->bus; > > + unsigned int slot = PCI_SLOT(d->pdev->devfn); > > + unsigned int func = PCI_FUNC(d->pdev->devfn); > > + > > + /* Sanity checks. */ > > + if ( hvm_pt_pci_config_access_check(d, addr, len) ) > > + return X86EMUL_UNHANDLEABLE; > > + > > + /* Find register group entry. */ > > + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); > > + if ( reg_grp_entry == NULL ) > > + return X86EMUL_UNHANDLEABLE; > > + > > + /* Read I/O device register value. */ > > + switch( len ) > > + { > > + case 1: > > + val = pci_conf_read8(seg, bus, slot, func, addr); > > + break; > > + case 2: > > + val = pci_conf_read16(seg, bus, slot, func, addr); > > + break; > > + case 4: > > + val = pci_conf_read32(seg, bus, slot, func, addr); > > + break; > > + default: > > + BUG(); > > + } > > + > > + /* Adjust the read value to appropriate CFC-CFF window. */ > > + val <<= (addr & 3) << 3; > > + emul_len = len; > > + > > + /* Loop around the guest requested size. */ > > + while ( emul_len > 0 ) > > + { > > + /* Find register entry to be emulated. */ > > + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); > > + if ( reg_entry ) > > + { > > + struct hvm_pt_reg_handler *handler = reg_entry->handler; > > + uint32_t real_offset = reg_grp_entry->base_offset + handler- > > >offset; > > + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); > > Figuring out whether this makes sense makes my brain hurt. Any chance of some macro or at least comments about this? Right. What about: /* Create a bitmask from a given size (in bytes). */ #define HVM_PT_SIZE_TO_MASK(size) (0xFFFFFFFF >> ((4 - (size)) << 3)) > > + uint8_t *ptr_val = NULL; > > + > > + valid_mask <<= (find_addr - real_offset) << 3; > > + ptr_val = (uint8_t *)&val + (real_offset & 3); > > + > > + /* Do emulation based on register size. */ > > + switch ( handler->size ) > > + { > > + case 1: > > + if ( handler->u.b.read ) > > + rc = handler->u.b.read(d, reg_entry, ptr_val, valid_mask); > > + break; > > + case 2: > > + if ( handler->u.w.read ) > > + rc = handler->u.w.read(d, reg_entry, (uint16_t *)ptr_val, > > + valid_mask); > > + break; > > + case 4: > > + if ( handler->u.dw.read ) > > + rc = handler->u.dw.read(d, reg_entry, (uint32_t *)ptr_val, > > + valid_mask); > > + break; > > + } > > + > > + if ( rc < 0 ) > > + { > > + gdprintk(XENLOG_WARNING, > > + "Invalid read emulation, shutting down domain\n"); > > + domain_crash(current->domain); > > + return X86EMUL_UNHANDLEABLE; > > + } > > + > > + /* Calculate next address to find. 
*/ > > + emul_len -= handler->size; > > + if ( emul_len > 0 ) > > + find_addr = real_offset + handler->size; > > + } > > + else > > + { > > + /* Nothing to do with passthrough type register */ > > + emul_len--; > > + find_addr++; > > + } > > + } > > + > > + /* Need to shift back before returning them to pci bus emulator */ > > + val >>= ((addr & 3) << 3); > > + *data = val; > > + > > + return X86EMUL_OKAY; > > +} > > + > > +static int hvm_pt_pci_write_config(struct hvm_pt_device *d, uint32_t > > addr, > > + uint32_t val, int len) > > +{ > > + int index = 0; > > + struct hvm_pt_reg_group *reg_grp_entry = NULL; > > + int rc = 0; > > + uint32_t read_val = 0, wb_mask; > > + int emul_len = 0; > > + struct hvm_pt_reg *reg_entry = NULL; > > + uint32_t find_addr = addr; > > + struct hvm_pt_reg_handler *handler = NULL; > > + bool wp_flag = false; > > + unsigned int seg = d->pdev->seg; > > + unsigned int bus = d->pdev->bus; > > + unsigned int slot = PCI_SLOT(d->pdev->devfn); > > + unsigned int func = PCI_FUNC(d->pdev->devfn); > > + > > + /* Sanity checks. */ > > + if ( hvm_pt_pci_config_access_check(d, addr, len) ) > > + return X86EMUL_UNHANDLEABLE; > > + > > + /* Find register group entry. */ > > + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); > > + if ( reg_grp_entry == NULL ) > > + return X86EMUL_UNHANDLEABLE; > > + > > + /* Read I/O device register value. */ > > + switch( len ) > > + { > > + case 1: > > + read_val = pci_conf_read8(seg, bus, slot, func, addr); > > + break; > > + case 2: > > + read_val = pci_conf_read16(seg, bus, slot, func, addr); > > + break; > > + case 4: > > + read_val = pci_conf_read32(seg, bus, slot, func, addr); > > + break; > > + default: > > + BUG(); > > + } > > + wb_mask = 0xFFFFFFFF >> ((4 - len) << 3); > > + > > + /* Adjust the read and write value to appropriate CFC-CFF window */ > > + read_val <<= (addr & 3) << 3; > > + val <<= (addr & 3) << 3; > > + emul_len = len; > > + > > + /* Loop around the guest requested size */ > > + while ( emul_len > 0 ) > > + { > > + /* Find register entry to be emulated */ > > + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); > > + if ( reg_entry ) > > + { > > + handler = reg_entry->handler; > > + uint32_t real_offset = reg_grp_entry->base_offset + handler- > > >offset; > > + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); > > + uint8_t *ptr_val = NULL; > > + uint32_t wp_mask = handler->emu_mask | handler->ro_mask; > > + > > + valid_mask <<= (find_addr - real_offset) << 3; > > + ptr_val = (uint8_t *)&val + (real_offset & 3); > > + if ( !d->permissive ) > > + wp_mask |= handler->res_mask; > > + if ( wp_mask == (0xFFFFFFFF >> ((4 - handler->size) << 3)) ) > > + wb_mask &= ~((wp_mask >> ((find_addr - real_offset) << 3)) > > + << ((len - emul_len) << 3)); > > + > > + /* Do emulation based on register size */ > > + switch ( handler->size ) > > + { > > + case 1: > > + if ( handler->u.b.write ) > > + rc = handler->u.b.write(d, reg_entry, ptr_val, > > + read_val >> ((real_offset & 3) << 3), > > + valid_mask); > > + break; > > + case 2: > > + if ( handler->u.w.write ) > > + rc = handler->u.w.write(d, reg_entry, (uint16_t *)ptr_val, > > + (read_val >> ((real_offset & 3) << 3)), > > + valid_mask); > > + break; > > + case 4: > > + if ( handler->u.dw.write ) > > + rc = handler->u.dw.write(d, reg_entry, (uint32_t *)ptr_val, > > + (read_val >> ((real_offset & 3) << 3)), > > + valid_mask); > > + break; > > + } > > + > > + if ( rc < 0 ) > > + { > > + gdprintk(XENLOG_WARNING, > > + "Invalid write emulation, shutting down domain\n"); > > 
+ domain_crash(current->domain); > > + return X86EMUL_UNHANDLEABLE; > > + } > > + > > + /* Calculate next address to find */ > > + emul_len -= handler->size; > > + if ( emul_len > 0 ) > > + find_addr = real_offset + handler->size; > > + } > > + else > > + { > > + /* Nothing to do with passthrough type register */ > > + if ( !d->permissive ) > > + { > > + wb_mask &= ~(0xff << ((len - emul_len) << 3)); > > + /* > > + * Unused BARs will make it here, but we don't want to issue > > + * warnings for writes to them (bogus writes get dealt with > > + * above). > > + */ > > + if ( index < 0 ) > > + wp_flag = true; > > + } > > + emul_len--; > > + find_addr++; > > + } > > + } > > + > > + /* Need to shift back before passing them to xen_host_pci_set_block */ > > + val >>= (addr & 3) << 3; > > + > > + if ( wp_flag && !d->permissive_warned ) > > + { > > + d->permissive_warned = true; > > + gdprintk(XENLOG_WARNING, > > + "Write-back to unknown field 0x%02x (partially) inhibited (0x%0*x)\n", > > + addr, len * 2, wb_mask); > > + gdprintk(XENLOG_WARNING, > > + "If the device doesn't work, try enabling permissive mode\n"); > > + gdprintk(XENLOG_WARNING, > > + "(unsafe) and if it helps report the problem to xen-devel\n"); > > + } > > + for ( index = 0; wb_mask; index += len ) > > + { > > + /* Unknown regs are passed through */ > > + while ( !(wb_mask & 0xff) ) > > + { > > + index++; > > + wb_mask >>= 8; > > + } > > + len = 0; > > + do { > > + len++; > > + wb_mask >>= 8; > > + } while ( wb_mask & 0xff ); > > + > > + switch( len ) > > + { > > + case 1: > > + { > > + uint8_t value; > > + memcpy(&value, (uint8_t *)&val + index, 1); > > + pci_conf_write8(seg, bus, slot, func, addr + index, value); > > + break; > > + } > > + case 2: > > + { > > + uint16_t value; > > + memcpy(&value, (uint8_t *)&val + index, 2); > > + pci_conf_write16(seg, bus, slot, func, addr + index, value); > > + break; > > + } > > + case 4: > > + { > > + uint32_t value; > > + memcpy(&value, (uint8_t *)&val + index, 4); > > + pci_conf_write32(seg, bus, slot, func, addr + index, value); > > + break; > > + } > > + default: > > + BUG(); > > + } > > + } > > + return X86EMUL_OKAY; > > +} > > + > > static int hw_dpci_portio_read(const struct hvm_io_handler *handler, > > uint64_t addr, > > uint32_t size, > > uint64_t *data) > > { > > struct domain *currd = current->domain; > > + struct hvm_pt_device *dev; > > + uint32_t data32; > > + uint8_t reg; > > + int rc; > > > > if ( addr == 0xcf8 ) > > { > > @@ -276,6 +671,22 @@ static int hw_dpci_portio_read(const struct > > hvm_io_handler *handler, > > size = min(size, 4 - ((uint32_t)addr & 3)); > > if ( size == 3 ) > > size = 2; > > + > > + read_lock(&currd->arch.hvm_domain.pt_lock); > > + dev = hw_dpci_get_device(currd); > > + if ( dev != NULL ) > > + { > > + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); > > + rc = hvm_pt_pci_read_config(dev, reg, &data32, size); > > + if ( rc == X86EMUL_OKAY ) > > + { > > + read_unlock(&currd->arch.hvm_domain.pt_lock); > > + *data = data32; > > + return rc; > > + } > > + } > > + read_unlock(&currd->arch.hvm_domain.pt_lock); > > + > > if ( pci_cfg_ok(currd, addr & 3, size, NULL) ) > > *data = pci_conf_read(currd->arch.pci_cf8, addr & 3, size); > > > > @@ -288,7 +699,10 @@ static int hw_dpci_portio_write(const struct > > hvm_io_handler *handler, > > uint64_t data) > > { > > struct domain *currd = current->domain; > > + struct hvm_pt_device *dev; > > uint32_t data32; > > + uint8_t reg; > > + int rc; > > > > if ( addr == 0xcf8 ) > > { > > @@ -302,12 +716,219 @@ static int 
hw_dpci_portio_write(const struct > > hvm_io_handler *handler, > > if ( size == 3 ) > > size = 2; > > data32 = data; > > + > > + read_lock(&currd->arch.hvm_domain.pt_lock); > > + dev = hw_dpci_get_device(currd); > > + if ( dev != NULL ) > > + { > > + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); > > + rc = hvm_pt_pci_write_config(dev, reg, data32, size); > > + if ( rc == X86EMUL_OKAY ) > > + { > > + read_unlock(&currd->arch.hvm_domain.pt_lock); > > + return rc; > > + } > > + } > > + read_unlock(&currd->arch.hvm_domain.pt_lock); > > + > > I must be missing something here. Why are you adding passthrough code to the hardware domain's handlers? Surely it sees all devices anyway? Yes, but it cannot access some of the registers directly, for example Dom0 cannot configure the MSI registers, or else Xen would start receiving interrupts from unset vectors. All this is done so that Xen can detect accesses to sensible registers and perform appropriate actions. For example, following the MSI case, Xen will detects this accesses and setup and bind proper PIRQs for the guest. > > if ( pci_cfg_ok(currd, addr & 3, size, &data32) ) > > pci_conf_write(currd->arch.pci_cf8, addr & 3, size, data); > > > > return X86EMUL_OKAY; > > } > > > > +static void hvm_pt_free_device(struct hvm_pt_device *dev) > > +{ > > + struct hvm_pt_reg_group *group, *g; > > + > > + list_for_each_entry_safe( group, g, &dev->register_groups, entries ) > > + { > > + struct hvm_pt_reg *reg, *r; > > + > > + list_for_each_entry_safe( reg, r, &group->registers, entries ) > > + { > > + list_del(®->entries); > > + xfree(reg); > > + } > > + > > + list_del(&group->entries); > > + xfree(group); > > + } > > + > > + xfree(dev); > > +} > > + > > +static int hvm_pt_add_register(struct hvm_pt_device *dev, > > + struct hvm_pt_reg_group *group, > > + struct hvm_pt_reg_handler *handler) > > +{ > > + struct pci_dev *pdev = dev->pdev; > > + struct hvm_pt_reg *reg; > > + > > + reg = xmalloc(struct hvm_pt_reg); > > + if ( reg == NULL ) > > + return -ENOMEM; > > + > > + memset(reg, 0, sizeof(*reg)); > > xzalloc()? Thanks. > > + reg->handler = handler; > > + if ( handler->init != NULL ) > > + { > > + uint32_t host_mask, size_mask, data = 0; > > + uint8_t seg, bus, slot, func; > > + unsigned int offset; > > + uint32_t val; > > + int rc; > > + > > + /* Initialize emulate register */ > > + rc = handler->init(dev, reg->handler, > > + group->base_offset + reg->handler->offset, &data); > > + if ( rc < 0 ) > > + return rc; > > + > > + if ( data == HVM_PT_INVALID_REG ) > > + { > > + xfree(reg); > > + return 0; > > + } > > + > > + /* Sync up the data to val */ > > + offset = group->base_offset + reg->handler->offset; > > + size_mask = 0xFFFFFFFF >> ((4 - reg->handler->size) << 3); > > + > > + seg = pdev->seg; > > + bus = pdev->bus; > > + slot = PCI_SLOT(pdev->devfn); > > + func = PCI_FUNC(pdev->devfn); > > + > > + switch ( reg->handler->size ) > > + { > > + case 1: > > + val = pci_conf_read8(seg, bus, slot, func, offset); > > + break; > > + case 2: > > + val = pci_conf_read16(seg, bus, slot, func, offset); > > + break; > > + case 4: > > + val = pci_conf_read32(seg, bus, slot, func, offset); > > + break; > > + default: > > + BUG(); > > + } > > + > > + /* > > + * Set bits in emu_mask are the ones we emulate. The reg shall > > + * contain the emulated view of the guest - therefore we flip > > + * the mask to mask out the host values (which reg initially > > + * has). 
> > + */ > > + host_mask = size_mask & ~reg->handler->emu_mask; > > + > > + if ( (data & host_mask) != (val & host_mask) ) > > + { > > + uint32_t new_val; > > + > > + /* Mask out host (including past size). */ > > + new_val = val & host_mask; > > + /* Merge emulated ones (excluding the non-emulated ones). */ > > + new_val |= data & host_mask; > > + /* > > + * Leave intact host and emulated values past the size - > > + * even though we do not care as we write per reg->size > > + * granularity, but for the logging below lets have the > > + * proper value. > > + */ > > + new_val |= ((val | data)) & ~size_mask; > > + printk_pdev(pdev, XENLOG_ERR, > > +"offset 0x%04x mismatch! Emulated=0x%04x, host=0x%04x, syncing to > > 0x%04x.\n", > > + offset, data, val, new_val); > > + val = new_val; > > + } > > + else > > + val = data; > > + > > + if ( val & ~size_mask ) > > + { > > + printk_pdev(pdev, XENLOG_ERR, > > + "Offset 0x%04x:0x%04x expands past register size(%d)!\n", > > + offset, val, reg->handler->size); > > + return -EINVAL; > > + } > > + > > + reg->val.dword = val; > > + } > > + list_add_tail(®->entries, &group->registers); > > + > > + return 0; > > +} > > + > > +static struct hvm_pt_handler_init *hwdom_pt_handlers[] = { > > +}; > > + > > +int hwdom_add_device(struct pci_dev *pdev) > > +{ > > + struct domain *d = pdev->domain; > > + struct hvm_pt_device *dev; > > + int j, i, rc; > > + > > + ASSERT( is_hardware_domain(d) ); > > + ASSERT( pcidevs_locked() ); > > + > > + dev = xmalloc(struct hvm_pt_device); > > + if ( dev == NULL ) > > + return -ENOMEM; > > + > > + memset(dev, 0 , sizeof(*dev)); > > xzalloc()? Fixed. > > + > > + dev->pdev = pdev; > > + INIT_LIST_HEAD(&dev->register_groups); > > + > > + dev->permissive = opt_dom0permissive; > > + > > + for ( j = 0; j < ARRAY_SIZE(hwdom_pt_handlers); j++ ) > > + { > > + struct hvm_pt_handler_init *handler_init = hwdom_pt_handlers[j]; > > + struct hvm_pt_reg_group *group; > > + > > + group = xmalloc(struct hvm_pt_reg_group); > > + if ( group == NULL ) > > + { > > + xfree(dev); > > + return -ENOMEM; > > + } > > + INIT_LIST_HEAD(&group->registers); > > + > > + rc = handler_init->init(dev, group); > > + if ( rc == 0 ) > > + { > > + for ( i = 0; handler_init->handlers[i].size != 0; i++ ) > > + { > > + int rc; > > + > > + rc = hvm_pt_add_register(dev, group, > > + &handler_init->handlers[i]); > > + if ( rc ) > > + { > > + printk_pdev(pdev, XENLOG_ERR, "error adding register: %d\n", > > + rc); > > + hvm_pt_free_device(dev); > > + return rc; > > + } > > + } > > + > > + list_add_tail(&group->entries, &dev->register_groups); > > + } > > + else > > + xfree(group); > > + } > > + > > + write_lock(&d->arch.hvm_domain.pt_lock); > > + list_add_tail(&dev->entries, &d->arch.hvm_domain.pt_devices); > > + write_unlock(&d->arch.hvm_domain.pt_lock); > > + printk_pdev(pdev, XENLOG_DEBUG, "added for pass-through\n"); > > + > > + return 0; > > +} > > + > > static const struct hvm_io_ops dpci_portio_ops = { > > .accept = dpci_portio_accept, > > .read = dpci_portio_read, > > diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm- > > x86/hvm/domain.h > > index f34d784..1b1a52f 100644 > > --- a/xen/include/asm-x86/hvm/domain.h > > +++ b/xen/include/asm-x86/hvm/domain.h > > @@ -152,6 +152,10 @@ struct hvm_domain { > > struct vmx_domain vmx; > > struct svm_domain svm; > > }; > > + > > + /* List of passed-through devices (hw domain only). 
*/ > > + struct list_head pt_devices; > > + rwlock_t pt_lock; > > }; > > > > #define hap_enabled(d) ((d)->arch.hvm_domain.hap_enabled) > > diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h > > index e9b3f83..80f830d 100644 > > --- a/xen/include/asm-x86/hvm/io.h > > +++ b/xen/include/asm-x86/hvm/io.h > > @@ -153,6 +153,182 @@ extern void hvm_dpci_msi_eoi(struct domain *d, > > int vector); > > > > void register_dpci_portio_handler(struct domain *d); > > > > +/* Structures for pci-passthrough state and handlers. */ > > +struct hvm_pt_device; > > +struct hvm_pt_reg_handler; > > +struct hvm_pt_reg; > > +struct hvm_pt_reg_group; > > + > > +/* Return code when register should be ignored. */ > > +#define HVM_PT_INVALID_REG 0xFFFFFFFF > > + > > +/* function type for config reg */ > > +typedef int (*hvm_pt_conf_reg_init) > > + (struct hvm_pt_device *, struct hvm_pt_reg_handler *, uint32_t > > real_offset, > > + uint32_t *data); > > + > > +typedef int (*hvm_pt_conf_dword_write) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint32_t *val, uint32_t dev_value, uint32_t valid_mask); > > +typedef int (*hvm_pt_conf_word_write) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint16_t *val, uint16_t dev_value, uint16_t valid_mask); > > +typedef int (*hvm_pt_conf_byte_write) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint8_t *val, uint8_t dev_value, uint8_t valid_mask); > > +typedef int (*hvm_pt_conf_dword_read) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint32_t *val, uint32_t valid_mask); > > +typedef int (*hvm_pt_conf_word_read) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint16_t *val, uint16_t valid_mask); > > +typedef int (*hvm_pt_conf_byte_read) > > + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, > > + uint8_t *val, uint8_t valid_mask); > > + > > +typedef int (*hvm_pt_group_init) > > + (struct hvm_pt_device *, struct hvm_pt_reg_group *); > > + > > +/* > > + * Emulated register information. > > + * > > + * This should be shared between all the consumers that trap on accesses > > + * to certain PCI registers. > > + */ > > +struct hvm_pt_reg_handler { > > + uint32_t offset; > > + uint32_t size; > > + uint32_t init_val; > > + /* reg reserved field mask (ON:reserved, OFF:defined) */ > > + uint32_t res_mask; > > + /* reg read only field mask (ON:RO/ROS, OFF:other) */ > > + uint32_t ro_mask; > > + /* reg read/write-1-clear field mask (ON:RW1C/RW1CS, OFF:other) */ > > + uint32_t rw1c_mask; > > + /* reg emulate field mask (ON:emu, OFF:passthrough) */ > > + uint32_t emu_mask; > > + hvm_pt_conf_reg_init init; > > + /* read/write function pointer > > + * for double_word/word/byte size */ > > + union { > > + struct { > > + hvm_pt_conf_dword_write write; > > + hvm_pt_conf_dword_read read; > > + } dw; > > + struct { > > + hvm_pt_conf_word_write write; > > + hvm_pt_conf_word_read read; > > + } w; > > + struct { > > + hvm_pt_conf_byte_write write; > > + hvm_pt_conf_byte_read read; > > + } b; > > + } u; > > +}; > > + > > +struct hvm_pt_handler_init { > > + struct hvm_pt_reg_handler *handlers; > > + hvm_pt_group_init init; > > +}; > > + > > +/* > > + * Emulated register value. > > + * > > + * This is the representation of each specific emulated register. 
> > + */ > > +struct hvm_pt_reg { > > + struct list_head entries; > > + struct hvm_pt_reg_handler *handler; > > + union { > > + uint8_t byte; > > + uint16_t word; > > + uint32_t dword; > > + } val; > > +}; > > + > > +/* > > + * Emulated register group. > > + * > > + * In order to speed up (and logically group) emulated registers search, > > + * groups are used that represent specific emulated features, like MSI. > > + */ > > +struct hvm_pt_reg_group { > > + struct list_head entries; > > + uint32_t base_offset; > > + uint8_t size; > > + struct list_head registers; > > +}; > > + > > +/* > > + * Guest MSI information. > > + * > > + * MSI values set by the guest. > > + */ > > +struct hvm_pt_msi { > > + uint16_t flags; > > + uint32_t addr_lo; /* guest message address */ > > + uint32_t addr_hi; /* guest message upper address */ > > + uint16_t data; /* guest message data */ > > + uint32_t ctrl_offset; /* saved control offset */ > > + int pirq; /* guest pirq corresponding */ > > + bool_t initialized; /* when guest MSI is initialized */ > > + bool_t mapped; /* when pirq is mapped */ > > +}; > > + > > +/* > > + * Guest passed-through PCI device. > > + */ > > +struct hvm_pt_device { > > + struct list_head entries; > > + > > + struct pci_dev *pdev; > > + > > + bool_t permissive; > > + bool_t permissive_warned; > > + > > + /* MSI status. */ > > + struct hvm_pt_msi msi; > > + > > + struct list_head register_groups; > > +}; > > + > > +/* > > + * The hierarchy of the above structures is the following: > > + * > > + * +---------------+ +---------------+ > > + * | | entries | | ... > > + * | hvm_pt_device +---------+ hvm_pt_device +----+ > > + * | | | | > > + * +-+-------------+ +---------------+ > > + * | > > + * | register_groups > > + * | > > + * +-v----------------+ +------------------+ > > + * | | entries | | ... > > + * | hvm_pt_reg_group +----------+ hvm_pt_reg_group +----+ > > + * | | | | > > + * +-+----------------+ +------------------+ > > + * | > > + * | registers > > + * | > > + * +-v----------+ +------------+ > > + * | | entries | | ... > > + * | hvm_pt_reg +------------+ hvm_pt_reg +----+ > > + * | | | | > > + * +-+----------+ +-+----------+ > > + * | | > > + * | handler | handler > > + * | | > > + * +-v------------------+ +-v------------------+ > > + * | | | | > > + * | hvm_pt_reg_handler | | hvm_pt_reg_handler | > > + * | | | | > > + * +--------------------+ +--------------------+ > > + */ > > + > > +/* Helper to add passed-through devices to the hardware domain. */ > > +int hwdom_add_device(struct pci_dev *pdev); > > + > > #endif /* __ASM_X86_HVM_IO_H__ */ > > > > > > diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h > > index f191773..b21a891 100644 > > --- a/xen/include/xen/pci.h > > +++ b/xen/include/xen/pci.h > > @@ -90,6 +90,11 @@ struct pci_dev { > > u64 vf_rlen[6]; > > }; > > > > +/* Helper for printing pci_dev related messages. */ > > +#define printk_pdev(pdev, lvl, fmt, ...) \ > > + printk(lvl "PCI %04x:%02x:%02x.%u: " fmt, pdev->seg, pdev->bus, \ > > + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), ##__VA_ARGS__) > > + > > #define for_each_pdev(domain, pdev) \ > > list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list) > > > > -- > > 2.7.4 (Apple Git-66) >
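To make the mask arithmetic Paul asked about concrete: assuming the HVM_PT_SIZE_TO_MASK() macro lands as proposed above, a short standalone check of the values it produces and of the valid_mask alignment done in the dispatch loop (the register offsets below are invented):

    /*
     * Worked example of the proposed HVM_PT_SIZE_TO_MASK() macro and of the
     * valid_mask shifting used by hvm_pt_pci_read_config()/write_config().
     * Standalone sketch; the offsets are invented.
     */
    #include <assert.h>
    #include <stdint.h>

    #define HVM_PT_SIZE_TO_MASK(size) (0xFFFFFFFF >> ((4 - (size)) << 3))

    int main(void)
    {
        /* Access size in bytes -> byte-lane mask within one config dword. */
        assert(HVM_PT_SIZE_TO_MASK(1) == 0x000000FF);
        assert(HVM_PT_SIZE_TO_MASK(2) == 0x0000FFFF);
        assert(HVM_PT_SIZE_TO_MASK(4) == 0xFFFFFFFF);

        /*
         * A 2-byte access starting one byte past the emulated register's
         * real offset: the mask is built from the remaining length and then
         * shifted into the byte lanes actually being accessed, exactly as in
         * the quoted loop.
         */
        unsigned int emul_len = 2, find_addr = 0x51, real_offset = 0x50;
        uint32_t valid_mask = HVM_PT_SIZE_TO_MASK(emul_len);

        valid_mask <<= (find_addr - real_offset) << 3;
        assert(valid_mask == 0x00FFFF00);

        return 0;
    }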
>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > Most of this code has been picked up from QEMU and modified so it can be > plugged into the internal Xen IO handlers. The structure of the handlers has > been kept quite similar to QEMU, so existing handlers can be imported > without a lot of effort. Without looking at the code in any detail, the question that Paul has raised really needs to be answered first: Why is pass-through code needed when Dom0 has access to (almost) all devices by default anyway? Jan
On 06/10/2016 17:08, "Roger Pau Monne" <roger.pau@citrix.com> wrote: >On Mon, Oct 03, 2016 at 10:54:54AM +0100, Paul Durrant wrote: >> > -----Original Message----- >> > From: Roger Pau Monne [mailto:roger.pau@citrix.com] >> > Sent: 27 September 2016 16:57 >> > To: xen-devel@lists.xenproject.org >> > Cc: konrad.wilk@oracle.com; boris.ostrovsky@oracle.com; Roger Pau >>Monne >> > <roger.pau@citrix.com>; Jan Beulich <jbeulich@suse.com>; Andrew Cooper >> > <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com> >> > Subject: [PATCH v2 20/30] xen/x86: add the basic infrastructure to >>import >> > QEMU passthrough code >> > >> > Most of this code has been picked up from QEMU and modified so it can >>be >> > plugged into the internal Xen IO handlers. The structure of the >>handlers has >> > been kept quite similar to QEMU, so existing handlers can be imported >> > without a lot of effort. >> > >> >> If you lifted code from QEMU then one assumes there is no problem with >>license, but do you need to amend copyrights for any of the files where >>you put the code? > >License is GPL 2, same as Xen. For copyrights I have to admit I have no >idea. The code is not imported as-is for obvious reasons, but the logic >is >mostly the same. I don't mind adding the copyright holders for all the >code >I've imported, they are: > >Copyright (c) 2007, Neocleus Corporation. >Copyright (c) 2007, Intel Corporation. For imported code, you should keep the (c) header as is, adapt the coding style, and then add a Copyright (c) 2016, ... if you are making significant modifications. You should also create a README.source file (or add to one in that part of the tree), which tracks where the code came from (e.g. QEMU in this case, referring to the source file), so that it becomes easier if someone needs to go back to the origin at some point. The commit message should also contain that information. Lars
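To make that concrete, a minimal sketch of what the amended header of a file carrying the imported handlers could look like; the 2016 holder and the QEMU file name shown here are placeholders rather than anything settled in this thread:

/*
 * Copyright (c) 2007, Neocleus Corporation.
 * Copyright (c) 2007, Intel Corporation.
 * Copyright (c) 2016, <holder of the significant modifications>.
 *
 * Derived from QEMU's Xen PCI passthrough code (e.g. xen_pt_config_init.c),
 * adapted to the Xen coding style and the internal IO handler interfaces.
 * See README.source in this directory for provenance details.
 */

The matching README.source entry (and the commit message) would then name the QEMU tree and file(s) the handlers were taken from, so the origin stays traceable.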
>>> On 06.10.16 at 17:08, <roger.pau@citrix.com> wrote: > On Mon, Oct 03, 2016 at 10:54:54AM +0100, Paul Durrant wrote: To both of you: Please limit the quoting in your replies. Thanks, Jan
>>> On 27.09.16 at 17:57, <roger.pau@citrix.com> wrote: > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -632,6 +632,8 @@ int hvm_domain_initialise(struct domain *d) > goto fail1; > } > memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE); > + INIT_LIST_HEAD(&d->arch.hvm_domain.pt_devices); This field appears to be redundant with arch.pdev_list. > --- a/xen/arch/x86/hvm/io.c > +++ b/xen/arch/x86/hvm/io.c > @@ -46,6 +46,10 @@ > #include <xen/iocap.h> > #include <public/hvm/ioreq.h> > > +/* Set permissive mode for HVM Dom0 PCI pass-through by default */ > +static bool_t opt_dom0permissive = 1; Plain bool / true / false please. And as mentioned by Andrew, we should stop adding more dom0xyz options, and use a consolidated dom0= one instead. > @@ -258,12 +262,403 @@ static bool_t hw_dpci_portio_accept(const struct > hvm_io_handler *handler, > return 0; > } > > +static struct hvm_pt_device *hw_dpci_get_device(struct domain *d) > +{ > + uint8_t bus, slot, func; > + uint32_t addr; > + struct hvm_pt_device *dev; > + > + /* Decode bus, slot and func. */ > + addr = CF8_BDF(d->arch.pci_cf8); > + bus = PCI_BUS(addr); > + slot = PCI_SLOT(addr); > + func = PCI_FUNC(addr); > + > + list_for_each_entry( dev, &d->arch.hvm_domain.pt_devices, entries ) > + { > + if ( dev->pdev->seg != 0 || dev->pdev->bus != bus || Okay, there's no way segments other than 0 can be handled here. But having glanced over the titles of the rest of the series - where are those going to be handled (read: Where is the MCFG code, which qemu doesn't have)? Also I think the function name is not well chosen: Its prefix suggests some kind of "official" interface, yet it is really just an internal helper which doesn't even "get" a device in the general sense of needing to "put" it later on. And it looks like the parameter could be constified (but this appears to be a wider problem). > +/* Dispatchers */ > + > +/* Find emulate register group entry */ > +struct hvm_pt_reg_group *hvm_pt_find_reg_grp(struct hvm_pt_device *d, > + uint32_t address) Please don't needlessly use fixed width types. > +{ > + struct hvm_pt_reg_group *entry = NULL; > + > + /* Find register group entry */ > + list_for_each_entry( entry, &d->register_groups, entries ) > + { > + /* check address */ > + if ( (entry->base_offset <= address) > + && ((entry->base_offset + entry->size) > address) ) Coding style (&& belongs on the previous line). And actually I guess I'll stop here, realizing that I'm completely unconvinced of the intentions, which haven't even been spelled out. The lifting of code from qemu alone is problematic imo: That code has proven to have many issues, only the most severe of which have been fixed over time. I'm therefore of the opinion that a clean re-write from scratch should at least be considered, once what the behavior actually ought to be has been written down somewhere (docs/misc/hvmlite.markdown?) and agreed upon. Jan
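Not part of the patch, but to make the style points above concrete, a sketch of how the flagged fragments might look once the comments are addressed (plain bool, operator placement per Xen coding style, non-fixed-width and constified parameters); the consolidated dom0= option is left aside as a separate change:

/* Boot option using plain bool, as requested. */
static bool opt_dom0permissive = true;
boolean_param("dom0permissive", opt_dom0permissive);

/* Xen style keeps the boolean operator at the end of the split line. */
if ( entry->base_offset <= address &&
     entry->base_offset + entry->size > address )
    return entry;

/* Lookup helpers could take plain unsigned int offsets and a const
 * device pointer, e.g.: */
struct hvm_pt_reg_group *hvm_pt_find_reg_grp(const struct hvm_pt_device *d,
                                             unsigned int address);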
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown index 59d7210..78130c8 100644 --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -670,6 +670,14 @@ Flag that makes a 64bit dom0 boot in PVH mode. No 32bit support at present. Flag that makes a dom0 boot in PVHv2 mode. +### dom0permissive +> `= <boolean>` + +> Default: `true` + +Select mode of PCI pass-through when using a PVHv2 Dom0, either permissive or +not. + ### dtuart (ARM) > `= path [:options]` diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index a291f82..bc4f7bc 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -632,6 +632,8 @@ int hvm_domain_initialise(struct domain *d) goto fail1; } memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE); + INIT_LIST_HEAD(&d->arch.hvm_domain.pt_devices); + rwlock_init(&d->arch.hvm_domain.pt_lock); } else d->arch.hvm_domain.io_bitmap = hvm_io_bitmap; diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c index 31d54dc..7de1de3 100644 --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -46,6 +46,10 @@ #include <xen/iocap.h> #include <public/hvm/ioreq.h> +/* Set permissive mode for HVM Dom0 PCI pass-through by default */ +static bool_t opt_dom0permissive = 1; +boolean_param("dom0permissive", opt_dom0permissive); + void send_timeoffset_req(unsigned long timeoff) { ioreq_t p = { @@ -258,12 +262,403 @@ static bool_t hw_dpci_portio_accept(const struct hvm_io_handler *handler, return 0; } +static struct hvm_pt_device *hw_dpci_get_device(struct domain *d) +{ + uint8_t bus, slot, func; + uint32_t addr; + struct hvm_pt_device *dev; + + /* Decode bus, slot and func. */ + addr = CF8_BDF(d->arch.pci_cf8); + bus = PCI_BUS(addr); + slot = PCI_SLOT(addr); + func = PCI_FUNC(addr); + + list_for_each_entry( dev, &d->arch.hvm_domain.pt_devices, entries ) + { + if ( dev->pdev->seg != 0 || dev->pdev->bus != bus || + dev->pdev->devfn != PCI_DEVFN(slot,func) ) + continue; + + return dev; + } + + return NULL; +} + +/* Dispatchers */ + +/* Find emulate register group entry */ +struct hvm_pt_reg_group *hvm_pt_find_reg_grp(struct hvm_pt_device *d, + uint32_t address) +{ + struct hvm_pt_reg_group *entry = NULL; + + /* Find register group entry */ + list_for_each_entry( entry, &d->register_groups, entries ) + { + /* check address */ + if ( (entry->base_offset <= address) + && ((entry->base_offset + entry->size) > address) ) + return entry; + } + + /* Group entry not found */ + return NULL; +} + +/* Find emulate register entry */ +struct hvm_pt_reg *hvm_pt_find_reg(struct hvm_pt_reg_group *reg_grp, + uint32_t address) +{ + struct hvm_pt_reg *reg_entry = NULL; + struct hvm_pt_reg_handler *handler = NULL; + uint32_t real_offset = 0; + + /* Find register entry */ + list_for_each_entry( reg_entry, ®_grp->registers, entries ) + { + handler = reg_entry->handler; + real_offset = reg_grp->base_offset + handler->offset; + /* Check address */ + if ( (real_offset <= address) + && ((real_offset + handler->size) > address) ) + return reg_entry; + } + + return NULL; +} + +static int hvm_pt_pci_config_access_check(struct hvm_pt_device *d, + uint32_t addr, int len) +{ + /* Check offset range */ + if ( addr >= 0xFF ) + { + printk_pdev(d->pdev, XENLOG_DEBUG, + "failed to access register with offset exceeding 0xFF. 
" + "(addr: 0x%02x, len: %d)\n", addr, len); + return -EDOM; + } + + /* Check read size */ + if ( (len != 1) && (len != 2) && (len != 4) ) + { + printk_pdev(d->pdev, XENLOG_DEBUG, + "failed to access register with invalid access length. " + "(addr: 0x%02x, len: %d)\n", addr, len); + return -EINVAL; + } + + /* Check offset alignment */ + if ( addr & (len - 1) ) + { + printk_pdev(d->pdev, XENLOG_DEBUG, + "failed to access register with invalid access size " + "alignment. (addr: 0x%02x, len: %d)\n", addr, len); + return -EINVAL; + } + + return 0; +} + +static int hvm_pt_pci_read_config(struct hvm_pt_device *d, uint32_t addr, + uint32_t *data, int len) +{ + uint32_t val = 0; + struct hvm_pt_reg_group *reg_grp_entry = NULL; + struct hvm_pt_reg *reg_entry = NULL; + int rc = 0; + int emul_len = 0; + uint32_t find_addr = addr; + unsigned int seg = d->pdev->seg; + unsigned int bus = d->pdev->bus; + unsigned int slot = PCI_SLOT(d->pdev->devfn); + unsigned int func = PCI_FUNC(d->pdev->devfn); + + /* Sanity checks. */ + if ( hvm_pt_pci_config_access_check(d, addr, len) ) + return X86EMUL_UNHANDLEABLE; + + /* Find register group entry. */ + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); + if ( reg_grp_entry == NULL ) + return X86EMUL_UNHANDLEABLE; + + /* Read I/O device register value. */ + switch( len ) + { + case 1: + val = pci_conf_read8(seg, bus, slot, func, addr); + break; + case 2: + val = pci_conf_read16(seg, bus, slot, func, addr); + break; + case 4: + val = pci_conf_read32(seg, bus, slot, func, addr); + break; + default: + BUG(); + } + + /* Adjust the read value to appropriate CFC-CFF window. */ + val <<= (addr & 3) << 3; + emul_len = len; + + /* Loop around the guest requested size. */ + while ( emul_len > 0 ) + { + /* Find register entry to be emulated. */ + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); + if ( reg_entry ) + { + struct hvm_pt_reg_handler *handler = reg_entry->handler; + uint32_t real_offset = reg_grp_entry->base_offset + handler->offset; + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); + uint8_t *ptr_val = NULL; + + valid_mask <<= (find_addr - real_offset) << 3; + ptr_val = (uint8_t *)&val + (real_offset & 3); + + /* Do emulation based on register size. */ + switch ( handler->size ) + { + case 1: + if ( handler->u.b.read ) + rc = handler->u.b.read(d, reg_entry, ptr_val, valid_mask); + break; + case 2: + if ( handler->u.w.read ) + rc = handler->u.w.read(d, reg_entry, (uint16_t *)ptr_val, + valid_mask); + break; + case 4: + if ( handler->u.dw.read ) + rc = handler->u.dw.read(d, reg_entry, (uint32_t *)ptr_val, + valid_mask); + break; + } + + if ( rc < 0 ) + { + gdprintk(XENLOG_WARNING, + "Invalid read emulation, shutting down domain\n"); + domain_crash(current->domain); + return X86EMUL_UNHANDLEABLE; + } + + /* Calculate next address to find. 
*/ + emul_len -= handler->size; + if ( emul_len > 0 ) + find_addr = real_offset + handler->size; + } + else + { + /* Nothing to do with passthrough type register */ + emul_len--; + find_addr++; + } + } + + /* Need to shift back before returning them to pci bus emulator */ + val >>= ((addr & 3) << 3); + *data = val; + + return X86EMUL_OKAY; +} + +static int hvm_pt_pci_write_config(struct hvm_pt_device *d, uint32_t addr, + uint32_t val, int len) +{ + int index = 0; + struct hvm_pt_reg_group *reg_grp_entry = NULL; + int rc = 0; + uint32_t read_val = 0, wb_mask; + int emul_len = 0; + struct hvm_pt_reg *reg_entry = NULL; + uint32_t find_addr = addr; + struct hvm_pt_reg_handler *handler = NULL; + bool wp_flag = false; + unsigned int seg = d->pdev->seg; + unsigned int bus = d->pdev->bus; + unsigned int slot = PCI_SLOT(d->pdev->devfn); + unsigned int func = PCI_FUNC(d->pdev->devfn); + + /* Sanity checks. */ + if ( hvm_pt_pci_config_access_check(d, addr, len) ) + return X86EMUL_UNHANDLEABLE; + + /* Find register group entry. */ + reg_grp_entry = hvm_pt_find_reg_grp(d, addr); + if ( reg_grp_entry == NULL ) + return X86EMUL_UNHANDLEABLE; + + /* Read I/O device register value. */ + switch( len ) + { + case 1: + read_val = pci_conf_read8(seg, bus, slot, func, addr); + break; + case 2: + read_val = pci_conf_read16(seg, bus, slot, func, addr); + break; + case 4: + read_val = pci_conf_read32(seg, bus, slot, func, addr); + break; + default: + BUG(); + } + wb_mask = 0xFFFFFFFF >> ((4 - len) << 3); + + /* Adjust the read and write value to appropriate CFC-CFF window */ + read_val <<= (addr & 3) << 3; + val <<= (addr & 3) << 3; + emul_len = len; + + /* Loop around the guest requested size */ + while ( emul_len > 0 ) + { + /* Find register entry to be emulated */ + reg_entry = hvm_pt_find_reg(reg_grp_entry, find_addr); + if ( reg_entry ) + { + handler = reg_entry->handler; + uint32_t real_offset = reg_grp_entry->base_offset + handler->offset; + uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3); + uint8_t *ptr_val = NULL; + uint32_t wp_mask = handler->emu_mask | handler->ro_mask; + + valid_mask <<= (find_addr - real_offset) << 3; + ptr_val = (uint8_t *)&val + (real_offset & 3); + if ( !d->permissive ) + wp_mask |= handler->res_mask; + if ( wp_mask == (0xFFFFFFFF >> ((4 - handler->size) << 3)) ) + wb_mask &= ~((wp_mask >> ((find_addr - real_offset) << 3)) + << ((len - emul_len) << 3)); + + /* Do emulation based on register size */ + switch ( handler->size ) + { + case 1: + if ( handler->u.b.write ) + rc = handler->u.b.write(d, reg_entry, ptr_val, + read_val >> ((real_offset & 3) << 3), + valid_mask); + break; + case 2: + if ( handler->u.w.write ) + rc = handler->u.w.write(d, reg_entry, (uint16_t *)ptr_val, + (read_val >> ((real_offset & 3) << 3)), + valid_mask); + break; + case 4: + if ( handler->u.dw.write ) + rc = handler->u.dw.write(d, reg_entry, (uint32_t *)ptr_val, + (read_val >> ((real_offset & 3) << 3)), + valid_mask); + break; + } + + if ( rc < 0 ) + { + gdprintk(XENLOG_WARNING, + "Invalid write emulation, shutting down domain\n"); + domain_crash(current->domain); + return X86EMUL_UNHANDLEABLE; + } + + /* Calculate next address to find */ + emul_len -= handler->size; + if ( emul_len > 0 ) + find_addr = real_offset + handler->size; + } + else + { + /* Nothing to do with passthrough type register */ + if ( !d->permissive ) + { + wb_mask &= ~(0xff << ((len - emul_len) << 3)); + /* + * Unused BARs will make it here, but we don't want to issue + * warnings for writes to them (bogus writes get dealt 
with + * above). + */ + if ( index < 0 ) + wp_flag = true; + } + emul_len--; + find_addr++; + } + } + + /* Need to shift back before passing them to xen_host_pci_set_block */ + val >>= (addr & 3) << 3; + + if ( wp_flag && !d->permissive_warned ) + { + d->permissive_warned = true; + gdprintk(XENLOG_WARNING, + "Write-back to unknown field 0x%02x (partially) inhibited (0x%0*x)\n", + addr, len * 2, wb_mask); + gdprintk(XENLOG_WARNING, + "If the device doesn't work, try enabling permissive mode\n"); + gdprintk(XENLOG_WARNING, + "(unsafe) and if it helps report the problem to xen-devel\n"); + } + for ( index = 0; wb_mask; index += len ) + { + /* Unknown regs are passed through */ + while ( !(wb_mask & 0xff) ) + { + index++; + wb_mask >>= 8; + } + len = 0; + do { + len++; + wb_mask >>= 8; + } while ( wb_mask & 0xff ); + + switch( len ) + { + case 1: + { + uint8_t value; + memcpy(&value, (uint8_t *)&val + index, 1); + pci_conf_write8(seg, bus, slot, func, addr + index, value); + break; + } + case 2: + { + uint16_t value; + memcpy(&value, (uint8_t *)&val + index, 2); + pci_conf_write16(seg, bus, slot, func, addr + index, value); + break; + } + case 4: + { + uint32_t value; + memcpy(&value, (uint8_t *)&val + index, 4); + pci_conf_write32(seg, bus, slot, func, addr + index, value); + break; + } + default: + BUG(); + } + } + return X86EMUL_OKAY; +} + static int hw_dpci_portio_read(const struct hvm_io_handler *handler, uint64_t addr, uint32_t size, uint64_t *data) { struct domain *currd = current->domain; + struct hvm_pt_device *dev; + uint32_t data32; + uint8_t reg; + int rc; if ( addr == 0xcf8 ) { @@ -276,6 +671,22 @@ static int hw_dpci_portio_read(const struct hvm_io_handler *handler, size = min(size, 4 - ((uint32_t)addr & 3)); if ( size == 3 ) size = 2; + + read_lock(&currd->arch.hvm_domain.pt_lock); + dev = hw_dpci_get_device(currd); + if ( dev != NULL ) + { + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); + rc = hvm_pt_pci_read_config(dev, reg, &data32, size); + if ( rc == X86EMUL_OKAY ) + { + read_unlock(&currd->arch.hvm_domain.pt_lock); + *data = data32; + return rc; + } + } + read_unlock(&currd->arch.hvm_domain.pt_lock); + if ( pci_cfg_ok(currd, addr & 3, size, NULL) ) *data = pci_conf_read(currd->arch.pci_cf8, addr & 3, size); @@ -288,7 +699,10 @@ static int hw_dpci_portio_write(const struct hvm_io_handler *handler, uint64_t data) { struct domain *currd = current->domain; + struct hvm_pt_device *dev; uint32_t data32; + uint8_t reg; + int rc; if ( addr == 0xcf8 ) { @@ -302,12 +716,219 @@ static int hw_dpci_portio_write(const struct hvm_io_handler *handler, if ( size == 3 ) size = 2; data32 = data; + + read_lock(&currd->arch.hvm_domain.pt_lock); + dev = hw_dpci_get_device(currd); + if ( dev != NULL ) + { + reg = (currd->arch.pci_cf8 & 0xfc) | (addr & 0x3); + rc = hvm_pt_pci_write_config(dev, reg, data32, size); + if ( rc == X86EMUL_OKAY ) + { + read_unlock(&currd->arch.hvm_domain.pt_lock); + return rc; + } + } + read_unlock(&currd->arch.hvm_domain.pt_lock); + if ( pci_cfg_ok(currd, addr & 3, size, &data32) ) pci_conf_write(currd->arch.pci_cf8, addr & 3, size, data); return X86EMUL_OKAY; } +static void hvm_pt_free_device(struct hvm_pt_device *dev) +{ + struct hvm_pt_reg_group *group, *g; + + list_for_each_entry_safe( group, g, &dev->register_groups, entries ) + { + struct hvm_pt_reg *reg, *r; + + list_for_each_entry_safe( reg, r, &group->registers, entries ) + { + list_del(®->entries); + xfree(reg); + } + + list_del(&group->entries); + xfree(group); + } + + xfree(dev); +} + +static int 
hvm_pt_add_register(struct hvm_pt_device *dev, + struct hvm_pt_reg_group *group, + struct hvm_pt_reg_handler *handler) +{ + struct pci_dev *pdev = dev->pdev; + struct hvm_pt_reg *reg; + + reg = xmalloc(struct hvm_pt_reg); + if ( reg == NULL ) + return -ENOMEM; + + memset(reg, 0, sizeof(*reg)); + reg->handler = handler; + if ( handler->init != NULL ) + { + uint32_t host_mask, size_mask, data = 0; + uint8_t seg, bus, slot, func; + unsigned int offset; + uint32_t val; + int rc; + + /* Initialize emulate register */ + rc = handler->init(dev, reg->handler, + group->base_offset + reg->handler->offset, &data); + if ( rc < 0 ) + return rc; + + if ( data == HVM_PT_INVALID_REG ) + { + xfree(reg); + return 0; + } + + /* Sync up the data to val */ + offset = group->base_offset + reg->handler->offset; + size_mask = 0xFFFFFFFF >> ((4 - reg->handler->size) << 3); + + seg = pdev->seg; + bus = pdev->bus; + slot = PCI_SLOT(pdev->devfn); + func = PCI_FUNC(pdev->devfn); + + switch ( reg->handler->size ) + { + case 1: + val = pci_conf_read8(seg, bus, slot, func, offset); + break; + case 2: + val = pci_conf_read16(seg, bus, slot, func, offset); + break; + case 4: + val = pci_conf_read32(seg, bus, slot, func, offset); + break; + default: + BUG(); + } + + /* + * Set bits in emu_mask are the ones we emulate. The reg shall + * contain the emulated view of the guest - therefore we flip + * the mask to mask out the host values (which reg initially + * has). + */ + host_mask = size_mask & ~reg->handler->emu_mask; + + if ( (data & host_mask) != (val & host_mask) ) + { + uint32_t new_val; + + /* Mask out host (including past size). */ + new_val = val & host_mask; + /* Merge emulated ones (excluding the non-emulated ones). */ + new_val |= data & host_mask; + /* + * Leave intact host and emulated values past the size - + * even though we do not care as we write per reg->size + * granularity, but for the logging below lets have the + * proper value. + */ + new_val |= ((val | data)) & ~size_mask; + printk_pdev(pdev, XENLOG_ERR, +"offset 0x%04x mismatch! 
Emulated=0x%04x, host=0x%04x, syncing to 0x%04x.\n", + offset, data, val, new_val); + val = new_val; + } + else + val = data; + + if ( val & ~size_mask ) + { + printk_pdev(pdev, XENLOG_ERR, + "Offset 0x%04x:0x%04x expands past register size(%d)!\n", + offset, val, reg->handler->size); + return -EINVAL; + } + + reg->val.dword = val; + } + list_add_tail(®->entries, &group->registers); + + return 0; +} + +static struct hvm_pt_handler_init *hwdom_pt_handlers[] = { +}; + +int hwdom_add_device(struct pci_dev *pdev) +{ + struct domain *d = pdev->domain; + struct hvm_pt_device *dev; + int j, i, rc; + + ASSERT( is_hardware_domain(d) ); + ASSERT( pcidevs_locked() ); + + dev = xmalloc(struct hvm_pt_device); + if ( dev == NULL ) + return -ENOMEM; + + memset(dev, 0 , sizeof(*dev)); + + dev->pdev = pdev; + INIT_LIST_HEAD(&dev->register_groups); + + dev->permissive = opt_dom0permissive; + + for ( j = 0; j < ARRAY_SIZE(hwdom_pt_handlers); j++ ) + { + struct hvm_pt_handler_init *handler_init = hwdom_pt_handlers[j]; + struct hvm_pt_reg_group *group; + + group = xmalloc(struct hvm_pt_reg_group); + if ( group == NULL ) + { + xfree(dev); + return -ENOMEM; + } + INIT_LIST_HEAD(&group->registers); + + rc = handler_init->init(dev, group); + if ( rc == 0 ) + { + for ( i = 0; handler_init->handlers[i].size != 0; i++ ) + { + int rc; + + rc = hvm_pt_add_register(dev, group, + &handler_init->handlers[i]); + if ( rc ) + { + printk_pdev(pdev, XENLOG_ERR, "error adding register: %d\n", + rc); + hvm_pt_free_device(dev); + return rc; + } + } + + list_add_tail(&group->entries, &dev->register_groups); + } + else + xfree(group); + } + + write_lock(&d->arch.hvm_domain.pt_lock); + list_add_tail(&dev->entries, &d->arch.hvm_domain.pt_devices); + write_unlock(&d->arch.hvm_domain.pt_lock); + printk_pdev(pdev, XENLOG_DEBUG, "added for pass-through\n"); + + return 0; +} + static const struct hvm_io_ops dpci_portio_ops = { .accept = dpci_portio_accept, .read = dpci_portio_read, diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h index f34d784..1b1a52f 100644 --- a/xen/include/asm-x86/hvm/domain.h +++ b/xen/include/asm-x86/hvm/domain.h @@ -152,6 +152,10 @@ struct hvm_domain { struct vmx_domain vmx; struct svm_domain svm; }; + + /* List of passed-through devices (hw domain only). */ + struct list_head pt_devices; + rwlock_t pt_lock; }; #define hap_enabled(d) ((d)->arch.hvm_domain.hap_enabled) diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h index e9b3f83..80f830d 100644 --- a/xen/include/asm-x86/hvm/io.h +++ b/xen/include/asm-x86/hvm/io.h @@ -153,6 +153,182 @@ extern void hvm_dpci_msi_eoi(struct domain *d, int vector); void register_dpci_portio_handler(struct domain *d); +/* Structures for pci-passthrough state and handlers. */ +struct hvm_pt_device; +struct hvm_pt_reg_handler; +struct hvm_pt_reg; +struct hvm_pt_reg_group; + +/* Return code when register should be ignored. 
*/ +#define HVM_PT_INVALID_REG 0xFFFFFFFF + +/* function type for config reg */ +typedef int (*hvm_pt_conf_reg_init) + (struct hvm_pt_device *, struct hvm_pt_reg_handler *, uint32_t real_offset, + uint32_t *data); + +typedef int (*hvm_pt_conf_dword_write) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint32_t *val, uint32_t dev_value, uint32_t valid_mask); +typedef int (*hvm_pt_conf_word_write) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint16_t *val, uint16_t dev_value, uint16_t valid_mask); +typedef int (*hvm_pt_conf_byte_write) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint8_t *val, uint8_t dev_value, uint8_t valid_mask); +typedef int (*hvm_pt_conf_dword_read) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint32_t *val, uint32_t valid_mask); +typedef int (*hvm_pt_conf_word_read) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint16_t *val, uint16_t valid_mask); +typedef int (*hvm_pt_conf_byte_read) + (struct hvm_pt_device *, struct hvm_pt_reg *cfg_entry, + uint8_t *val, uint8_t valid_mask); + +typedef int (*hvm_pt_group_init) + (struct hvm_pt_device *, struct hvm_pt_reg_group *); + +/* + * Emulated register information. + * + * This should be shared between all the consumers that trap on accesses + * to certain PCI registers. + */ +struct hvm_pt_reg_handler { + uint32_t offset; + uint32_t size; + uint32_t init_val; + /* reg reserved field mask (ON:reserved, OFF:defined) */ + uint32_t res_mask; + /* reg read only field mask (ON:RO/ROS, OFF:other) */ + uint32_t ro_mask; + /* reg read/write-1-clear field mask (ON:RW1C/RW1CS, OFF:other) */ + uint32_t rw1c_mask; + /* reg emulate field mask (ON:emu, OFF:passthrough) */ + uint32_t emu_mask; + hvm_pt_conf_reg_init init; + /* read/write function pointer + * for double_word/word/byte size */ + union { + struct { + hvm_pt_conf_dword_write write; + hvm_pt_conf_dword_read read; + } dw; + struct { + hvm_pt_conf_word_write write; + hvm_pt_conf_word_read read; + } w; + struct { + hvm_pt_conf_byte_write write; + hvm_pt_conf_byte_read read; + } b; + } u; +}; + +struct hvm_pt_handler_init { + struct hvm_pt_reg_handler *handlers; + hvm_pt_group_init init; +}; + +/* + * Emulated register value. + * + * This is the representation of each specific emulated register. + */ +struct hvm_pt_reg { + struct list_head entries; + struct hvm_pt_reg_handler *handler; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + } val; +}; + +/* + * Emulated register group. + * + * In order to speed up (and logically group) emulated registers search, + * groups are used that represent specific emulated features, like MSI. + */ +struct hvm_pt_reg_group { + struct list_head entries; + uint32_t base_offset; + uint8_t size; + struct list_head registers; +}; + +/* + * Guest MSI information. + * + * MSI values set by the guest. + */ +struct hvm_pt_msi { + uint16_t flags; + uint32_t addr_lo; /* guest message address */ + uint32_t addr_hi; /* guest message upper address */ + uint16_t data; /* guest message data */ + uint32_t ctrl_offset; /* saved control offset */ + int pirq; /* guest pirq corresponding */ + bool_t initialized; /* when guest MSI is initialized */ + bool_t mapped; /* when pirq is mapped */ +}; + +/* + * Guest passed-through PCI device. + */ +struct hvm_pt_device { + struct list_head entries; + + struct pci_dev *pdev; + + bool_t permissive; + bool_t permissive_warned; + + /* MSI status. 
*/ + struct hvm_pt_msi msi; + + struct list_head register_groups; +}; + +/* + * The hierarchy of the above structures is the following: + * + * +---------------+ +---------------+ + * | | entries | | ... + * | hvm_pt_device +---------+ hvm_pt_device +----+ + * | | | | + * +-+-------------+ +---------------+ + * | + * | register_groups + * | + * +-v----------------+ +------------------+ + * | | entries | | ... + * | hvm_pt_reg_group +----------+ hvm_pt_reg_group +----+ + * | | | | + * +-+----------------+ +------------------+ + * | + * | registers + * | + * +-v----------+ +------------+ + * | | entries | | ... + * | hvm_pt_reg +------------+ hvm_pt_reg +----+ + * | | | | + * +-+----------+ +-+----------+ + * | | + * | handler | handler + * | | + * +-v------------------+ +-v------------------+ + * | | | | + * | hvm_pt_reg_handler | | hvm_pt_reg_handler | + * | | | | + * +--------------------+ +--------------------+ + */ + +/* Helper to add passed-through devices to the hardware domain. */ +int hwdom_add_device(struct pci_dev *pdev); + #endif /* __ASM_X86_HVM_IO_H__ */ diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h index f191773..b21a891 100644 --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -90,6 +90,11 @@ struct pci_dev { u64 vf_rlen[6]; }; +/* Helper for printing pci_dev related messages. */ +#define printk_pdev(pdev, lvl, fmt, ...) \ + printk(lvl "PCI %04x:%02x:%02x.%u: " fmt, pdev->seg, pdev->bus, \ + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), ##__VA_ARGS__) + #define for_each_pdev(domain, pdev) \ list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list)
Most of this code has been picked up from QEMU and modified so it can be plugged into the internal Xen IO handlers. The structure of the handlers has been kept quite similar to QEMU, so existing handlers can be imported without a lot of effort. Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> --- Cc: Jan Beulich <jbeulich@suse.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: Paul Durrant <paul.durrant@citrix.com> --- docs/misc/xen-command-line.markdown | 8 + xen/arch/x86/hvm/hvm.c | 2 + xen/arch/x86/hvm/io.c | 621 ++++++++++++++++++++++++++++++++++++ xen/include/asm-x86/hvm/domain.h | 4 + xen/include/asm-x86/hvm/io.h | 176 ++++++++++ xen/include/xen/pci.h | 5 + 6 files changed, 816 insertions(+)
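As a reading aid for the structures declared in hvm/io.h above: hwdom_pt_handlers[] is left empty by this patch and is meant to be filled by follow-up patches importing concrete QEMU handlers. A purely hypothetical group (names, offsets and masks are illustrative, not taken from the series) would be wired up roughly as follows, with the handler table ending in a zero-sized sentinel as hwdom_add_device() expects:

/* Hypothetical register init: start from the declared reset value. */
static int example_reg_init(struct hvm_pt_device *dev,
                            struct hvm_pt_reg_handler *handler,
                            uint32_t real_offset, uint32_t *data)
{
    *data = handler->init_val;
    return 0;
}

/* Hypothetical word-sized read: merge the emulated bits over the host
 * value already present in *val, limited to the bits being accessed. */
static int example_cmd_read(struct hvm_pt_device *dev, struct hvm_pt_reg *reg,
                            uint16_t *val, uint16_t valid_mask)
{
    uint16_t valid_emu_mask = reg->handler->emu_mask & valid_mask;

    *val = (*val & ~valid_emu_mask) | (reg->val.word & valid_emu_mask);
    return 0;
}

static struct hvm_pt_reg_handler example_handlers[] = {
    {
        .offset   = 0x04,       /* PCI command register, as an example */
        .size     = 2,
        .init_val = 0x0000,
        .emu_mask = 0x0400,     /* emulate only the INTx disable bit here */
        .init     = example_reg_init,
        .u.w.read = example_cmd_read,
    },
    /* Zero-sized sentinel terminates the table (see hwdom_add_device()). */
    { .size = 0 },
};

static int example_group_init(struct hvm_pt_device *dev,
                              struct hvm_pt_reg_group *group)
{
    /* This group covers the standard configuration header. */
    group->base_offset = 0x00;
    group->size = 0x40;
    return 0;
}

static struct hvm_pt_handler_init example_group = {
    .handlers = example_handlers,
    .init     = example_group_init,
};

/*
 * It would then be referenced from the (currently empty) table in io.c:
 *
 *     static struct hvm_pt_handler_init *hwdom_pt_handlers[] = {
 *         &example_group,
 *     };
 */

Each registered group then has its registers instantiated by hvm_pt_add_register(), and hw_dpci_portio_read/write dispatch CF8/CFC accesses to them through hvm_pt_find_reg_grp() and hvm_pt_find_reg().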