Message ID | 20161030135931.1a2adc95@t450s.home (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 2016/10/31 3:59, Alex Williamson wrote: > On Wed, 26 Oct 2016 16:56:13 +0800 > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote: > >> Now the kernel commit 05f0c03fbac1 ("vfio-pci: Allow to mmap >> sub-page MMIO BARs if the mmio page is exclusive") allows VFIO >> to mmap sub-page BARs. This is the corresponding QEMU patch. >> With those patches applied, we could passthrough sub-page BARs >> to guest, which can help to improve IO performance for some devices. >> >> In this patch, we expand MemoryRegions of these sub-page >> MMIO BARs to PAGE_SIZE in vfio_pci_write_config(), so that >> the BARs could be passed to KVM ioctl KVM_SET_USER_MEMORY_REGION >> with a valid size. The expanding size will be recovered when >> the base address of sub-page BAR is changed and not page aligned >> any more in guest. And we also set the priority of these BARs' >> memory regions to zero in case of overlap with BARs which share >> the same page with sub-page BARs in guest. >> >> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com> >> --- >> hw/vfio/common.c | 3 +-- >> hw/vfio/pci.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 78 insertions(+), 2 deletions(-) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 9505fb3..6e48f98 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -662,8 +662,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, >> region, name, region->size); >> >> if (!vbasedev->no_mmap && >> - region->flags & VFIO_REGION_INFO_FLAG_MMAP && >> - !(region->size & ~qemu_real_host_page_mask)) { >> + region->flags & VFIO_REGION_INFO_FLAG_MMAP) { >> >> vfio_setup_region_sparse_mmaps(region, info); >> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index 65d30fd..0516e62 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -1071,6 +1071,65 @@ static const MemoryRegionOps vfio_vga_ops = { >> }; >> >> /* >> + * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page >> + * 
size if the BAR is in an exclusive page in host so that we could map >> + * this BAR to guest. But this sub-page BAR may not occupy an exclusive >> + * page in guest. So we should set the priority of the expanded memory >> + * region to zero in case of overlap with BARs which share the same page >> + * with the sub-page BAR in guest. Besides, we should also recover the >> + * size of this sub-page BAR when its base address is changed in guest >> + * and not page aligned any more. >> + */ >> +static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) >> +{ >> + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); >> + VFIORegion *region = &vdev->bars[bar].region; >> + MemoryRegion *mmap_mr, *mr; >> + PCIIORegion *r; >> + pcibus_t bar_addr; >> + >> + /* Make sure that the whole region is allowed to be mmapped */ >> + if (!(region->nr_mmaps == 1 && >> + region->mmaps[0].size == region->size)) { > Isn't region->size equal to the BAR size, which is less than > PAGE_SIZE? Doesn't that mean that the mmap(2) was performed with a > length val less than a page size? Isn't that going to fail? We could > also test mmaps[0].mmap here instead of burying it below. The mmap(2) will allow passing a length val less than PAGE_SIZE. In do_mmap(), it will set len = PAGE_ALIGN(len) at first. >> + return; >> + } >> + >> + r = &pdev->io_regions[bar]; >> + bar_addr = r->addr; >> + if (bar_addr == PCI_BAR_UNMAPPED) { >> + return; >> + } > See below, why not reset sizes on this case? You are right. We should also reset sizes on this case. 
>> + >> + mr = region->mem; >> + mmap_mr = ®ion->mmaps[0].mem; >> + memory_region_transaction_begin(); >> + if (memory_region_size(mr) < qemu_real_host_page_size) { >> + if (!(bar_addr & ~qemu_real_host_page_mask) && >> + memory_region_is_mapped(mr) && region->mmaps[0].mmap) { >> + /* Expand memory region to page size and set priority */ >> + memory_region_del_subregion(r->address_space, mr); >> + memory_region_set_size(mr, qemu_real_host_page_size); >> + memory_region_set_size(mmap_mr, qemu_real_host_page_size); >> + memory_region_add_subregion_overlap(r->address_space, >> + bar_addr, mr, 0); >> + } >> + } else { >> + /* This case would happen when guest rescan one PCI device */ >> + if (bar_addr & ~qemu_real_host_page_mask) { >> + /* Recover the size of memory region */ >> + memory_region_set_size(mr, r->size); >> + memory_region_set_size(mmap_mr, r->size); >> + } else if (memory_region_is_mapped(mr)) { >> + /* Set the priority of memory region to zero */ >> + memory_region_del_subregion(r->address_space, mr); >> + memory_region_add_subregion_overlap(r->address_space, >> + bar_addr, mr, 0); >> + } >> + } >> + memory_region_transaction_commit(); > > The logic flow here is confusing to me, too many cases that presume the > previous state of the device. Can't we *always* set the memory region > sizes based on the value written to the BAR? And can't we *always* > re-prioritize regions when we have a mapped MemoryRegion for which > we've modified the size? 
Something like below (untested) makes a lot > more sense to me: > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 03e3c94..d7dbe0e 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -1087,45 +1087,35 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) > MemoryRegion *mmap_mr, *mr; > PCIIORegion *r; > pcibus_t bar_addr; > + uint64_t size = region->size; > > /* Make sure that the whole region is allowed to be mmapped */ > - if (!(region->nr_mmaps == 1 && > - region->mmaps[0].size == region->size)) { > + if (region->nr_mmaps != 1 || !region->mmaps[0].mmap || > + region->mmaps[0].size != region->size) { > return; > } > r = &pdev->io_regions[bar]; > bar_addr = r->addr; > - if (bar_addr == PCI_BAR_UNMAPPED) { > - return; > - } > - > mr = region->mem; > mmap_mr = ®ion->mmaps[0].mem; > + > + /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */ > + if (bar_addr != PCI_BAR_UNMAPPED && > + !(bar_addr & ~qemu_real_host_page_mask)) { > + size = qemu_real_host_page_size; > + } > + > memory_region_transaction_begin(); > - if (memory_region_size(mr) < qemu_real_host_page_size) { > - if (!(bar_addr & ~qemu_real_host_page_mask) && > - memory_region_is_mapped(mr) && region->mmaps[0].mmap) { > - /* Expand memory region to page size and set priority */ > - memory_region_del_subregion(r->address_space, mr); > - memory_region_set_size(mr, qemu_real_host_page_size); > - memory_region_set_size(mmap_mr, qemu_real_host_page_size); > - memory_region_add_subregion_overlap(r->address_space, > - bar_addr, mr, 0); > - } > - } else { > - /* This case would happen when guest rescan one PCI device */ > - if (bar_addr & ~qemu_real_host_page_mask) { > - /* Recover the size of memory region */ > - memory_region_set_size(mr, r->size); > - memory_region_set_size(mmap_mr, r->size); > - } else if (memory_region_is_mapped(mr)) { > - /* Set the priority of memory region to zero */ > - memory_region_del_subregion(r->address_space, mr); > - 
memory_region_add_subregion_overlap(r->address_space, > - bar_addr, mr, 0); > - } > + > + memory_region_set_size(mr, size); > + memory_region_set_size(mmap_mr, size); > + if (size != region->size && memory_region_is_mapped(mr)) { > + memory_region_del_subregion(r->address_space, mr); > + memory_region_add_subregion_overlap(r->address_space, > + bar_addr, mr, 0); > } > + > memory_region_transaction_commit(); > } > > > Please note that QEMU 2.8 goes into freeze on Nov-1, we need to rev > quickly to get this in. Thanks, > > Alex This code looks good. It's clearer. Thank you. I'll test it and send the new patch as soon as possible. Regards, Yongji -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 03e3c94..d7dbe0e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1087,45 +1087,35 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) MemoryRegion *mmap_mr, *mr; PCIIORegion *r; pcibus_t bar_addr; + uint64_t size = region->size; /* Make sure that the whole region is allowed to be mmapped */ - if (!(region->nr_mmaps == 1 && - region->mmaps[0].size == region->size)) { + if (region->nr_mmaps != 1 || !region->mmaps[0].mmap || + region->mmaps[0].size != region->size) { return; } r = &pdev->io_regions[bar]; bar_addr = r->addr; - if (bar_addr == PCI_BAR_UNMAPPED) { - return; - } - mr = region->mem; mmap_mr = ®ion->mmaps[0].mem; + + /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */ + if (bar_addr != PCI_BAR_UNMAPPED && + !(bar_addr & ~qemu_real_host_page_mask)) { + size = qemu_real_host_page_size; + } + memory_region_transaction_begin(); - if (memory_region_size(mr) < qemu_real_host_page_size) { - if (!(bar_addr & ~qemu_real_host_page_mask) && - memory_region_is_mapped(mr) && region->mmaps[0].mmap) { - /* Expand memory region to page size and set priority */ - memory_region_del_subregion(r->address_space, mr); - memory_region_set_size(mr, qemu_real_host_page_size); - memory_region_set_size(mmap_mr, qemu_real_host_page_size); - memory_region_add_subregion_overlap(r->address_space, - bar_addr, mr, 0); - } - } else { - /* This case would happen when guest rescan one PCI device */ - if (bar_addr & ~qemu_real_host_page_mask) { - /* Recover the size of memory region */ - memory_region_set_size(mr, r->size); - memory_region_set_size(mmap_mr, r->size); - } else if (memory_region_is_mapped(mr)) { - /* Set the priority of memory region to zero */ - memory_region_del_subregion(r->address_space, mr); - memory_region_add_subregion_overlap(r->address_space, - bar_addr, mr, 0); - } + + memory_region_set_size(mr, size); + memory_region_set_size(mmap_mr, size); + if (size != region->size && 
memory_region_is_mapped(mr)) { + memory_region_del_subregion(r->address_space, mr); + memory_region_add_subregion_overlap(r->address_space, + bar_addr, mr, 0); } + memory_region_transaction_commit(); }