Message ID | 1375292732-7627-7-git-send-email-stefano.stabellini@eu.citrix.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, 31 Jul 2013, Stefano Stabellini wrote: > Support autotranslate guests in swiotlb-xen by keeping track of the > phys-to-bus and bus-to-phys mappings of the swiotlb buffer > (xen_io_tlb_start-xen_io_tlb_end). > > Use a simple direct access on a pre-allocated array for phys-to-bus > queries. Use a red-black tree for bus-to-phys queries. > > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> > CC: david.vrabel@citrix.com > --- > drivers/xen/swiotlb-xen.c | 127 +++++++++++++++++++++++++++++++++++++++------ > 1 files changed, 111 insertions(+), 16 deletions(-) > > diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c > index 353f013..c79ac88 100644 > --- a/drivers/xen/swiotlb-xen.c > +++ b/drivers/xen/swiotlb-xen.c > @@ -38,32 +38,116 @@ > #include <linux/bootmem.h> > #include <linux/dma-mapping.h> > #include <linux/export.h> > +#include <linux/slab.h> > +#include <linux/spinlock_types.h> > +#include <linux/rbtree.h> > #include <xen/swiotlb-xen.h> > #include <xen/page.h> > #include <xen/xen-ops.h> > #include <xen/hvc-console.h> > +#include <xen/features.h> > /* > * Used to do a quick range check in swiotlb_tbl_unmap_single and > * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this > * API. > */ > > +#define NR_DMA_SEGS ((xen_io_tlb_nslabs + IO_TLB_SEGSIZE - 1) / IO_TLB_SEGSIZE) > static char *xen_io_tlb_start, *xen_io_tlb_end; > static unsigned long xen_io_tlb_nslabs; > /* > * Quick lookup value of the bus address of the IOTLB. > */ > > -static u64 start_dma_addr; > +struct xen_dma{ > + dma_addr_t dma_addr; > + phys_addr_t phys_addr; > + size_t size; > + struct rb_node rbnode; > +}; > + > +static struct xen_dma *xen_dma_seg; > +static struct rb_root bus_to_phys = RB_ROOT; > +static DEFINE_SPINLOCK(xen_dma_lock); > + > +static void xen_dma_insert(struct xen_dma *entry) > +{ > + struct rb_node **link = &bus_to_phys.rb_node; > + struct rb_node *parent = NULL; > + struct xen_dma *e; > + > + spin_lock(&xen_dma_lock); > + > + while (*link) { > + parent = *link; > + e = rb_entry(parent, struct xen_dma, rbnode); > + > + WARN_ON(entry->dma_addr == e->dma_addr); > + > + if (entry->dma_addr < e->dma_addr) > + link = &(*link)->rb_left; > + else > + link = &(*link)->rb_right; > + } > + rb_link_node(&entry->rbnode, parent, link); > + rb_insert_color(&entry->rbnode, &bus_to_phys); > + > + spin_unlock(&xen_dma_lock); > +} > + > +static struct xen_dma *xen_dma_retrieve(dma_addr_t dma_addr) > +{ > + struct rb_node *n = bus_to_phys.rb_node; > + struct xen_dma *e; > + > + spin_lock(&xen_dma_lock); > + > + while (n) { > + e = rb_entry(n, struct xen_dma, rbnode); > + if (e->dma_addr <= dma_addr && e->dma_addr + e->size > dma_addr) { > + spin_unlock(&xen_dma_lock); > + return e; > + } > + if (dma_addr < e->dma_addr) > + n = n->rb_left; > + else > + n = n->rb_right; > + } > + > + spin_unlock(&xen_dma_lock); > + return NULL; > +} > > static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) > { > - return phys_to_machine(XPADDR(paddr)).maddr; > + int nr_seg; > + unsigned long offset; > + char* vaddr; > + > + if (!xen_feature(XENFEAT_auto_translated_physmap)) > + return phys_to_machine(XPADDR(paddr)).maddr; > + > + vaddr = (char *) phys_to_virt(paddr); > + if (vaddr >= xen_io_tlb_end || vaddr < xen_io_tlb_start) > + return ~0; > + > + offset = vaddr - xen_io_tlb_start; > + nr_seg = offset / (IO_TLB_SEGSIZE << IO_TLB_SHIFT); > + > + return xen_dma_seg[nr_seg].dma_addr + (paddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); I have just realized that there is a much better way of doing this: return xen_dma_seg[nr_seg].dma_addr + (paddr - xen_dma_seg[nr_seg].phys_addr); > } > > static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) > { > - return machine_to_phys(XMADDR(baddr)).paddr; > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + { > + struct xen_dma *dma = xen_dma_retrieve(baddr); > + if (dma == NULL) > + return ~0; > + else > + return dma->phys_addr + (baddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); same here: return dma->phys_addr + (baddr - dma->dma_addr);
On Wed, Jul 31, 2013 at 06:45:31PM +0100, Stefano Stabellini wrote: > Support autotranslate guests in swiotlb-xen by keeping track of the > phys-to-bus and bus-to-phys mappings of the swiotlb buffer > (xen_io_tlb_start-xen_io_tlb_end). > > Use a simple direct access on a pre-allocated array for phys-to-bus > queries. Use a red-black tree for bus-to-phys queries. > > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> > CC: david.vrabel@citrix.com > --- > drivers/xen/swiotlb-xen.c | 127 +++++++++++++++++++++++++++++++++++++++------ > 1 files changed, 111 insertions(+), 16 deletions(-) > > diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c > index 353f013..c79ac88 100644 > --- a/drivers/xen/swiotlb-xen.c > +++ b/drivers/xen/swiotlb-xen.c > @@ -38,32 +38,116 @@ > #include <linux/bootmem.h> > #include <linux/dma-mapping.h> > #include <linux/export.h> > +#include <linux/slab.h> > +#include <linux/spinlock_types.h> > +#include <linux/rbtree.h> > #include <xen/swiotlb-xen.h> > #include <xen/page.h> > #include <xen/xen-ops.h> > #include <xen/hvc-console.h> > +#include <xen/features.h> > /* > * Used to do a quick range check in swiotlb_tbl_unmap_single and > * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this > * API. > */ > > +#define NR_DMA_SEGS ((xen_io_tlb_nslabs + IO_TLB_SEGSIZE - 1) / IO_TLB_SEGSIZE) > static char *xen_io_tlb_start, *xen_io_tlb_end; > static unsigned long xen_io_tlb_nslabs; > /* > * Quick lookup value of the bus address of the IOTLB. > */ > > -static u64 start_dma_addr; > +struct xen_dma{ ^ Ahem! I think scripts/checkpath.pl would tell you about. You did run checkpath.pl right :-) The name is very generic. Xen DMA. Brings a lot of ideas around but none of them to do with tracking a tuple. Perhaps a name of 'xen_dma_entry'? 'xen_dma_node' ? 'xen_dma_info' ? > + dma_addr_t dma_addr; > + phys_addr_t phys_addr; > + size_t size; > + struct rb_node rbnode; > +}; > + > +static struct xen_dma *xen_dma_seg; > +static struct rb_root bus_to_phys = RB_ROOT; That needs a bit of explanation of what this tree is suppose to do. I have an idea, but it would be good to have it right there explained. > +static DEFINE_SPINLOCK(xen_dma_lock); And this name should also change. And you need a comment explainig what this protects please. > + > +static void xen_dma_insert(struct xen_dma *entry) A better name for the function please. Sounds like a IOMMU operation at first glance (as in inserting an phys and dma addr in the IOMMU context). But that is not what this does. It just adds an entry to a tree. Perhaps 'xen_dma_add_entry' ? > +{ > + struct rb_node **link = &bus_to_phys.rb_node; > + struct rb_node *parent = NULL; > + struct xen_dma *e; 'e' ? eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee? Ah, 'entry'!!! Perhaps 'entry'? Oh wait, already taken. How about you call the one that is passed in as 'new' and this one as 'entry'? > + > + spin_lock(&xen_dma_lock); > + > + while (*link) { > + parent = *link; > + e = rb_entry(parent, struct xen_dma, rbnode); > + > + WARN_ON(entry->dma_addr == e->dma_addr); You probably also want to print out the physical address of both of them? And do you really want to continue if the physical address for the two entries is different? That would mean you lose one of them when the entry is being deleted from it. Perhaps you should just return -EINVAL? > + > + if (entry->dma_addr < e->dma_addr) > + link = &(*link)->rb_left; > + else > + link = &(*link)->rb_right; > + } > + rb_link_node(&entry->rbnode, parent, link); > + rb_insert_color(&entry->rbnode, &bus_to_phys); > + > + spin_unlock(&xen_dma_lock); How come we don't return 0 here? > +} > + > +static struct xen_dma *xen_dma_retrieve(dma_addr_t dma_addr) get? > +{ > + struct rb_node *n = bus_to_phys.rb_node; > + struct xen_dma *e; > + Stray spaces, and 'eeeeeeeeeeeee' strikes again :-) > + spin_lock(&xen_dma_lock); > + > + while (n) { > + e = rb_entry(n, struct xen_dma, rbnode); > + if (e->dma_addr <= dma_addr && e->dma_addr + e->size > dma_addr) { Shouldn't this check be for '==' not '<=' ? Hm, maybe I am not sure what this tree is suppose to do. Somehow I thought it was to keep a consistent mapping of phys and dma addresses. But this seems to be more of 'free' phys and dma addresses. In which case I think you need to change the name of the rb root node to have the word 'free' in it. > + spin_unlock(&xen_dma_lock); > + return e; > + } > + if (dma_addr < e->dma_addr) > + n = n->rb_left; > + else > + n = n->rb_right; > + } > + > + spin_unlock(&xen_dma_lock); > + return NULL; > +} > > static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) > { > - return phys_to_machine(XPADDR(paddr)).maddr; > + int nr_seg; > + unsigned long offset; > + char* vaddr; Ahem! > + > + if (!xen_feature(XENFEAT_auto_translated_physmap)) > + return phys_to_machine(XPADDR(paddr)).maddr; > + > + vaddr = (char *) phys_to_virt(paddr); Ahem! > + if (vaddr >= xen_io_tlb_end || vaddr < xen_io_tlb_start) > + return ~0; No no no. Please please use a proper #define. > + > + offset = vaddr - xen_io_tlb_start; > + nr_seg = offset / (IO_TLB_SEGSIZE << IO_TLB_SHIFT); > + > + return xen_dma_seg[nr_seg].dma_addr + (paddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); > } > > static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) > { > - return machine_to_phys(XMADDR(baddr)).paddr; > + if (xen_feature(XENFEAT_auto_translated_physmap)) I am curios to how this will work with PVH x86 which is auto_translated_physmap, but maps the whole kernel in its IOMMU context. Perhaps we should have some extra logic here? For guests that have IOMMU support and for those that don't? B/c in some way this could be used on x86 with VT-x + without an VT-d and PVH for dom0 I think. > + { > + struct xen_dma *dma = xen_dma_retrieve(baddr); > + if (dma == NULL) > + return ~0; I think you know what I am going to say. > + else > + return dma->phys_addr + (baddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); > + } else > + return machine_to_phys(XMADDR(baddr)).paddr; > } > > static dma_addr_t xen_virt_to_bus(void *address) > @@ -107,6 +191,9 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) > unsigned long pfn = mfn_to_local_pfn(mfn); > phys_addr_t paddr; > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > + return 1; > + That does not look right to me. I think this would make PVH go bonk. > /* If the address is outside our domain, it CAN > * have the same virtual address as another address > * in our domain. Therefore _only_ check address within our domain. > @@ -124,13 +211,12 @@ static int max_dma_bits = 32; > static int > xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) > { > - int i, rc; > + int i, j, rc; > int dma_bits; > - dma_addr_t dma_handle; > > dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; > > - i = 0; > + i = j = 0; > do { > int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); > > @@ -138,12 +224,16 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) > rc = xen_create_contiguous_region( > (unsigned long)buf + (i << IO_TLB_SHIFT), > get_order(slabs << IO_TLB_SHIFT), > - dma_bits, &dma_handle); > + dma_bits, &xen_dma_seg[j].dma_addr); > + xen_dma_seg[j].phys_addr = virt_to_phys(buf + (i << IO_TLB_SHIFT)); > + xen_dma_seg[j].size = slabs << IO_TLB_SHIFT; > + xen_dma_insert(&xen_dma_seg[j]); > } while (rc && dma_bits++ < max_dma_bits); > if (rc) > return rc; > > i += slabs; > + j++; > } while (i < nslabs); > return 0; > } > @@ -193,9 +283,10 @@ retry: > /* > * Get IO TLB memory from any location. > */ > - if (early) > + if (early) { > xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); > - else { > + xen_dma_seg = alloc_bootmem(sizeof(struct xen_dma) * NR_DMA_SEGS); > + } else { > #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) > #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) > while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { > @@ -210,6 +301,7 @@ retry: > xen_io_tlb_nslabs = SLABS_PER_PAGE << order; > bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; > } > + xen_dma_seg = kzalloc(sizeof(struct xen_dma) * NR_DMA_SEGS, GFP_KERNEL); > } > if (!xen_io_tlb_start) { > m_ret = XEN_SWIOTLB_ENOMEM; > @@ -232,7 +324,6 @@ retry: > m_ret = XEN_SWIOTLB_EFIXUP; > goto error; > } > - start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); > if (early) { > if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, > verbose)) > @@ -290,7 +381,8 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, > > phys = virt_to_phys(ret); > dev_addr = xen_phys_to_bus(phys); > - if (((dev_addr + size - 1 <= dma_mask)) && > + if (!xen_feature(XENFEAT_auto_translated_physmap) && > + ((dev_addr + size - 1 <= dma_mask)) && > !range_straddles_page_boundary(phys, size)) > *dma_handle = dev_addr; > else { > @@ -321,8 +413,9 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, > > phys = virt_to_phys(vaddr); > > - if (((dev_addr + size - 1 > dma_mask)) || > - range_straddles_page_boundary(phys, size)) > + if (xen_feature(XENFEAT_auto_translated_physmap) || > + (((dev_addr + size - 1 > dma_mask)) || > + range_straddles_page_boundary(phys, size))) > xen_destroy_contiguous_region((unsigned long)vaddr, order); > > free_pages((unsigned long)vaddr, order); > @@ -351,14 +444,15 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, > * we can safely return the device addr and not worry about bounce > * buffering it. > */ > - if (dma_capable(dev, dev_addr, size) && > + if (!xen_feature(XENFEAT_auto_translated_physmap) && > + dma_capable(dev, dev_addr, size) && > !range_straddles_page_boundary(phys, size) && !swiotlb_force) > return dev_addr; I am returning back to PVH on these and just not sure what the impact is. Mukesh, thoughts? > > /* > * Oh well, have to allocate and map a bounce buffer. > */ > - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); > + map = swiotlb_tbl_map_single(dev, xen_dma_seg[0].dma_addr, phys, size, dir); At 0? Not some index? > if (map == SWIOTLB_MAP_ERROR) > return DMA_ERROR_CODE; > > @@ -494,10 +588,11 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, > dma_addr_t dev_addr = xen_phys_to_bus(paddr); > > if (swiotlb_force || > + xen_feature(XENFEAT_auto_translated_physmap) || > !dma_capable(hwdev, dev_addr, sg->length) || > range_straddles_page_boundary(paddr, sg->length)) { > phys_addr_t map = swiotlb_tbl_map_single(hwdev, > - start_dma_addr, > + xen_dma_seg[0].dma_addr, I am not sure I understand the purpose of 'xen_dma_seg'. Could you clarify that please?
On Fri, 2 Aug 2013, Konrad Rzeszutek Wilk wrote: > On Wed, Jul 31, 2013 at 06:45:31PM +0100, Stefano Stabellini wrote: > > Support autotranslate guests in swiotlb-xen by keeping track of the > > phys-to-bus and bus-to-phys mappings of the swiotlb buffer > > (xen_io_tlb_start-xen_io_tlb_end). > > > > Use a simple direct access on a pre-allocated array for phys-to-bus > > queries. Use a red-black tree for bus-to-phys queries. > > > > Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> > > CC: david.vrabel@citrix.com > > --- > > drivers/xen/swiotlb-xen.c | 127 +++++++++++++++++++++++++++++++++++++++------ > > 1 files changed, 111 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c > > index 353f013..c79ac88 100644 > > --- a/drivers/xen/swiotlb-xen.c > > +++ b/drivers/xen/swiotlb-xen.c > > @@ -38,32 +38,116 @@ > > #include <linux/bootmem.h> > > #include <linux/dma-mapping.h> > > #include <linux/export.h> > > +#include <linux/slab.h> > > +#include <linux/spinlock_types.h> > > +#include <linux/rbtree.h> > > #include <xen/swiotlb-xen.h> > > #include <xen/page.h> > > #include <xen/xen-ops.h> > > #include <xen/hvc-console.h> > > +#include <xen/features.h> > > /* > > * Used to do a quick range check in swiotlb_tbl_unmap_single and > > * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this > > * API. > > */ > > > > +#define NR_DMA_SEGS ((xen_io_tlb_nslabs + IO_TLB_SEGSIZE - 1) / IO_TLB_SEGSIZE) > > static char *xen_io_tlb_start, *xen_io_tlb_end; > > static unsigned long xen_io_tlb_nslabs; > > /* > > * Quick lookup value of the bus address of the IOTLB. > > */ > > > > -static u64 start_dma_addr; > > +struct xen_dma{ > ^ > Ahem! I think scripts/checkpath.pl would tell you about. > > You did run checkpath.pl right :-) > > The name is very generic. Xen DMA. Brings a lot of ideas around but none > of them to do with tracking a tuple. Perhaps a name of 'xen_dma_entry'? > 'xen_dma_node' ? 'xen_dma_info' ? Thanks for the detailed review. I have made this and all the other changes that you mentioned, except the ones commented below. > > + spin_lock(&xen_dma_lock); > > + > > + while (n) { > > + e = rb_entry(n, struct xen_dma, rbnode); > > + if (e->dma_addr <= dma_addr && e->dma_addr + e->size > dma_addr) { > > Shouldn't this check be for '==' not '<=' ? Nope: e->dma_addr is the start address of one of the contiguous chucks of memory of size (IO_TLB_SEGSIZE << IO_TLB_SHIFT). However xen_swiotlb_map_page can be called asking for a smaller buffer. As a result xen_phys_to_bus is going to be called passing dma addresses that do not always match the beginning of a contiguous region. > Hm, maybe I am not sure what this tree is suppose to do. Somehow I > thought it was to keep a consistent mapping of phys and dma addresses. > But this seems to be more of 'free' phys and dma addresses. In which > case I think you need to change the name of the rb root node to have the > word 'free' in it. It keeps track of the bus to phys mappings. It doesn't matter whether the corresponding buffers are free or in use as long as the mapping is valid. > > + > > + offset = vaddr - xen_io_tlb_start; > > + nr_seg = offset / (IO_TLB_SEGSIZE << IO_TLB_SHIFT); > > + > > + return xen_dma_seg[nr_seg].dma_addr + (paddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); > > } > > > > static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) > > { > > - return machine_to_phys(XMADDR(baddr)).paddr; > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > I am curios to how this will work with PVH x86 which is > auto_translated_physmap, but maps the whole kernel in its IOMMU context. > > Perhaps we should have some extra logic here? For guests that have IOMMU > support and for those that don't? I would avoid using the swiotlb altogether if an IOMMU is available. Maybe we can pass a flag in xen_features. > B/c in some way this could be used on x86 with VT-x + without an VT-d > and PVH for dom0 I think. Right, it could be a good fit for that. We could spot whether Xen passes a certain flag in xen_features and enable the swiotlb if we need it. > > + else > > + return dma->phys_addr + (baddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); > > + } else > > + return machine_to_phys(XMADDR(baddr)).paddr; > > } > > > > static dma_addr_t xen_virt_to_bus(void *address) > > @@ -107,6 +191,9 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) > > unsigned long pfn = mfn_to_local_pfn(mfn); > > phys_addr_t paddr; > > > > + if (xen_feature(XENFEAT_auto_translated_physmap)) > > + return 1; > > + > > That does not look right to me. I think this would make PVH go bonk. On PVH if an IOMMU is available we would never initialize the swiotlb and we would never use this function. If no IOMMUs are available and we initialized the swiotlb, then this assumption is correct: all contiguous buffers used with xen_swiotlb_sync_single and xen_unmap_single are swiotlb bounce buffers. > > @@ -351,14 +444,15 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, > > * we can safely return the device addr and not worry about bounce > > * buffering it. > > */ > > - if (dma_capable(dev, dev_addr, size) && > > + if (!xen_feature(XENFEAT_auto_translated_physmap) && > > + dma_capable(dev, dev_addr, size) && > > !range_straddles_page_boundary(phys, size) && !swiotlb_force) > > return dev_addr; > > I am returning back to PVH on these and just not sure what the impact > is. Mukesh, thoughts? Same as before: any dma_capable and range_straddles_page_boundary checks on a random buffer are meaningless for XENFEAT_auto_translated_physmap guests (we don't know the p2m mapping of a given page and that mapping might be transient). It's best to avoid them altogether and take the slow path. It should have no impact for PVH today though: PVH guests should just assume that an IOMMU is present and avoid initializing the swiotlb for now. If we want to give them a backup option in case no IOMMU is available, then we can come back to this later. > > /* > > * Oh well, have to allocate and map a bounce buffer. > > */ > > - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); > > + map = swiotlb_tbl_map_single(dev, xen_dma_seg[0].dma_addr, phys, size, dir); > > At 0? Not some index? Right, I agree with you, I doesn't make sense to me either. However the code does exactly the same thing that was done before. It's just a naming change. > > if (map == SWIOTLB_MAP_ERROR) > > return DMA_ERROR_CODE; > > > > @@ -494,10 +588,11 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, > > dma_addr_t dev_addr = xen_phys_to_bus(paddr); > > > > if (swiotlb_force || > > + xen_feature(XENFEAT_auto_translated_physmap) || > > !dma_capable(hwdev, dev_addr, sg->length) || > > range_straddles_page_boundary(paddr, sg->length)) { > > phys_addr_t map = swiotlb_tbl_map_single(hwdev, > > - start_dma_addr, > > + xen_dma_seg[0].dma_addr, > > I am not sure I understand the purpose of 'xen_dma_seg'. Could you > clarify that please? xen_dma_seg is an array of struct xen_dma that keeps track of the phys to bus mappings. Each entry is (IO_TLB_SEGSIZE << IO_TLB_SHIFT) bytes, except for the last one that could be smaller. Getting the dma address corresponding to a physical address within xen_io_tlb_start - xen_io_tlb_end is just a matter of calculating the index in the array, see xen_phys_to_bus. Also the bus_to_phys tree is conveniently built upon these pre-allocated xen_dma_seg entries.
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 353f013..c79ac88 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -38,32 +38,116 @@ #include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <linux/export.h> +#include <linux/slab.h> +#include <linux/spinlock_types.h> +#include <linux/rbtree.h> #include <xen/swiotlb-xen.h> #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> +#include <xen/features.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#define NR_DMA_SEGS ((xen_io_tlb_nslabs + IO_TLB_SEGSIZE - 1) / IO_TLB_SEGSIZE) static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. */ -static u64 start_dma_addr; +struct xen_dma{ + dma_addr_t dma_addr; + phys_addr_t phys_addr; + size_t size; + struct rb_node rbnode; +}; + +static struct xen_dma *xen_dma_seg; +static struct rb_root bus_to_phys = RB_ROOT; +static DEFINE_SPINLOCK(xen_dma_lock); + +static void xen_dma_insert(struct xen_dma *entry) +{ + struct rb_node **link = &bus_to_phys.rb_node; + struct rb_node *parent = NULL; + struct xen_dma *e; + + spin_lock(&xen_dma_lock); + + while (*link) { + parent = *link; + e = rb_entry(parent, struct xen_dma, rbnode); + + WARN_ON(entry->dma_addr == e->dma_addr); + + if (entry->dma_addr < e->dma_addr) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, &bus_to_phys); + + spin_unlock(&xen_dma_lock); +} + +static struct xen_dma *xen_dma_retrieve(dma_addr_t dma_addr) +{ + struct rb_node *n = bus_to_phys.rb_node; + struct xen_dma *e; + + spin_lock(&xen_dma_lock); + + while (n) { + e = rb_entry(n, struct xen_dma, rbnode); + if (e->dma_addr <= dma_addr && e->dma_addr + e->size > dma_addr) { + spin_unlock(&xen_dma_lock); + return e; + } + if (dma_addr < e->dma_addr) + n = n->rb_left; + else + n = n->rb_right; + } + + spin_unlock(&xen_dma_lock); + return NULL; +} static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + int nr_seg; + unsigned long offset; + char* vaddr; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + return phys_to_machine(XPADDR(paddr)).maddr; + + vaddr = (char *) phys_to_virt(paddr); + if (vaddr >= xen_io_tlb_end || vaddr < xen_io_tlb_start) + return ~0; + + offset = vaddr - xen_io_tlb_start; + nr_seg = offset / (IO_TLB_SEGSIZE << IO_TLB_SHIFT); + + return xen_dma_seg[nr_seg].dma_addr + (paddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); } static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + if (xen_feature(XENFEAT_auto_translated_physmap)) + { + struct xen_dma *dma = xen_dma_retrieve(baddr); + if (dma == NULL) + return ~0; + else + return dma->phys_addr + (baddr & ((IO_TLB_SEGSIZE << IO_TLB_SHIFT) - 1)); + } else + return machine_to_phys(XMADDR(baddr)).paddr; } static dma_addr_t xen_virt_to_bus(void *address) @@ -107,6 +191,9 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) unsigned long pfn = mfn_to_local_pfn(mfn); phys_addr_t paddr; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + /* If the address is outside our domain, it CAN * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. @@ -124,13 +211,12 @@ static int max_dma_bits = 32; static int xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { - int i, rc; + int i, j, rc; int dma_bits; - dma_addr_t dma_handle; dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; - i = 0; + i = j = 0; do { int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); @@ -138,12 +224,16 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) rc = xen_create_contiguous_region( (unsigned long)buf + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits, &dma_handle); + dma_bits, &xen_dma_seg[j].dma_addr); + xen_dma_seg[j].phys_addr = virt_to_phys(buf + (i << IO_TLB_SHIFT)); + xen_dma_seg[j].size = slabs << IO_TLB_SHIFT; + xen_dma_insert(&xen_dma_seg[j]); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; i += slabs; + j++; } while (i < nslabs); return 0; } @@ -193,9 +283,10 @@ retry: /* * Get IO TLB memory from any location. */ - if (early) + if (early) { xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); - else { + xen_dma_seg = alloc_bootmem(sizeof(struct xen_dma) * NR_DMA_SEGS); + } else { #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { @@ -210,6 +301,7 @@ retry: xen_io_tlb_nslabs = SLABS_PER_PAGE << order; bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; } + xen_dma_seg = kzalloc(sizeof(struct xen_dma) * NR_DMA_SEGS, GFP_KERNEL); } if (!xen_io_tlb_start) { m_ret = XEN_SWIOTLB_ENOMEM; @@ -232,7 +324,6 @@ retry: m_ret = XEN_SWIOTLB_EFIXUP; goto error; } - start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); if (early) { if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose)) @@ -290,7 +381,8 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, phys = virt_to_phys(ret); dev_addr = xen_phys_to_bus(phys); - if (((dev_addr + size - 1 <= dma_mask)) && + if (!xen_feature(XENFEAT_auto_translated_physmap) && + ((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { @@ -321,8 +413,9 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, phys = virt_to_phys(vaddr); - if (((dev_addr + size - 1 > dma_mask)) || - range_straddles_page_boundary(phys, size)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + (((dev_addr + size - 1 > dma_mask)) || + range_straddles_page_boundary(phys, size))) xen_destroy_contiguous_region((unsigned long)vaddr, order); free_pages((unsigned long)vaddr, order); @@ -351,14 +444,15 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && + if (!xen_feature(XENFEAT_auto_translated_physmap) && + dma_capable(dev, dev_addr, size) && !range_straddles_page_boundary(phys, size) && !swiotlb_force) return dev_addr; /* * Oh well, have to allocate and map a bounce buffer. */ - map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); + map = swiotlb_tbl_map_single(dev, xen_dma_seg[0].dma_addr, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; @@ -494,10 +588,11 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, dma_addr_t dev_addr = xen_phys_to_bus(paddr); if (swiotlb_force || + xen_feature(XENFEAT_auto_translated_physmap) || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { phys_addr_t map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, + xen_dma_seg[0].dma_addr, sg_phys(sg), sg->length, dir);
Support autotranslate guests in swiotlb-xen by keeping track of the phys-to-bus and bus-to-phys mappings of the swiotlb buffer (xen_io_tlb_start-xen_io_tlb_end). Use a simple direct access on a pre-allocated array for phys-to-bus queries. Use a red-black tree for bus-to-phys queries. Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> CC: david.vrabel@citrix.com --- drivers/xen/swiotlb-xen.c | 127 +++++++++++++++++++++++++++++++++++++++------ 1 files changed, 111 insertions(+), 16 deletions(-)