Message ID | 20210521153253.518037-7-thomas.hellstrom@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Series | drm/i915: Move LMEM (VRAM) management over to TTM |
On Fri, 21 May 2021 at 16:33, Thomas Hellström <thomas.hellstrom@linux.intel.com> wrote: > > The internal ttm_bo_util memcpy uses ioremap functionality, and while it > probably might be possible to use it for copying in- and out of > sglist represented io memory, using io_mem_reserve() / io_mem_free() > callbacks, that would cause problems with fault(). > Instead, implement a method mapping page-by-page using kmap_local() > semantics. As an additional benefit we then avoid the occasional global > TLB flushes of ioremap() and consuming ioremap space, elimination of a > critical point of failure and with a slight change of semantics we could > also push the memcpy out async for testing and async driver development > purposes. > > A special linear iomem iterator is introduced internally to mimic the > old ioremap behaviour for code-paths that can't immediately be ported > over. This adds to the code size and should be considered a temporary > solution. > > Looking at the code we have a lot of checks for iomap tagged pointers. > Ideally we should extend the core memremap functions to also accept > uncached memory and kmap_local functionality. Then we could strip a > lot of code. > > Cc: Christian König <christian.koenig@amd.com> > Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> > --- > v3: > - Split up in various TTM files and addressed review comments by > Christian König. Tested and fixed legacy iomap memcpy path on i915. > --- > drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++------------------- > drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ > drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ > drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ > include/drm/ttm/ttm_bo_driver.h | 28 +++ > include/drm/ttm/ttm_caching.h | 2 + > include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ > include/drm/ttm/ttm_resource.h | 61 +++++++ > include/drm/ttm/ttm_tt.h | 16 ++ > 9 files changed, 508 insertions(+), 181 deletions(-) > create mode 100644 include/drm/ttm/ttm_kmap_iter.h > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c > index ae8b61460724..912cbe8e60a2 100644 > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device *bdev, > mem->bus.addr = NULL; > } > > -static int ttm_resource_ioremap(struct ttm_device *bdev, > - struct ttm_resource *mem, > - void **virtual) > +/** > + * ttm_move_memcpy - Helper to perform a memcpy ttm move operation. > + * @bo: The struct ttm_buffer_object. > + * @new_mem: The struct ttm_resource we're moving to (copy destination). > + * @new_iter: A struct ttm_kmap_iter representing the destination resource. > + * @src_iter: A struct ttm_kmap_iter representing the source resource. > + * > + * This function is intended to be able to move out async under a > + * dma-fence if desired. 
> + */ > +void ttm_move_memcpy(struct ttm_buffer_object *bo, > + struct ttm_resource *dst_mem, > + struct ttm_kmap_iter *dst_iter, > + struct ttm_kmap_iter *src_iter) > { > - int ret; > - void *addr; > - > - *virtual = NULL; > - ret = ttm_mem_io_reserve(bdev, mem); > - if (ret || !mem->bus.is_iomem) > - return ret; > + const struct ttm_kmap_iter_ops *dst_ops = dst_iter->ops; > + const struct ttm_kmap_iter_ops *src_ops = src_iter->ops; > + struct ttm_tt *ttm = bo->ttm; > + struct dma_buf_map src_map, dst_map; > + pgoff_t i; > > - if (mem->bus.addr) { > - addr = mem->bus.addr; > - } else { > - size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; > + /* Single TTM move. NOP */ > + if (dst_ops->maps_tt && src_ops->maps_tt) > + return; > > - if (mem->bus.caching == ttm_write_combined) > - addr = ioremap_wc(mem->bus.offset, bus_size); > -#ifdef CONFIG_X86 > - else if (mem->bus.caching == ttm_cached) > - addr = ioremap_cache(mem->bus.offset, bus_size); > -#endif > - else > - addr = ioremap(mem->bus.offset, bus_size); > - if (!addr) { > - ttm_mem_io_free(bdev, mem); > - return -ENOMEM; > + /* Don't move nonexistent data. Clear destination instead. */ > + if (src_ops->maps_tt && (!ttm || !ttm_tt_is_populated(ttm))) { > + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)) > + return; > + > + for (i = 0; i < dst_mem->num_pages; ++i) { > + dst_ops->map_local(dst_iter, &dst_map, i); > + if (dst_map.is_iomem) > + memset_io(dst_map.vaddr_iomem, 0, PAGE_SIZE); > + else > + memset(dst_map.vaddr, 0, PAGE_SIZE); > + if (dst_ops->unmap_local) > + dst_ops->unmap_local(dst_iter, &dst_map); > } > + return; > } > - *virtual = addr; > - return 0; > -} > - > -static void ttm_resource_iounmap(struct ttm_device *bdev, > - struct ttm_resource *mem, > - void *virtual) > -{ > - if (virtual && mem->bus.addr == NULL) > - iounmap(virtual); > - ttm_mem_io_free(bdev, mem); > -} > - > -static int ttm_copy_io_page(void *dst, void *src, unsigned long page) > -{ > - uint32_t *dstP = > - (uint32_t *) ((unsigned long)dst + (page << PAGE_SHIFT)); > - uint32_t *srcP = > - (uint32_t *) ((unsigned long)src + (page << PAGE_SHIFT)); > - > - int i; > - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) > - iowrite32(ioread32(srcP++), dstP++); > - return 0; > -} > - > -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, > - unsigned long page, > - pgprot_t prot) > -{ > - struct page *d = ttm->pages[page]; > - void *dst; > - > - if (!d) > - return -ENOMEM; > - > - src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); > - dst = kmap_atomic_prot(d, prot); > - if (!dst) > - return -ENOMEM; > - > - memcpy_fromio(dst, src, PAGE_SIZE); > - > - kunmap_atomic(dst); > - > - return 0; > -} > - > -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, > - unsigned long page, > - pgprot_t prot) > -{ > - struct page *s = ttm->pages[page]; > - void *src; > - > - if (!s) > - return -ENOMEM; > - > - dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); > - src = kmap_atomic_prot(s, prot); > - if (!src) > - return -ENOMEM; > > - memcpy_toio(dst, src, PAGE_SIZE); > - > - kunmap_atomic(src); > + for (i = 0; i < dst_mem->num_pages; ++i) { > + dst_ops->map_local(dst_iter, &dst_map, i); > + src_ops->map_local(src_iter, &src_map, i); > + > + if (!src_map.is_iomem && !dst_map.is_iomem) { > + memcpy(dst_map.vaddr, src_map.vaddr, PAGE_SIZE); > + } else if (!src_map.is_iomem) { > + dma_buf_map_memcpy_to(&dst_map, src_map.vaddr, > + PAGE_SIZE); > + } else if (!dst_map.is_iomem) { > + memcpy_fromio(dst_map.vaddr, 
src_map.vaddr_iomem, > + PAGE_SIZE); > + } else { > + int j; > + u32 __iomem *src = src_map.vaddr_iomem; > + u32 __iomem *dst = dst_map.vaddr_iomem; > > - return 0; > + for (j = 0; j < (PAGE_SIZE >> 2); ++j) IMO PAGE_SIZE / sizeof(u32) is easier to understand. > + iowrite32(ioread32(src++), dst++); > + } > + if (src_ops->unmap_local) > + src_ops->unmap_local(src_iter, &src_map); > + if (dst_ops->unmap_local) > + dst_ops->unmap_local(dst_iter, &dst_map); > + } > } > +EXPORT_SYMBOL(ttm_move_memcpy); > > int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, > struct ttm_operation_ctx *ctx, > - struct ttm_resource *new_mem) > + struct ttm_resource *dst_mem) > { > struct ttm_device *bdev = bo->bdev; > - struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type); > + struct ttm_resource_manager *dst_man = > + ttm_manager_type(bo->bdev, dst_mem->mem_type); > struct ttm_tt *ttm = bo->ttm; > - struct ttm_resource *old_mem = &bo->mem; > - struct ttm_resource old_copy = *old_mem; > - void *old_iomap; > - void *new_iomap; > + struct ttm_resource *src_mem = &bo->mem; > + struct ttm_resource_manager *src_man = > + ttm_manager_type(bdev, src_mem->mem_type); > + struct ttm_resource src_copy = *src_mem; > + union { > + struct ttm_kmap_iter_tt tt; > + struct ttm_kmap_iter_linear_io io; > + } _dst_iter, _src_iter; > + struct ttm_kmap_iter *dst_iter, *src_iter; > int ret; > - unsigned long i; > > - ret = ttm_bo_wait_ctx(bo, ctx); > - if (ret) > - return ret; > - > - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); > - if (ret) > - return ret; > - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); > - if (ret) > - goto out; > - > - /* > - * Single TTM move. NOP. > - */ > - if (old_iomap == NULL && new_iomap == NULL) > - goto out2; > - > - /* > - * Don't move nonexistent data. Clear destination instead. > - */ > - if (old_iomap == NULL && > - (ttm == NULL || (!ttm_tt_is_populated(ttm) && > - !(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)))) { > - memset_io(new_iomap, 0, new_mem->num_pages*PAGE_SIZE); > - goto out2; > - } > - > - /* > - * TTM might be null for moves within the same region. 
> - */ > - if (ttm) { > + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) || > + dst_man->use_tt)) { > ret = ttm_tt_populate(bdev, ttm, ctx); > if (ret) > - goto out1; > + return ret; > } > > - for (i = 0; i < new_mem->num_pages; ++i) { > - if (old_iomap == NULL) { > - pgprot_t prot = ttm_io_prot(bo, old_mem, PAGE_KERNEL); > - ret = ttm_copy_ttm_io_page(ttm, new_iomap, i, > - prot); > - } else if (new_iomap == NULL) { > - pgprot_t prot = ttm_io_prot(bo, new_mem, PAGE_KERNEL); > - ret = ttm_copy_io_ttm_page(ttm, old_iomap, i, > - prot); > - } else { > - ret = ttm_copy_io_page(new_iomap, old_iomap, i); > - } > - if (ret) > - goto out1; > + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, bdev, dst_mem); > + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) > + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm); > + if (IS_ERR(dst_iter)) > + return PTR_ERR(dst_iter); > + > + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, bdev, src_mem); > + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) > + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm); > + if (IS_ERR(src_iter)) { > + ret = PTR_ERR(src_iter); > + goto out_src_iter; > } > - mb(); > -out2: > - old_copy = *old_mem; > > - ttm_bo_assign_mem(bo, new_mem); > - > - if (!man->use_tt) > - ttm_bo_tt_destroy(bo); > + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); > + src_copy = *src_mem; > + ttm_bo_move_sync_cleanup(bo, dst_mem); > > -out1: > - ttm_resource_iounmap(bdev, old_mem, new_iomap); > -out: > - ttm_resource_iounmap(bdev, &old_copy, old_iomap); > + if (!src_iter->ops->maps_tt) > + ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, &src_copy); > +out_src_iter: > + if (!dst_iter->ops->maps_tt) > + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, bdev, dst_mem); > > - /* > - * On error, keep the mm node! > - */ > - if (!ret) > - ttm_resource_free(bo, &old_copy); > return ret; > } > EXPORT_SYMBOL(ttm_bo_move_memcpy); > @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct ttm_buffer_object *bo, struct ttm_resource *res, > man = ttm_manager_type(bo->bdev, res->mem_type); > caching = man->use_tt ? 
bo->ttm->caching : res->bus.caching; > > - /* Cached mappings need no adjustment */ > - if (caching == ttm_cached) > - return tmp; > - > -#if defined(__i386__) || defined(__x86_64__) > - if (caching == ttm_write_combined) > - tmp = pgprot_writecombine(tmp); > - else if (boot_cpu_data.x86 > 3) > - tmp = pgprot_noncached(tmp); > -#endif > -#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ > - defined(__powerpc__) || defined(__mips__) > - if (caching == ttm_write_combined) > - tmp = pgprot_writecombine(tmp); > - else > - tmp = pgprot_noncached(tmp); > -#endif > -#if defined(__sparc__) > - tmp = pgprot_noncached(tmp); > -#endif > - return tmp; > + return ttm_prot_from_caching(caching, tmp); > } > EXPORT_SYMBOL(ttm_io_prot); > > diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c > index 56b0efdba1a9..997c458f68a9 100644 > --- a/drivers/gpu/drm/ttm/ttm_module.c > +++ b/drivers/gpu/drm/ttm/ttm_module.c > @@ -31,12 +31,47 @@ > */ > #include <linux/module.h> > #include <linux/device.h> > +#include <linux/pgtable.h> > #include <linux/sched.h> > #include <linux/debugfs.h> > #include <drm/drm_sysfs.h> > +#include <drm/ttm/ttm_caching.h> > > #include "ttm_module.h" > > +/** > + * ttm_prot_from_caching - Modify the page protection according to the > + * ttm cacing mode > + * @caching: The ttm caching mode > + * @tmp: The original page protection > + * > + * Return: The modified page protection > + */ > +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp) > +{ > + /* Cached mappings need no adjustment */ > + if (caching == ttm_cached) > + return tmp; > + > +#if defined(__i386__) || defined(__x86_64__) > + if (caching == ttm_write_combined) > + tmp = pgprot_writecombine(tmp); > + else if (boot_cpu_data.x86 > 3) > + tmp = pgprot_noncached(tmp); > +#endif > +#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ > + defined(__powerpc__) || defined(__mips__) > + if (caching == ttm_write_combined) > + tmp = pgprot_writecombine(tmp); > + else > + tmp = pgprot_noncached(tmp); > +#endif > +#if defined(__sparc__) > + tmp = pgprot_noncached(tmp); > +#endif > + return tmp; > +} > + > struct dentry *ttm_debugfs_root; > > static int __init ttm_init(void) > diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c > index 59e2b7157e41..e05ae7e3d477 100644 > --- a/drivers/gpu/drm/ttm/ttm_resource.c > +++ b/drivers/gpu/drm/ttm/ttm_resource.c > @@ -22,6 +22,10 @@ > * Authors: Christian König > */ > > +#include <linux/dma-buf-map.h> > +#include <linux/io-mapping.h> > +#include <linux/scatterlist.h> > + > #include <drm/ttm/ttm_resource.h> > #include <drm/ttm/ttm_bo_driver.h> > > @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct ttm_resource_manager *man, > man->func->debug(man, p); > } > EXPORT_SYMBOL(ttm_resource_manager_debug); > + > +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter *iter, > + struct dma_buf_map *dmap, > + pgoff_t i) > +{ > + struct ttm_kmap_iter_iomap *iter_io = > + container_of(iter, typeof(*iter_io), base); > + void __iomem *addr; > + > +retry: > + while (i >= iter_io->cache.end) { > + iter_io->cache.sg = iter_io->cache.sg ? 
> + sg_next(iter_io->cache.sg) : iter_io->st->sgl; > + iter_io->cache.i = iter_io->cache.end; > + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >> > + PAGE_SHIFT; > + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) - > + iter_io->start; > + } > + > + if (i < iter_io->cache.i) { > + iter_io->cache.end = 0; > + iter_io->cache.sg = NULL; > + goto retry; > + } > + > + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs + > + (((resource_size_t)i - iter_io->cache.i) > + << PAGE_SHIFT)); > + dma_buf_map_set_vaddr_iomem(dmap, addr); > +} > + > +static void ttm_kmap_iter_iomap_unmap_local(struct ttm_kmap_iter *iter, > + struct dma_buf_map *map) > +{ > + io_mapping_unmap_local(map->vaddr_iomem); > +} > + > +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { > + .map_local = ttm_kmap_iter_iomap_map_local, > + .unmap_local = ttm_kmap_iter_iomap_unmap_local, > + .maps_tt = false, > +}; > + > +/** > + * ttm_kmap_iter_iomap_init - Initialize a struct ttm_kmap_iter_iomap > + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. > + * @iomap: The struct io_mapping representing the underlying linear io_memory. > + * @st: sg_table into @iomap, representing the memory of the struct > + * ttm_resource. > + * @start: Offset that needs to be subtracted from @st to make > + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. > + * > + * Return: Pointer to the embedded struct ttm_kmap_iter. > + */ > +struct ttm_kmap_iter * > +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, > + struct io_mapping *iomap, > + struct sg_table *st, > + resource_size_t start) > +{ > + iter_io->base.ops = &ttm_kmap_iter_io_ops; > + iter_io->iomap = iomap; > + iter_io->st = st; > + iter_io->start = start; > + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); > + > + return &iter_io->base; > +} > +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); > + > +/** > + * DOC: Linear io iterator > + * > + * This code should die in the not too near future. Best would be if we could > + * make io-mapping use memremap for all io memory, and have memremap > + * implement a kmap_local functionality. We could then strip a huge amount of > + * code. These linear io iterators are implemented to mimic old functionality, > + * and they don't use kmap_local semantics at all internally. Rather ioremap or > + * friends, and at least on 32-bit they add global TLB flushes and points > + * of failure. 
> + */ > + > +static void ttm_kmap_iter_linear_io_map_local(struct ttm_kmap_iter *iter, > + struct dma_buf_map *dmap, > + pgoff_t i) > +{ > + struct ttm_kmap_iter_linear_io *iter_io = > + container_of(iter, typeof(*iter_io), base); > + > + *dmap = iter_io->dmap; > + dma_buf_map_incr(dmap, i * PAGE_SIZE); > +} > + > +static const struct ttm_kmap_iter_ops ttm_kmap_iter_linear_io_ops = { > + .map_local = ttm_kmap_iter_linear_io_map_local, > + .maps_tt = false, > +}; > + > +struct ttm_kmap_iter * > +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io *iter_io, > + struct ttm_device *bdev, > + struct ttm_resource *mem) > +{ > + int ret; > + > + ret = ttm_mem_io_reserve(bdev, mem); > + if (ret) > + goto out_err; > + if (!mem->bus.is_iomem) { > + ret = -EINVAL; > + goto out_io_free; > + } > + > + if (mem->bus.addr) { > + dma_buf_map_set_vaddr(&iter_io->dmap, mem->bus.addr); > + iter_io->needs_unmap = false; > + } else { > + size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; > + > + iter_io->needs_unmap = true; > + if (mem->bus.caching == ttm_write_combined) > + dma_buf_map_set_vaddr_iomem(&iter_io->dmap, > + ioremap_wc(mem->bus.offset, > + bus_size)); > + else if (mem->bus.caching == ttm_cached) > + dma_buf_map_set_vaddr(&iter_io->dmap, > + memremap(mem->bus.offset, bus_size, > + MEMREMAP_WB)); The comments in set_vaddr suggest that this is meant for system-memory. Does that actually matter or is it just about not losing the __iomem annotation on platforms where it matters? Apparently cached device local is a thing. Also should this not be wrapped in CONFIG_X86?
On 5/25/21 11:18 AM, Matthew Auld wrote: > On Fri, 21 May 2021 at 16:33, Thomas Hellström > <thomas.hellstrom@linux.intel.com> wrote: >> The internal ttm_bo_util memcpy uses ioremap functionality, and while it >> probably might be possible to use it for copying in- and out of >> sglist represented io memory, using io_mem_reserve() / io_mem_free() >> callbacks, that would cause problems with fault(). >> Instead, implement a method mapping page-by-page using kmap_local() >> semantics. As an additional benefit we then avoid the occasional global >> TLB flushes of ioremap() and consuming ioremap space, elimination of a >> critical point of failure and with a slight change of semantics we could >> also push the memcpy out async for testing and async driver development >> purposes. >> >> A special linear iomem iterator is introduced internally to mimic the >> old ioremap behaviour for code-paths that can't immediately be ported >> over. This adds to the code size and should be considered a temporary >> solution. >> >> Looking at the code we have a lot of checks for iomap tagged pointers. >> Ideally we should extend the core memremap functions to also accept >> uncached memory and kmap_local functionality. Then we could strip a >> lot of code. >> >> Cc: Christian König <christian.koenig@amd.com> >> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> >> --- >> v3: >> - Split up in various TTM files and addressed review comments by >> Christian König. Tested and fixed legacy iomap memcpy path on i915. >> --- >> drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++------------------- >> drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ >> drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ >> drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ >> include/drm/ttm/ttm_bo_driver.h | 28 +++ >> include/drm/ttm/ttm_caching.h | 2 + >> include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ >> include/drm/ttm/ttm_resource.h | 61 +++++++ >> include/drm/ttm/ttm_tt.h | 16 ++ >> 9 files changed, 508 insertions(+), 181 deletions(-) >> create mode 100644 include/drm/ttm/ttm_kmap_iter.h >> >> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c >> index ae8b61460724..912cbe8e60a2 100644 >> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c >> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c >> @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device *bdev, >> mem->bus.addr = NULL; >> } >> >> -static int ttm_resource_ioremap(struct ttm_device *bdev, >> - struct ttm_resource *mem, >> - void **virtual) >> +/** >> + * ttm_move_memcpy - Helper to perform a memcpy ttm move operation. >> + * @bo: The struct ttm_buffer_object. >> + * @new_mem: The struct ttm_resource we're moving to (copy destination). >> + * @new_iter: A struct ttm_kmap_iter representing the destination resource. >> + * @src_iter: A struct ttm_kmap_iter representing the source resource. >> + * >> + * This function is intended to be able to move out async under a >> + * dma-fence if desired. 
>> + */ >> +void ttm_move_memcpy(struct ttm_buffer_object *bo, >> + struct ttm_resource *dst_mem, >> + struct ttm_kmap_iter *dst_iter, >> + struct ttm_kmap_iter *src_iter) >> { >> - int ret; >> - void *addr; >> - >> - *virtual = NULL; >> - ret = ttm_mem_io_reserve(bdev, mem); >> - if (ret || !mem->bus.is_iomem) >> - return ret; >> + const struct ttm_kmap_iter_ops *dst_ops = dst_iter->ops; >> + const struct ttm_kmap_iter_ops *src_ops = src_iter->ops; >> + struct ttm_tt *ttm = bo->ttm; >> + struct dma_buf_map src_map, dst_map; >> + pgoff_t i; >> >> - if (mem->bus.addr) { >> - addr = mem->bus.addr; >> - } else { >> - size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; >> + /* Single TTM move. NOP */ >> + if (dst_ops->maps_tt && src_ops->maps_tt) >> + return; >> >> - if (mem->bus.caching == ttm_write_combined) >> - addr = ioremap_wc(mem->bus.offset, bus_size); >> -#ifdef CONFIG_X86 >> - else if (mem->bus.caching == ttm_cached) >> - addr = ioremap_cache(mem->bus.offset, bus_size); >> -#endif >> - else >> - addr = ioremap(mem->bus.offset, bus_size); >> - if (!addr) { >> - ttm_mem_io_free(bdev, mem); >> - return -ENOMEM; >> + /* Don't move nonexistent data. Clear destination instead. */ >> + if (src_ops->maps_tt && (!ttm || !ttm_tt_is_populated(ttm))) { >> + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)) >> + return; >> + >> + for (i = 0; i < dst_mem->num_pages; ++i) { >> + dst_ops->map_local(dst_iter, &dst_map, i); >> + if (dst_map.is_iomem) >> + memset_io(dst_map.vaddr_iomem, 0, PAGE_SIZE); >> + else >> + memset(dst_map.vaddr, 0, PAGE_SIZE); >> + if (dst_ops->unmap_local) >> + dst_ops->unmap_local(dst_iter, &dst_map); >> } >> + return; >> } >> - *virtual = addr; >> - return 0; >> -} >> - >> -static void ttm_resource_iounmap(struct ttm_device *bdev, >> - struct ttm_resource *mem, >> - void *virtual) >> -{ >> - if (virtual && mem->bus.addr == NULL) >> - iounmap(virtual); >> - ttm_mem_io_free(bdev, mem); >> -} >> - >> -static int ttm_copy_io_page(void *dst, void *src, unsigned long page) >> -{ >> - uint32_t *dstP = >> - (uint32_t *) ((unsigned long)dst + (page << PAGE_SHIFT)); >> - uint32_t *srcP = >> - (uint32_t *) ((unsigned long)src + (page << PAGE_SHIFT)); >> - >> - int i; >> - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) >> - iowrite32(ioread32(srcP++), dstP++); >> - return 0; >> -} >> - >> -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, >> - unsigned long page, >> - pgprot_t prot) >> -{ >> - struct page *d = ttm->pages[page]; >> - void *dst; >> - >> - if (!d) >> - return -ENOMEM; >> - >> - src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); >> - dst = kmap_atomic_prot(d, prot); >> - if (!dst) >> - return -ENOMEM; >> - >> - memcpy_fromio(dst, src, PAGE_SIZE); >> - >> - kunmap_atomic(dst); >> - >> - return 0; >> -} >> - >> -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, >> - unsigned long page, >> - pgprot_t prot) >> -{ >> - struct page *s = ttm->pages[page]; >> - void *src; >> - >> - if (!s) >> - return -ENOMEM; >> - >> - dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); >> - src = kmap_atomic_prot(s, prot); >> - if (!src) >> - return -ENOMEM; >> >> - memcpy_toio(dst, src, PAGE_SIZE); >> - >> - kunmap_atomic(src); >> + for (i = 0; i < dst_mem->num_pages; ++i) { >> + dst_ops->map_local(dst_iter, &dst_map, i); >> + src_ops->map_local(src_iter, &src_map, i); >> + >> + if (!src_map.is_iomem && !dst_map.is_iomem) { >> + memcpy(dst_map.vaddr, src_map.vaddr, PAGE_SIZE); >> + } else if (!src_map.is_iomem) { >> + 
dma_buf_map_memcpy_to(&dst_map, src_map.vaddr, >> + PAGE_SIZE); >> + } else if (!dst_map.is_iomem) { >> + memcpy_fromio(dst_map.vaddr, src_map.vaddr_iomem, >> + PAGE_SIZE); >> + } else { >> + int j; >> + u32 __iomem *src = src_map.vaddr_iomem; >> + u32 __iomem *dst = dst_map.vaddr_iomem; >> >> - return 0; >> + for (j = 0; j < (PAGE_SIZE >> 2); ++j) > IMO PAGE_SIZE / sizeof(u32) is easier to understand. OK, will fix. > >> + iowrite32(ioread32(src++), dst++); >> + } >> + if (src_ops->unmap_local) >> + src_ops->unmap_local(src_iter, &src_map); >> + if (dst_ops->unmap_local) >> + dst_ops->unmap_local(dst_iter, &dst_map); >> + } >> } >> +EXPORT_SYMBOL(ttm_move_memcpy); >> >> int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, >> struct ttm_operation_ctx *ctx, >> - struct ttm_resource *new_mem) >> + struct ttm_resource *dst_mem) >> { >> struct ttm_device *bdev = bo->bdev; >> - struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type); >> + struct ttm_resource_manager *dst_man = >> + ttm_manager_type(bo->bdev, dst_mem->mem_type); >> struct ttm_tt *ttm = bo->ttm; >> - struct ttm_resource *old_mem = &bo->mem; >> - struct ttm_resource old_copy = *old_mem; >> - void *old_iomap; >> - void *new_iomap; >> + struct ttm_resource *src_mem = &bo->mem; >> + struct ttm_resource_manager *src_man = >> + ttm_manager_type(bdev, src_mem->mem_type); >> + struct ttm_resource src_copy = *src_mem; >> + union { >> + struct ttm_kmap_iter_tt tt; >> + struct ttm_kmap_iter_linear_io io; >> + } _dst_iter, _src_iter; >> + struct ttm_kmap_iter *dst_iter, *src_iter; >> int ret; >> - unsigned long i; >> >> - ret = ttm_bo_wait_ctx(bo, ctx); >> - if (ret) >> - return ret; >> - >> - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); >> - if (ret) >> - return ret; >> - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); >> - if (ret) >> - goto out; >> - >> - /* >> - * Single TTM move. NOP. >> - */ >> - if (old_iomap == NULL && new_iomap == NULL) >> - goto out2; >> - >> - /* >> - * Don't move nonexistent data. Clear destination instead. >> - */ >> - if (old_iomap == NULL && >> - (ttm == NULL || (!ttm_tt_is_populated(ttm) && >> - !(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)))) { >> - memset_io(new_iomap, 0, new_mem->num_pages*PAGE_SIZE); >> - goto out2; >> - } >> - >> - /* >> - * TTM might be null for moves within the same region. 
>> - */ >> - if (ttm) { >> + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) || >> + dst_man->use_tt)) { >> ret = ttm_tt_populate(bdev, ttm, ctx); >> if (ret) >> - goto out1; >> + return ret; >> } >> >> - for (i = 0; i < new_mem->num_pages; ++i) { >> - if (old_iomap == NULL) { >> - pgprot_t prot = ttm_io_prot(bo, old_mem, PAGE_KERNEL); >> - ret = ttm_copy_ttm_io_page(ttm, new_iomap, i, >> - prot); >> - } else if (new_iomap == NULL) { >> - pgprot_t prot = ttm_io_prot(bo, new_mem, PAGE_KERNEL); >> - ret = ttm_copy_io_ttm_page(ttm, old_iomap, i, >> - prot); >> - } else { >> - ret = ttm_copy_io_page(new_iomap, old_iomap, i); >> - } >> - if (ret) >> - goto out1; >> + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, bdev, dst_mem); >> + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) >> + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm); >> + if (IS_ERR(dst_iter)) >> + return PTR_ERR(dst_iter); >> + >> + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, bdev, src_mem); >> + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) >> + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm); >> + if (IS_ERR(src_iter)) { >> + ret = PTR_ERR(src_iter); >> + goto out_src_iter; >> } >> - mb(); >> -out2: >> - old_copy = *old_mem; >> >> - ttm_bo_assign_mem(bo, new_mem); >> - >> - if (!man->use_tt) >> - ttm_bo_tt_destroy(bo); >> + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); >> + src_copy = *src_mem; >> + ttm_bo_move_sync_cleanup(bo, dst_mem); >> >> -out1: >> - ttm_resource_iounmap(bdev, old_mem, new_iomap); >> -out: >> - ttm_resource_iounmap(bdev, &old_copy, old_iomap); >> + if (!src_iter->ops->maps_tt) >> + ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, &src_copy); >> +out_src_iter: >> + if (!dst_iter->ops->maps_tt) >> + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, bdev, dst_mem); >> >> - /* >> - * On error, keep the mm node! >> - */ >> - if (!ret) >> - ttm_resource_free(bo, &old_copy); >> return ret; >> } >> EXPORT_SYMBOL(ttm_bo_move_memcpy); >> @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct ttm_buffer_object *bo, struct ttm_resource *res, >> man = ttm_manager_type(bo->bdev, res->mem_type); >> caching = man->use_tt ? 
bo->ttm->caching : res->bus.caching; >> >> - /* Cached mappings need no adjustment */ >> - if (caching == ttm_cached) >> - return tmp; >> - >> -#if defined(__i386__) || defined(__x86_64__) >> - if (caching == ttm_write_combined) >> - tmp = pgprot_writecombine(tmp); >> - else if (boot_cpu_data.x86 > 3) >> - tmp = pgprot_noncached(tmp); >> -#endif >> -#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ >> - defined(__powerpc__) || defined(__mips__) >> - if (caching == ttm_write_combined) >> - tmp = pgprot_writecombine(tmp); >> - else >> - tmp = pgprot_noncached(tmp); >> -#endif >> -#if defined(__sparc__) >> - tmp = pgprot_noncached(tmp); >> -#endif >> - return tmp; >> + return ttm_prot_from_caching(caching, tmp); >> } >> EXPORT_SYMBOL(ttm_io_prot); >> >> diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c >> index 56b0efdba1a9..997c458f68a9 100644 >> --- a/drivers/gpu/drm/ttm/ttm_module.c >> +++ b/drivers/gpu/drm/ttm/ttm_module.c >> @@ -31,12 +31,47 @@ >> */ >> #include <linux/module.h> >> #include <linux/device.h> >> +#include <linux/pgtable.h> >> #include <linux/sched.h> >> #include <linux/debugfs.h> >> #include <drm/drm_sysfs.h> >> +#include <drm/ttm/ttm_caching.h> >> >> #include "ttm_module.h" >> >> +/** >> + * ttm_prot_from_caching - Modify the page protection according to the >> + * ttm cacing mode >> + * @caching: The ttm caching mode >> + * @tmp: The original page protection >> + * >> + * Return: The modified page protection >> + */ >> +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp) >> +{ >> + /* Cached mappings need no adjustment */ >> + if (caching == ttm_cached) >> + return tmp; >> + >> +#if defined(__i386__) || defined(__x86_64__) >> + if (caching == ttm_write_combined) >> + tmp = pgprot_writecombine(tmp); >> + else if (boot_cpu_data.x86 > 3) >> + tmp = pgprot_noncached(tmp); >> +#endif >> +#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ >> + defined(__powerpc__) || defined(__mips__) >> + if (caching == ttm_write_combined) >> + tmp = pgprot_writecombine(tmp); >> + else >> + tmp = pgprot_noncached(tmp); >> +#endif >> +#if defined(__sparc__) >> + tmp = pgprot_noncached(tmp); >> +#endif >> + return tmp; >> +} >> + >> struct dentry *ttm_debugfs_root; >> >> static int __init ttm_init(void) >> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c >> index 59e2b7157e41..e05ae7e3d477 100644 >> --- a/drivers/gpu/drm/ttm/ttm_resource.c >> +++ b/drivers/gpu/drm/ttm/ttm_resource.c >> @@ -22,6 +22,10 @@ >> * Authors: Christian König >> */ >> >> +#include <linux/dma-buf-map.h> >> +#include <linux/io-mapping.h> >> +#include <linux/scatterlist.h> >> + >> #include <drm/ttm/ttm_resource.h> >> #include <drm/ttm/ttm_bo_driver.h> >> >> @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct ttm_resource_manager *man, >> man->func->debug(man, p); >> } >> EXPORT_SYMBOL(ttm_resource_manager_debug); >> + >> +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter *iter, >> + struct dma_buf_map *dmap, >> + pgoff_t i) >> +{ >> + struct ttm_kmap_iter_iomap *iter_io = >> + container_of(iter, typeof(*iter_io), base); >> + void __iomem *addr; >> + >> +retry: >> + while (i >= iter_io->cache.end) { >> + iter_io->cache.sg = iter_io->cache.sg ? 
>> + sg_next(iter_io->cache.sg) : iter_io->st->sgl; >> + iter_io->cache.i = iter_io->cache.end; >> + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >> >> + PAGE_SHIFT; >> + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) - >> + iter_io->start; >> + } >> + >> + if (i < iter_io->cache.i) { >> + iter_io->cache.end = 0; >> + iter_io->cache.sg = NULL; >> + goto retry; >> + } >> + >> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs + >> + (((resource_size_t)i - iter_io->cache.i) >> + << PAGE_SHIFT)); >> + dma_buf_map_set_vaddr_iomem(dmap, addr); >> +} >> + >> +static void ttm_kmap_iter_iomap_unmap_local(struct ttm_kmap_iter *iter, >> + struct dma_buf_map *map) >> +{ >> + io_mapping_unmap_local(map->vaddr_iomem); >> +} >> + >> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { >> + .map_local = ttm_kmap_iter_iomap_map_local, >> + .unmap_local = ttm_kmap_iter_iomap_unmap_local, >> + .maps_tt = false, >> +}; >> + >> +/** >> + * ttm_kmap_iter_iomap_init - Initialize a struct ttm_kmap_iter_iomap >> + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. >> + * @iomap: The struct io_mapping representing the underlying linear io_memory. >> + * @st: sg_table into @iomap, representing the memory of the struct >> + * ttm_resource. >> + * @start: Offset that needs to be subtracted from @st to make >> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. >> + * >> + * Return: Pointer to the embedded struct ttm_kmap_iter. >> + */ >> +struct ttm_kmap_iter * >> +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, >> + struct io_mapping *iomap, >> + struct sg_table *st, >> + resource_size_t start) >> +{ >> + iter_io->base.ops = &ttm_kmap_iter_io_ops; >> + iter_io->iomap = iomap; >> + iter_io->st = st; >> + iter_io->start = start; >> + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); >> + >> + return &iter_io->base; >> +} >> +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); >> + >> +/** >> + * DOC: Linear io iterator >> + * >> + * This code should die in the not too near future. Best would be if we could >> + * make io-mapping use memremap for all io memory, and have memremap >> + * implement a kmap_local functionality. We could then strip a huge amount of >> + * code. These linear io iterators are implemented to mimic old functionality, >> + * and they don't use kmap_local semantics at all internally. Rather ioremap or >> + * friends, and at least on 32-bit they add global TLB flushes and points >> + * of failure. 
>> + */ >> + >> +static void ttm_kmap_iter_linear_io_map_local(struct ttm_kmap_iter *iter, >> + struct dma_buf_map *dmap, >> + pgoff_t i) >> +{ >> + struct ttm_kmap_iter_linear_io *iter_io = >> + container_of(iter, typeof(*iter_io), base); >> + >> + *dmap = iter_io->dmap; >> + dma_buf_map_incr(dmap, i * PAGE_SIZE); >> +} >> + >> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_linear_io_ops = { >> + .map_local = ttm_kmap_iter_linear_io_map_local, >> + .maps_tt = false, >> +}; >> + >> +struct ttm_kmap_iter * >> +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io *iter_io, >> + struct ttm_device *bdev, >> + struct ttm_resource *mem) >> +{ >> + int ret; >> + >> + ret = ttm_mem_io_reserve(bdev, mem); >> + if (ret) >> + goto out_err; >> + if (!mem->bus.is_iomem) { >> + ret = -EINVAL; >> + goto out_io_free; >> + } >> + >> + if (mem->bus.addr) { >> + dma_buf_map_set_vaddr(&iter_io->dmap, mem->bus.addr); >> + iter_io->needs_unmap = false; >> + } else { >> + size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; >> + >> + iter_io->needs_unmap = true; >> + if (mem->bus.caching == ttm_write_combined) >> + dma_buf_map_set_vaddr_iomem(&iter_io->dmap, >> + ioremap_wc(mem->bus.offset, >> + bus_size)); >> + else if (mem->bus.caching == ttm_cached) >> + dma_buf_map_set_vaddr(&iter_io->dmap, >> + memremap(mem->bus.offset, bus_size, >> + MEMREMAP_WB)); > The comments in set_vaddr suggest that this is meant for > system-memory. Does that actually matter or is it just about not > losing the __iomem annotation on platforms where it matters? Yes, it's the latter. dma_buf_map() is relatively new and the author probably didn't think about the case of cached iomem, which is used by, for example, vmwgfx. > Apparently cached device local is a thing. Also should this not be > wrapped in CONFIG_X86? Both dma_buf_map() and memremap are generic, I think, I guess memremap would return NULL if it's not supported. /Thomas
On Tue, 25 May 2021 at 10:32, Thomas Hellström <thomas.hellstrom@linux.intel.com> wrote: > > > On 5/25/21 11:18 AM, Matthew Auld wrote: > > On Fri, 21 May 2021 at 16:33, Thomas Hellström > > <thomas.hellstrom@linux.intel.com> wrote: > >> The internal ttm_bo_util memcpy uses ioremap functionality, and while it > >> probably might be possible to use it for copying in- and out of > >> sglist represented io memory, using io_mem_reserve() / io_mem_free() > >> callbacks, that would cause problems with fault(). > >> Instead, implement a method mapping page-by-page using kmap_local() > >> semantics. As an additional benefit we then avoid the occasional global > >> TLB flushes of ioremap() and consuming ioremap space, elimination of a > >> critical point of failure and with a slight change of semantics we could > >> also push the memcpy out async for testing and async driver development > >> purposes. > >> > >> A special linear iomem iterator is introduced internally to mimic the > >> old ioremap behaviour for code-paths that can't immediately be ported > >> over. This adds to the code size and should be considered a temporary > >> solution. > >> > >> Looking at the code we have a lot of checks for iomap tagged pointers. > >> Ideally we should extend the core memremap functions to also accept > >> uncached memory and kmap_local functionality. Then we could strip a > >> lot of code. > >> > >> Cc: Christian König <christian.koenig@amd.com> > >> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> > >> --- > >> v3: > >> - Split up in various TTM files and addressed review comments by > >> Christian König. Tested and fixed legacy iomap memcpy path on i915. > >> --- > >> drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++------------------- > >> drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ > >> drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ > >> drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ > >> include/drm/ttm/ttm_bo_driver.h | 28 +++ > >> include/drm/ttm/ttm_caching.h | 2 + > >> include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ > >> include/drm/ttm/ttm_resource.h | 61 +++++++ > >> include/drm/ttm/ttm_tt.h | 16 ++ > >> 9 files changed, 508 insertions(+), 181 deletions(-) > >> create mode 100644 include/drm/ttm/ttm_kmap_iter.h > >> > >> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c > >> index ae8b61460724..912cbe8e60a2 100644 > >> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > >> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > >> @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device *bdev, > >> mem->bus.addr = NULL; > >> } > >> > >> -static int ttm_resource_ioremap(struct ttm_device *bdev, > >> - struct ttm_resource *mem, > >> - void **virtual) > >> +/** > >> + * ttm_move_memcpy - Helper to perform a memcpy ttm move operation. > >> + * @bo: The struct ttm_buffer_object. > >> + * @new_mem: The struct ttm_resource we're moving to (copy destination). > >> + * @new_iter: A struct ttm_kmap_iter representing the destination resource. > >> + * @src_iter: A struct ttm_kmap_iter representing the source resource. > >> + * > >> + * This function is intended to be able to move out async under a > >> + * dma-fence if desired. 
> >> + */ > >> +void ttm_move_memcpy(struct ttm_buffer_object *bo, > >> + struct ttm_resource *dst_mem, > >> + struct ttm_kmap_iter *dst_iter, > >> + struct ttm_kmap_iter *src_iter) > >> { > >> - int ret; > >> - void *addr; > >> - > >> - *virtual = NULL; > >> - ret = ttm_mem_io_reserve(bdev, mem); > >> - if (ret || !mem->bus.is_iomem) > >> - return ret; > >> + const struct ttm_kmap_iter_ops *dst_ops = dst_iter->ops; > >> + const struct ttm_kmap_iter_ops *src_ops = src_iter->ops; > >> + struct ttm_tt *ttm = bo->ttm; > >> + struct dma_buf_map src_map, dst_map; > >> + pgoff_t i; > >> > >> - if (mem->bus.addr) { > >> - addr = mem->bus.addr; > >> - } else { > >> - size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; > >> + /* Single TTM move. NOP */ > >> + if (dst_ops->maps_tt && src_ops->maps_tt) > >> + return; > >> > >> - if (mem->bus.caching == ttm_write_combined) > >> - addr = ioremap_wc(mem->bus.offset, bus_size); > >> -#ifdef CONFIG_X86 > >> - else if (mem->bus.caching == ttm_cached) > >> - addr = ioremap_cache(mem->bus.offset, bus_size); > >> -#endif > >> - else > >> - addr = ioremap(mem->bus.offset, bus_size); > >> - if (!addr) { > >> - ttm_mem_io_free(bdev, mem); > >> - return -ENOMEM; > >> + /* Don't move nonexistent data. Clear destination instead. */ > >> + if (src_ops->maps_tt && (!ttm || !ttm_tt_is_populated(ttm))) { > >> + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)) > >> + return; > >> + > >> + for (i = 0; i < dst_mem->num_pages; ++i) { > >> + dst_ops->map_local(dst_iter, &dst_map, i); > >> + if (dst_map.is_iomem) > >> + memset_io(dst_map.vaddr_iomem, 0, PAGE_SIZE); > >> + else > >> + memset(dst_map.vaddr, 0, PAGE_SIZE); > >> + if (dst_ops->unmap_local) > >> + dst_ops->unmap_local(dst_iter, &dst_map); > >> } > >> + return; > >> } > >> - *virtual = addr; > >> - return 0; > >> -} > >> - > >> -static void ttm_resource_iounmap(struct ttm_device *bdev, > >> - struct ttm_resource *mem, > >> - void *virtual) > >> -{ > >> - if (virtual && mem->bus.addr == NULL) > >> - iounmap(virtual); > >> - ttm_mem_io_free(bdev, mem); > >> -} > >> - > >> -static int ttm_copy_io_page(void *dst, void *src, unsigned long page) > >> -{ > >> - uint32_t *dstP = > >> - (uint32_t *) ((unsigned long)dst + (page << PAGE_SHIFT)); > >> - uint32_t *srcP = > >> - (uint32_t *) ((unsigned long)src + (page << PAGE_SHIFT)); > >> - > >> - int i; > >> - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) > >> - iowrite32(ioread32(srcP++), dstP++); > >> - return 0; > >> -} > >> - > >> -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, > >> - unsigned long page, > >> - pgprot_t prot) > >> -{ > >> - struct page *d = ttm->pages[page]; > >> - void *dst; > >> - > >> - if (!d) > >> - return -ENOMEM; > >> - > >> - src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); > >> - dst = kmap_atomic_prot(d, prot); > >> - if (!dst) > >> - return -ENOMEM; > >> - > >> - memcpy_fromio(dst, src, PAGE_SIZE); > >> - > >> - kunmap_atomic(dst); > >> - > >> - return 0; > >> -} > >> - > >> -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, > >> - unsigned long page, > >> - pgprot_t prot) > >> -{ > >> - struct page *s = ttm->pages[page]; > >> - void *src; > >> - > >> - if (!s) > >> - return -ENOMEM; > >> - > >> - dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); > >> - src = kmap_atomic_prot(s, prot); > >> - if (!src) > >> - return -ENOMEM; > >> > >> - memcpy_toio(dst, src, PAGE_SIZE); > >> - > >> - kunmap_atomic(src); > >> + for (i = 0; i < dst_mem->num_pages; ++i) { > >> + 
dst_ops->map_local(dst_iter, &dst_map, i); > >> + src_ops->map_local(src_iter, &src_map, i); > >> + > >> + if (!src_map.is_iomem && !dst_map.is_iomem) { > >> + memcpy(dst_map.vaddr, src_map.vaddr, PAGE_SIZE); > >> + } else if (!src_map.is_iomem) { > >> + dma_buf_map_memcpy_to(&dst_map, src_map.vaddr, > >> + PAGE_SIZE); > >> + } else if (!dst_map.is_iomem) { > >> + memcpy_fromio(dst_map.vaddr, src_map.vaddr_iomem, > >> + PAGE_SIZE); > >> + } else { > >> + int j; > >> + u32 __iomem *src = src_map.vaddr_iomem; > >> + u32 __iomem *dst = dst_map.vaddr_iomem; > >> > >> - return 0; > >> + for (j = 0; j < (PAGE_SIZE >> 2); ++j) > > IMO PAGE_SIZE / sizeof(u32) is easier to understand. > > OK, will fix. > > > > > >> + iowrite32(ioread32(src++), dst++); > >> + } > >> + if (src_ops->unmap_local) > >> + src_ops->unmap_local(src_iter, &src_map); > >> + if (dst_ops->unmap_local) > >> + dst_ops->unmap_local(dst_iter, &dst_map); > >> + } > >> } > >> +EXPORT_SYMBOL(ttm_move_memcpy); > >> > >> int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, > >> struct ttm_operation_ctx *ctx, > >> - struct ttm_resource *new_mem) > >> + struct ttm_resource *dst_mem) > >> { > >> struct ttm_device *bdev = bo->bdev; > >> - struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type); > >> + struct ttm_resource_manager *dst_man = > >> + ttm_manager_type(bo->bdev, dst_mem->mem_type); > >> struct ttm_tt *ttm = bo->ttm; > >> - struct ttm_resource *old_mem = &bo->mem; > >> - struct ttm_resource old_copy = *old_mem; > >> - void *old_iomap; > >> - void *new_iomap; > >> + struct ttm_resource *src_mem = &bo->mem; > >> + struct ttm_resource_manager *src_man = > >> + ttm_manager_type(bdev, src_mem->mem_type); > >> + struct ttm_resource src_copy = *src_mem; > >> + union { > >> + struct ttm_kmap_iter_tt tt; > >> + struct ttm_kmap_iter_linear_io io; > >> + } _dst_iter, _src_iter; > >> + struct ttm_kmap_iter *dst_iter, *src_iter; > >> int ret; > >> - unsigned long i; > >> > >> - ret = ttm_bo_wait_ctx(bo, ctx); > >> - if (ret) > >> - return ret; > >> - > >> - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); > >> - if (ret) > >> - return ret; > >> - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); > >> - if (ret) > >> - goto out; > >> - > >> - /* > >> - * Single TTM move. NOP. > >> - */ > >> - if (old_iomap == NULL && new_iomap == NULL) > >> - goto out2; > >> - > >> - /* > >> - * Don't move nonexistent data. Clear destination instead. > >> - */ > >> - if (old_iomap == NULL && > >> - (ttm == NULL || (!ttm_tt_is_populated(ttm) && > >> - !(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)))) { > >> - memset_io(new_iomap, 0, new_mem->num_pages*PAGE_SIZE); > >> - goto out2; > >> - } > >> - > >> - /* > >> - * TTM might be null for moves within the same region. 
> >> - */ > >> - if (ttm) { > >> + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) || > >> + dst_man->use_tt)) { > >> ret = ttm_tt_populate(bdev, ttm, ctx); > >> if (ret) > >> - goto out1; > >> + return ret; > >> } > >> > >> - for (i = 0; i < new_mem->num_pages; ++i) { > >> - if (old_iomap == NULL) { > >> - pgprot_t prot = ttm_io_prot(bo, old_mem, PAGE_KERNEL); > >> - ret = ttm_copy_ttm_io_page(ttm, new_iomap, i, > >> - prot); > >> - } else if (new_iomap == NULL) { > >> - pgprot_t prot = ttm_io_prot(bo, new_mem, PAGE_KERNEL); > >> - ret = ttm_copy_io_ttm_page(ttm, old_iomap, i, > >> - prot); > >> - } else { > >> - ret = ttm_copy_io_page(new_iomap, old_iomap, i); > >> - } > >> - if (ret) > >> - goto out1; > >> + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, bdev, dst_mem); > >> + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) > >> + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm); > >> + if (IS_ERR(dst_iter)) > >> + return PTR_ERR(dst_iter); > >> + > >> + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, bdev, src_mem); > >> + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) > >> + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm); > >> + if (IS_ERR(src_iter)) { > >> + ret = PTR_ERR(src_iter); > >> + goto out_src_iter; > >> } > >> - mb(); > >> -out2: > >> - old_copy = *old_mem; > >> > >> - ttm_bo_assign_mem(bo, new_mem); > >> - > >> - if (!man->use_tt) > >> - ttm_bo_tt_destroy(bo); > >> + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); > >> + src_copy = *src_mem; > >> + ttm_bo_move_sync_cleanup(bo, dst_mem); > >> > >> -out1: > >> - ttm_resource_iounmap(bdev, old_mem, new_iomap); > >> -out: > >> - ttm_resource_iounmap(bdev, &old_copy, old_iomap); > >> + if (!src_iter->ops->maps_tt) > >> + ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, &src_copy); > >> +out_src_iter: > >> + if (!dst_iter->ops->maps_tt) > >> + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, bdev, dst_mem); > >> > >> - /* > >> - * On error, keep the mm node! > >> - */ > >> - if (!ret) > >> - ttm_resource_free(bo, &old_copy); > >> return ret; > >> } > >> EXPORT_SYMBOL(ttm_bo_move_memcpy); > >> @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct ttm_buffer_object *bo, struct ttm_resource *res, > >> man = ttm_manager_type(bo->bdev, res->mem_type); > >> caching = man->use_tt ? 
bo->ttm->caching : res->bus.caching; > >> > >> - /* Cached mappings need no adjustment */ > >> - if (caching == ttm_cached) > >> - return tmp; > >> - > >> -#if defined(__i386__) || defined(__x86_64__) > >> - if (caching == ttm_write_combined) > >> - tmp = pgprot_writecombine(tmp); > >> - else if (boot_cpu_data.x86 > 3) > >> - tmp = pgprot_noncached(tmp); > >> -#endif > >> -#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ > >> - defined(__powerpc__) || defined(__mips__) > >> - if (caching == ttm_write_combined) > >> - tmp = pgprot_writecombine(tmp); > >> - else > >> - tmp = pgprot_noncached(tmp); > >> -#endif > >> -#if defined(__sparc__) > >> - tmp = pgprot_noncached(tmp); > >> -#endif > >> - return tmp; > >> + return ttm_prot_from_caching(caching, tmp); > >> } > >> EXPORT_SYMBOL(ttm_io_prot); > >> > >> diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c > >> index 56b0efdba1a9..997c458f68a9 100644 > >> --- a/drivers/gpu/drm/ttm/ttm_module.c > >> +++ b/drivers/gpu/drm/ttm/ttm_module.c > >> @@ -31,12 +31,47 @@ > >> */ > >> #include <linux/module.h> > >> #include <linux/device.h> > >> +#include <linux/pgtable.h> > >> #include <linux/sched.h> > >> #include <linux/debugfs.h> > >> #include <drm/drm_sysfs.h> > >> +#include <drm/ttm/ttm_caching.h> > >> > >> #include "ttm_module.h" > >> > >> +/** > >> + * ttm_prot_from_caching - Modify the page protection according to the > >> + * ttm cacing mode > >> + * @caching: The ttm caching mode > >> + * @tmp: The original page protection > >> + * > >> + * Return: The modified page protection > >> + */ > >> +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp) > >> +{ > >> + /* Cached mappings need no adjustment */ > >> + if (caching == ttm_cached) > >> + return tmp; > >> + > >> +#if defined(__i386__) || defined(__x86_64__) > >> + if (caching == ttm_write_combined) > >> + tmp = pgprot_writecombine(tmp); > >> + else if (boot_cpu_data.x86 > 3) > >> + tmp = pgprot_noncached(tmp); > >> +#endif > >> +#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ > >> + defined(__powerpc__) || defined(__mips__) > >> + if (caching == ttm_write_combined) > >> + tmp = pgprot_writecombine(tmp); > >> + else > >> + tmp = pgprot_noncached(tmp); > >> +#endif > >> +#if defined(__sparc__) > >> + tmp = pgprot_noncached(tmp); > >> +#endif > >> + return tmp; > >> +} > >> + > >> struct dentry *ttm_debugfs_root; > >> > >> static int __init ttm_init(void) > >> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c > >> index 59e2b7157e41..e05ae7e3d477 100644 > >> --- a/drivers/gpu/drm/ttm/ttm_resource.c > >> +++ b/drivers/gpu/drm/ttm/ttm_resource.c > >> @@ -22,6 +22,10 @@ > >> * Authors: Christian König > >> */ > >> > >> +#include <linux/dma-buf-map.h> > >> +#include <linux/io-mapping.h> > >> +#include <linux/scatterlist.h> > >> + > >> #include <drm/ttm/ttm_resource.h> > >> #include <drm/ttm/ttm_bo_driver.h> > >> > >> @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct ttm_resource_manager *man, > >> man->func->debug(man, p); > >> } > >> EXPORT_SYMBOL(ttm_resource_manager_debug); > >> + > >> +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter *iter, > >> + struct dma_buf_map *dmap, > >> + pgoff_t i) > >> +{ > >> + struct ttm_kmap_iter_iomap *iter_io = > >> + container_of(iter, typeof(*iter_io), base); > >> + void __iomem *addr; > >> + > >> +retry: > >> + while (i >= iter_io->cache.end) { > >> + iter_io->cache.sg = iter_io->cache.sg ? 
> >> + sg_next(iter_io->cache.sg) : iter_io->st->sgl; > >> + iter_io->cache.i = iter_io->cache.end; > >> + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >> > >> + PAGE_SHIFT; > >> + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) - > >> + iter_io->start; > >> + } > >> + > >> + if (i < iter_io->cache.i) { > >> + iter_io->cache.end = 0; > >> + iter_io->cache.sg = NULL; > >> + goto retry; > >> + } > >> + > >> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs + > >> + (((resource_size_t)i - iter_io->cache.i) > >> + << PAGE_SHIFT)); > >> + dma_buf_map_set_vaddr_iomem(dmap, addr); > >> +} > >> + > >> +static void ttm_kmap_iter_iomap_unmap_local(struct ttm_kmap_iter *iter, > >> + struct dma_buf_map *map) > >> +{ > >> + io_mapping_unmap_local(map->vaddr_iomem); > >> +} > >> + > >> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { > >> + .map_local = ttm_kmap_iter_iomap_map_local, > >> + .unmap_local = ttm_kmap_iter_iomap_unmap_local, > >> + .maps_tt = false, > >> +}; > >> + > >> +/** > >> + * ttm_kmap_iter_iomap_init - Initialize a struct ttm_kmap_iter_iomap > >> + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. > >> + * @iomap: The struct io_mapping representing the underlying linear io_memory. > >> + * @st: sg_table into @iomap, representing the memory of the struct > >> + * ttm_resource. > >> + * @start: Offset that needs to be subtracted from @st to make > >> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. > >> + * > >> + * Return: Pointer to the embedded struct ttm_kmap_iter. > >> + */ > >> +struct ttm_kmap_iter * > >> +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, > >> + struct io_mapping *iomap, > >> + struct sg_table *st, > >> + resource_size_t start) > >> +{ > >> + iter_io->base.ops = &ttm_kmap_iter_io_ops; > >> + iter_io->iomap = iomap; > >> + iter_io->st = st; > >> + iter_io->start = start; > >> + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); > >> + > >> + return &iter_io->base; > >> +} > >> +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); > >> + > >> +/** > >> + * DOC: Linear io iterator > >> + * > >> + * This code should die in the not too near future. Best would be if we could > >> + * make io-mapping use memremap for all io memory, and have memremap > >> + * implement a kmap_local functionality. We could then strip a huge amount of > >> + * code. These linear io iterators are implemented to mimic old functionality, > >> + * and they don't use kmap_local semantics at all internally. Rather ioremap or > >> + * friends, and at least on 32-bit they add global TLB flushes and points > >> + * of failure. 
> >> + */ > >> + > >> +static void ttm_kmap_iter_linear_io_map_local(struct ttm_kmap_iter *iter, > >> + struct dma_buf_map *dmap, > >> + pgoff_t i) > >> +{ > >> + struct ttm_kmap_iter_linear_io *iter_io = > >> + container_of(iter, typeof(*iter_io), base); > >> + > >> + *dmap = iter_io->dmap; > >> + dma_buf_map_incr(dmap, i * PAGE_SIZE); > >> +} > >> + > >> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_linear_io_ops = { > >> + .map_local = ttm_kmap_iter_linear_io_map_local, > >> + .maps_tt = false, > >> +}; > >> + > >> +struct ttm_kmap_iter * > >> +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io *iter_io, > >> + struct ttm_device *bdev, > >> + struct ttm_resource *mem) > >> +{ > >> + int ret; > >> + > >> + ret = ttm_mem_io_reserve(bdev, mem); > >> + if (ret) > >> + goto out_err; > >> + if (!mem->bus.is_iomem) { > >> + ret = -EINVAL; > >> + goto out_io_free; > >> + } > >> + > >> + if (mem->bus.addr) { > >> + dma_buf_map_set_vaddr(&iter_io->dmap, mem->bus.addr); > >> + iter_io->needs_unmap = false; > >> + } else { > >> + size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; > >> + > >> + iter_io->needs_unmap = true; > >> + if (mem->bus.caching == ttm_write_combined) > >> + dma_buf_map_set_vaddr_iomem(&iter_io->dmap, > >> + ioremap_wc(mem->bus.offset, > >> + bus_size)); > >> + else if (mem->bus.caching == ttm_cached) > >> + dma_buf_map_set_vaddr(&iter_io->dmap, > >> + memremap(mem->bus.offset, bus_size, > >> + MEMREMAP_WB)); > > The comments in set_vaddr suggest that this is meant for > > system-memory. Does that actually matter or is it just about not > > losing the __iomem annotation on platforms where it matters? > > Yes, it's the latter. dma_buf_map() is relatively new and the author > probably didn't think about the case of cached iomem, which is used by, > for example, vmwgfx. > > > Apparently cached device local is a thing. Also should this not be > > wrapped in CONFIG_X86? > > Both dma_buf_map() and memremap are generic, I think, I guess memremap > would return NULL if it's not supported. It looks like memremap just wraps ioremap_cache, but since it also discards the __iomem annotation should we be doing that universally? Also not sure if ioremap_cache is universally supported, so wrapping in CONFIG_X86 and falling back to plain ioremap() might be needed? Or at least that looks like roughly what the previous code was doing? Not too sure tbh. > > /Thomas > >
On Tue, 2021-05-25 at 10:58 +0100, Matthew Auld wrote: > On Tue, 25 May 2021 at 10:32, Thomas Hellström > <thomas.hellstrom@linux.intel.com> wrote: > > > > > > On 5/25/21 11:18 AM, Matthew Auld wrote: > > > On Fri, 21 May 2021 at 16:33, Thomas Hellström > > > <thomas.hellstrom@linux.intel.com> wrote: > > > > The internal ttm_bo_util memcpy uses ioremap functionality, and > > > > while it > > > > probably might be possible to use it for copying in- and out of > > > > sglist represented io memory, using io_mem_reserve() / > > > > io_mem_free() > > > > callbacks, that would cause problems with fault(). > > > > Instead, implement a method mapping page-by-page using > > > > kmap_local() > > > > semantics. As an additional benefit we then avoid the > > > > occasional global > > > > TLB flushes of ioremap() and consuming ioremap space, > > > > elimination of a > > > > critical point of failure and with a slight change of semantics > > > > we could > > > > also push the memcpy out async for testing and async driver > > > > development > > > > purposes. > > > > > > > > A special linear iomem iterator is introduced internally to > > > > mimic the > > > > old ioremap behaviour for code-paths that can't immediately be > > > > ported > > > > over. This adds to the code size and should be considered a > > > > temporary > > > > solution. > > > > > > > > Looking at the code we have a lot of checks for iomap tagged > > > > pointers. > > > > Ideally we should extend the core memremap functions to also > > > > accept > > > > uncached memory and kmap_local functionality. Then we could > > > > strip a > > > > lot of code. > > > > > > > > Cc: Christian König <christian.koenig@amd.com> > > > > Signed-off-by: Thomas Hellström < > > > > thomas.hellstrom@linux.intel.com> > > > > --- > > > > v3: > > > > - Split up in various TTM files and addressed review comments > > > > by > > > > Christian König. Tested and fixed legacy iomap memcpy path > > > > on i915. > > > > --- > > > > drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++---------- > > > > --------- > > > > drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ > > > > drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ > > > > drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ > > > > include/drm/ttm/ttm_bo_driver.h | 28 +++ > > > > include/drm/ttm/ttm_caching.h | 2 + > > > > include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ > > > > include/drm/ttm/ttm_resource.h | 61 +++++++ > > > > include/drm/ttm/ttm_tt.h | 16 ++ > > > > 9 files changed, 508 insertions(+), 181 deletions(-) > > > > create mode 100644 include/drm/ttm/ttm_kmap_iter.h > > > > > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > > b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > > index ae8b61460724..912cbe8e60a2 100644 > > > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > > @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device > > > > *bdev, > > > > mem->bus.addr = NULL; > > > > } > > > > > > > > -static int ttm_resource_ioremap(struct ttm_device *bdev, > > > > - struct ttm_resource *mem, > > > > - void **virtual) > > > > +/** > > > > + * ttm_move_memcpy - Helper to perform a memcpy ttm move > > > > operation. > > > > + * @bo: The struct ttm_buffer_object. > > > > + * @new_mem: The struct ttm_resource we're moving to (copy > > > > destination). > > > > + * @new_iter: A struct ttm_kmap_iter representing the > > > > destination resource. > > > > + * @src_iter: A struct ttm_kmap_iter representing the source > > > > resource. 
> > > > + * > > > > + * This function is intended to be able to move out async > > > > under a > > > > + * dma-fence if desired. > > > > + */ > > > > +void ttm_move_memcpy(struct ttm_buffer_object *bo, > > > > + struct ttm_resource *dst_mem, > > > > + struct ttm_kmap_iter *dst_iter, > > > > + struct ttm_kmap_iter *src_iter) > > > > { > > > > - int ret; > > > > - void *addr; > > > > - > > > > - *virtual = NULL; > > > > - ret = ttm_mem_io_reserve(bdev, mem); > > > > - if (ret || !mem->bus.is_iomem) > > > > - return ret; > > > > + const struct ttm_kmap_iter_ops *dst_ops = dst_iter- > > > > >ops; > > > > + const struct ttm_kmap_iter_ops *src_ops = src_iter- > > > > >ops; > > > > + struct ttm_tt *ttm = bo->ttm; > > > > + struct dma_buf_map src_map, dst_map; > > > > + pgoff_t i; > > > > > > > > - if (mem->bus.addr) { > > > > - addr = mem->bus.addr; > > > > - } else { > > > > - size_t bus_size = (size_t)mem->num_pages << > > > > PAGE_SHIFT; > > > > + /* Single TTM move. NOP */ > > > > + if (dst_ops->maps_tt && src_ops->maps_tt) > > > > + return; > > > > > > > > - if (mem->bus.caching == ttm_write_combined) > > > > - addr = ioremap_wc(mem->bus.offset, > > > > bus_size); > > > > -#ifdef CONFIG_X86 > > > > - else if (mem->bus.caching == ttm_cached) > > > > - addr = ioremap_cache(mem->bus.offset, > > > > bus_size); > > > > -#endif > > > > - else > > > > - addr = ioremap(mem->bus.offset, > > > > bus_size); > > > > - if (!addr) { > > > > - ttm_mem_io_free(bdev, mem); > > > > - return -ENOMEM; > > > > + /* Don't move nonexistent data. Clear destination > > > > instead. */ > > > > + if (src_ops->maps_tt && (!ttm || > > > > !ttm_tt_is_populated(ttm))) { > > > > + if (ttm && !(ttm->page_flags & > > > > TTM_PAGE_FLAG_ZERO_ALLOC)) > > > > + return; > > > > + > > > > + for (i = 0; i < dst_mem->num_pages; ++i) { > > > > + dst_ops->map_local(dst_iter, &dst_map, > > > > i); > > > > + if (dst_map.is_iomem) > > > > + memset_io(dst_map.vaddr_iomem, > > > > 0, PAGE_SIZE); > > > > + else > > > > + memset(dst_map.vaddr, 0, > > > > PAGE_SIZE); > > > > + if (dst_ops->unmap_local) > > > > + dst_ops->unmap_local(dst_iter, > > > > &dst_map); > > > > } > > > > + return; > > > > } > > > > - *virtual = addr; > > > > - return 0; > > > > -} > > > > - > > > > -static void ttm_resource_iounmap(struct ttm_device *bdev, > > > > - struct ttm_resource *mem, > > > > - void *virtual) > > > > -{ > > > > - if (virtual && mem->bus.addr == NULL) > > > > - iounmap(virtual); > > > > - ttm_mem_io_free(bdev, mem); > > > > -} > > > > - > > > > -static int ttm_copy_io_page(void *dst, void *src, unsigned > > > > long page) > > > > -{ > > > > - uint32_t *dstP = > > > > - (uint32_t *) ((unsigned long)dst + (page << > > > > PAGE_SHIFT)); > > > > - uint32_t *srcP = > > > > - (uint32_t *) ((unsigned long)src + (page << > > > > PAGE_SHIFT)); > > > > - > > > > - int i; > > > > - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) > > > > - iowrite32(ioread32(srcP++), dstP++); > > > > - return 0; > > > > -} > > > > - > > > > -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, > > > > - unsigned long page, > > > > - pgprot_t prot) > > > > -{ > > > > - struct page *d = ttm->pages[page]; > > > > - void *dst; > > > > - > > > > - if (!d) > > > > - return -ENOMEM; > > > > - > > > > - src = (void *)((unsigned long)src + (page << > > > > PAGE_SHIFT)); > > > > - dst = kmap_atomic_prot(d, prot); > > > > - if (!dst) > > > > - return -ENOMEM; > > > > - > > > > - memcpy_fromio(dst, src, PAGE_SIZE); > > > > - > > > > - kunmap_atomic(dst); > > > > - > > > > 
- return 0; > > > > -} > > > > - > > > > -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, > > > > - unsigned long page, > > > > - pgprot_t prot) > > > > -{ > > > > - struct page *s = ttm->pages[page]; > > > > - void *src; > > > > - > > > > - if (!s) > > > > - return -ENOMEM; > > > > - > > > > - dst = (void *)((unsigned long)dst + (page << > > > > PAGE_SHIFT)); > > > > - src = kmap_atomic_prot(s, prot); > > > > - if (!src) > > > > - return -ENOMEM; > > > > > > > > - memcpy_toio(dst, src, PAGE_SIZE); > > > > - > > > > - kunmap_atomic(src); > > > > + for (i = 0; i < dst_mem->num_pages; ++i) { > > > > + dst_ops->map_local(dst_iter, &dst_map, i); > > > > + src_ops->map_local(src_iter, &src_map, i); > > > > + > > > > + if (!src_map.is_iomem && !dst_map.is_iomem) { > > > > + memcpy(dst_map.vaddr, src_map.vaddr, > > > > PAGE_SIZE); > > > > + } else if (!src_map.is_iomem) { > > > > + dma_buf_map_memcpy_to(&dst_map, > > > > src_map.vaddr, > > > > + PAGE_SIZE); > > > > + } else if (!dst_map.is_iomem) { > > > > + memcpy_fromio(dst_map.vaddr, > > > > src_map.vaddr_iomem, > > > > + PAGE_SIZE); > > > > + } else { > > > > + int j; > > > > + u32 __iomem *src = src_map.vaddr_iomem; > > > > + u32 __iomem *dst = dst_map.vaddr_iomem; > > > > > > > > - return 0; > > > > + for (j = 0; j < (PAGE_SIZE >> 2); ++j) > > > IMO PAGE_SIZE / sizeof(u32) is easier to understand. > > > > OK, will fix. > > > > > > > > > > > + iowrite32(ioread32(src++), > > > > dst++); > > > > + } > > > > + if (src_ops->unmap_local) > > > > + src_ops->unmap_local(src_iter, > > > > &src_map); > > > > + if (dst_ops->unmap_local) > > > > + dst_ops->unmap_local(dst_iter, > > > > &dst_map); > > > > + } > > > > } > > > > +EXPORT_SYMBOL(ttm_move_memcpy); > > > > > > > > int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, > > > > struct ttm_operation_ctx *ctx, > > > > - struct ttm_resource *new_mem) > > > > + struct ttm_resource *dst_mem) > > > > { > > > > struct ttm_device *bdev = bo->bdev; > > > > - struct ttm_resource_manager *man = > > > > ttm_manager_type(bdev, new_mem->mem_type); > > > > + struct ttm_resource_manager *dst_man = > > > > + ttm_manager_type(bo->bdev, dst_mem->mem_type); > > > > struct ttm_tt *ttm = bo->ttm; > > > > - struct ttm_resource *old_mem = &bo->mem; > > > > - struct ttm_resource old_copy = *old_mem; > > > > - void *old_iomap; > > > > - void *new_iomap; > > > > + struct ttm_resource *src_mem = &bo->mem; > > > > + struct ttm_resource_manager *src_man = > > > > + ttm_manager_type(bdev, src_mem->mem_type); > > > > + struct ttm_resource src_copy = *src_mem; > > > > + union { > > > > + struct ttm_kmap_iter_tt tt; > > > > + struct ttm_kmap_iter_linear_io io; > > > > + } _dst_iter, _src_iter; > > > > + struct ttm_kmap_iter *dst_iter, *src_iter; > > > > int ret; > > > > - unsigned long i; > > > > > > > > - ret = ttm_bo_wait_ctx(bo, ctx); > > > > - if (ret) > > > > - return ret; > > > > - > > > > - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); > > > > - if (ret) > > > > - return ret; > > > > - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); > > > > - if (ret) > > > > - goto out; > > > > - > > > > - /* > > > > - * Single TTM move. NOP. > > > > - */ > > > > - if (old_iomap == NULL && new_iomap == NULL) > > > > - goto out2; > > > > - > > > > - /* > > > > - * Don't move nonexistent data. Clear destination > > > > instead. 
> > > > - */ > > > > - if (old_iomap == NULL && > > > > - (ttm == NULL || (!ttm_tt_is_populated(ttm) && > > > > - !(ttm->page_flags & > > > > TTM_PAGE_FLAG_SWAPPED)))) { > > > > - memset_io(new_iomap, 0, new_mem- > > > > >num_pages*PAGE_SIZE); > > > > - goto out2; > > > > - } > > > > - > > > > - /* > > > > - * TTM might be null for moves within the same region. > > > > - */ > > > > - if (ttm) { > > > > + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) > > > > || > > > > + dst_man->use_tt)) { > > > > ret = ttm_tt_populate(bdev, ttm, ctx); > > > > if (ret) > > > > - goto out1; > > > > + return ret; > > > > } > > > > > > > > - for (i = 0; i < new_mem->num_pages; ++i) { > > > > - if (old_iomap == NULL) { > > > > - pgprot_t prot = ttm_io_prot(bo, > > > > old_mem, PAGE_KERNEL); > > > > - ret = ttm_copy_ttm_io_page(ttm, > > > > new_iomap, i, > > > > - prot); > > > > - } else if (new_iomap == NULL) { > > > > - pgprot_t prot = ttm_io_prot(bo, > > > > new_mem, PAGE_KERNEL); > > > > - ret = ttm_copy_io_ttm_page(ttm, > > > > old_iomap, i, > > > > - prot); > > > > - } else { > > > > - ret = ttm_copy_io_page(new_iomap, > > > > old_iomap, i); > > > > - } > > > > - if (ret) > > > > - goto out1; > > > > + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, > > > > bdev, dst_mem); > > > > + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) > > > > + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, > > > > bo->ttm); > > > > + if (IS_ERR(dst_iter)) > > > > + return PTR_ERR(dst_iter); > > > > + > > > > + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, > > > > bdev, src_mem); > > > > + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) > > > > + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, > > > > bo->ttm); > > > > + if (IS_ERR(src_iter)) { > > > > + ret = PTR_ERR(src_iter); > > > > + goto out_src_iter; > > > > } > > > > - mb(); > > > > -out2: > > > > - old_copy = *old_mem; > > > > > > > > - ttm_bo_assign_mem(bo, new_mem); > > > > - > > > > - if (!man->use_tt) > > > > - ttm_bo_tt_destroy(bo); > > > > + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); > > > > + src_copy = *src_mem; > > > > + ttm_bo_move_sync_cleanup(bo, dst_mem); > > > > > > > > -out1: > > > > - ttm_resource_iounmap(bdev, old_mem, new_iomap); > > > > -out: > > > > - ttm_resource_iounmap(bdev, &old_copy, old_iomap); > > > > + if (!src_iter->ops->maps_tt) > > > > + ttm_kmap_iter_linear_io_fini(&_src_iter.io, > > > > bdev, &src_copy); > > > > +out_src_iter: > > > > + if (!dst_iter->ops->maps_tt) > > > > + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, > > > > bdev, dst_mem); > > > > > > > > - /* > > > > - * On error, keep the mm node! > > > > - */ > > > > - if (!ret) > > > > - ttm_resource_free(bo, &old_copy); > > > > return ret; > > > > } > > > > EXPORT_SYMBOL(ttm_bo_move_memcpy); > > > > @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct > > > > ttm_buffer_object *bo, struct ttm_resource *res, > > > > man = ttm_manager_type(bo->bdev, res->mem_type); > > > > caching = man->use_tt ? 
bo->ttm->caching : res- > > > > >bus.caching; > > > > > > > > - /* Cached mappings need no adjustment */ > > > > - if (caching == ttm_cached) > > > > - return tmp; > > > > - > > > > -#if defined(__i386__) || defined(__x86_64__) > > > > - if (caching == ttm_write_combined) > > > > - tmp = pgprot_writecombine(tmp); > > > > - else if (boot_cpu_data.x86 > 3) > > > > - tmp = pgprot_noncached(tmp); > > > > -#endif > > > > -#if defined(__ia64__) || defined(__arm__) || > > > > defined(__aarch64__) || \ > > > > - defined(__powerpc__) || defined(__mips__) > > > > - if (caching == ttm_write_combined) > > > > - tmp = pgprot_writecombine(tmp); > > > > - else > > > > - tmp = pgprot_noncached(tmp); > > > > -#endif > > > > -#if defined(__sparc__) > > > > - tmp = pgprot_noncached(tmp); > > > > -#endif > > > > - return tmp; > > > > + return ttm_prot_from_caching(caching, tmp); > > > > } > > > > EXPORT_SYMBOL(ttm_io_prot); > > > > > > > > diff --git a/drivers/gpu/drm/ttm/ttm_module.c > > > > b/drivers/gpu/drm/ttm/ttm_module.c > > > > index 56b0efdba1a9..997c458f68a9 100644 > > > > --- a/drivers/gpu/drm/ttm/ttm_module.c > > > > +++ b/drivers/gpu/drm/ttm/ttm_module.c > > > > @@ -31,12 +31,47 @@ > > > > */ > > > > #include <linux/module.h> > > > > #include <linux/device.h> > > > > +#include <linux/pgtable.h> > > > > #include <linux/sched.h> > > > > #include <linux/debugfs.h> > > > > #include <drm/drm_sysfs.h> > > > > +#include <drm/ttm/ttm_caching.h> > > > > > > > > #include "ttm_module.h" > > > > > > > > +/** > > > > + * ttm_prot_from_caching - Modify the page protection > > > > according to the > > > > + * ttm cacing mode > > > > + * @caching: The ttm caching mode > > > > + * @tmp: The original page protection > > > > + * > > > > + * Return: The modified page protection > > > > + */ > > > > +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, > > > > pgprot_t tmp) > > > > +{ > > > > + /* Cached mappings need no adjustment */ > > > > + if (caching == ttm_cached) > > > > + return tmp; > > > > + > > > > +#if defined(__i386__) || defined(__x86_64__) > > > > + if (caching == ttm_write_combined) > > > > + tmp = pgprot_writecombine(tmp); > > > > + else if (boot_cpu_data.x86 > 3) > > > > + tmp = pgprot_noncached(tmp); > > > > +#endif > > > > +#if defined(__ia64__) || defined(__arm__) || > > > > defined(__aarch64__) || \ > > > > + defined(__powerpc__) || defined(__mips__) > > > > + if (caching == ttm_write_combined) > > > > + tmp = pgprot_writecombine(tmp); > > > > + else > > > > + tmp = pgprot_noncached(tmp); > > > > +#endif > > > > +#if defined(__sparc__) > > > > + tmp = pgprot_noncached(tmp); > > > > +#endif > > > > + return tmp; > > > > +} > > > > + > > > > struct dentry *ttm_debugfs_root; > > > > > > > > static int __init ttm_init(void) > > > > diff --git a/drivers/gpu/drm/ttm/ttm_resource.c > > > > b/drivers/gpu/drm/ttm/ttm_resource.c > > > > index 59e2b7157e41..e05ae7e3d477 100644 > > > > --- a/drivers/gpu/drm/ttm/ttm_resource.c > > > > +++ b/drivers/gpu/drm/ttm/ttm_resource.c > > > > @@ -22,6 +22,10 @@ > > > > * Authors: Christian König > > > > */ > > > > > > > > +#include <linux/dma-buf-map.h> > > > > +#include <linux/io-mapping.h> > > > > +#include <linux/scatterlist.h> > > > > + > > > > #include <drm/ttm/ttm_resource.h> > > > > #include <drm/ttm/ttm_bo_driver.h> > > > > > > > > @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct > > > > ttm_resource_manager *man, > > > > man->func->debug(man, p); > > > > } > > > > EXPORT_SYMBOL(ttm_resource_manager_debug); > > > > + > > > > +static void 
ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter > > > > *iter, > > > > + struct dma_buf_map > > > > *dmap, > > > > + pgoff_t i) > > > > +{ > > > > + struct ttm_kmap_iter_iomap *iter_io = > > > > + container_of(iter, typeof(*iter_io), base); > > > > + void __iomem *addr; > > > > + > > > > +retry: > > > > + while (i >= iter_io->cache.end) { > > > > + iter_io->cache.sg = iter_io->cache.sg ? > > > > + sg_next(iter_io->cache.sg) : iter_io- > > > > >st->sgl; > > > > + iter_io->cache.i = iter_io->cache.end; > > > > + iter_io->cache.end += sg_dma_len(iter_io- > > > > >cache.sg) >> > > > > + PAGE_SHIFT; > > > > + iter_io->cache.offs = sg_dma_address(iter_io- > > > > >cache.sg) - > > > > + iter_io->start; > > > > + } > > > > + > > > > + if (i < iter_io->cache.i) { > > > > + iter_io->cache.end = 0; > > > > + iter_io->cache.sg = NULL; > > > > + goto retry; > > > > + } > > > > + > > > > + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io- > > > > >cache.offs + > > > > + (((resource_size_t)i - > > > > iter_io->cache.i) > > > > + << PAGE_SHIFT)); > > > > + dma_buf_map_set_vaddr_iomem(dmap, addr); > > > > +} > > > > + > > > > +static void ttm_kmap_iter_iomap_unmap_local(struct > > > > ttm_kmap_iter *iter, > > > > + struct dma_buf_map > > > > *map) > > > > +{ > > > > + io_mapping_unmap_local(map->vaddr_iomem); > > > > +} > > > > + > > > > +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { > > > > + .map_local = ttm_kmap_iter_iomap_map_local, > > > > + .unmap_local = ttm_kmap_iter_iomap_unmap_local, > > > > + .maps_tt = false, > > > > +}; > > > > + > > > > +/** > > > > + * ttm_kmap_iter_iomap_init - Initialize a struct > > > > ttm_kmap_iter_iomap > > > > + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. > > > > + * @iomap: The struct io_mapping representing the underlying > > > > linear io_memory. > > > > + * @st: sg_table into @iomap, representing the memory of the > > > > struct > > > > + * ttm_resource. > > > > + * @start: Offset that needs to be subtracted from @st to make > > > > + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. > > > > + * > > > > + * Return: Pointer to the embedded struct ttm_kmap_iter. > > > > + */ > > > > +struct ttm_kmap_iter * > > > > +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, > > > > + struct io_mapping *iomap, > > > > + struct sg_table *st, > > > > + resource_size_t start) > > > > +{ > > > > + iter_io->base.ops = &ttm_kmap_iter_io_ops; > > > > + iter_io->iomap = iomap; > > > > + iter_io->st = st; > > > > + iter_io->start = start; > > > > + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); > > > > + > > > > + return &iter_io->base; > > > > +} > > > > +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); > > > > + > > > > +/** > > > > + * DOC: Linear io iterator > > > > + * > > > > + * This code should die in the not too near future. Best would > > > > be if we could > > > > + * make io-mapping use memremap for all io memory, and have > > > > memremap > > > > + * implement a kmap_local functionality. We could then strip a > > > > huge amount of > > > > + * code. These linear io iterators are implemented to mimic > > > > old functionality, > > > > + * and they don't use kmap_local semantics at all internally. > > > > Rather ioremap or > > > > + * friends, and at least on 32-bit they add global TLB flushes > > > > and points > > > > + * of failure. 
> > > > + */ > > > > + > > > > +static void ttm_kmap_iter_linear_io_map_local(struct > > > > ttm_kmap_iter *iter, > > > > + struct > > > > dma_buf_map *dmap, > > > > + pgoff_t i) > > > > +{ > > > > + struct ttm_kmap_iter_linear_io *iter_io = > > > > + container_of(iter, typeof(*iter_io), base); > > > > + > > > > + *dmap = iter_io->dmap; > > > > + dma_buf_map_incr(dmap, i * PAGE_SIZE); > > > > +} > > > > + > > > > +static const struct ttm_kmap_iter_ops > > > > ttm_kmap_iter_linear_io_ops = { > > > > + .map_local = ttm_kmap_iter_linear_io_map_local, > > > > + .maps_tt = false, > > > > +}; > > > > + > > > > +struct ttm_kmap_iter * > > > > +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io > > > > *iter_io, > > > > + struct ttm_device *bdev, > > > > + struct ttm_resource *mem) > > > > +{ > > > > + int ret; > > > > + > > > > + ret = ttm_mem_io_reserve(bdev, mem); > > > > + if (ret) > > > > + goto out_err; > > > > + if (!mem->bus.is_iomem) { > > > > + ret = -EINVAL; > > > > + goto out_io_free; > > > > + } > > > > + > > > > + if (mem->bus.addr) { > > > > + dma_buf_map_set_vaddr(&iter_io->dmap, mem- > > > > >bus.addr); > > > > + iter_io->needs_unmap = false; > > > > + } else { > > > > + size_t bus_size = (size_t)mem->num_pages << > > > > PAGE_SHIFT; > > > > + > > > > + iter_io->needs_unmap = true; > > > > + if (mem->bus.caching == ttm_write_combined) > > > > + dma_buf_map_set_vaddr_iomem(&iter_io- > > > > >dmap, > > > > + > > > > ioremap_wc(mem->bus.offset, > > > > + > > > > bus_size)); > > > > + else if (mem->bus.caching == ttm_cached) > > > > + dma_buf_map_set_vaddr(&iter_io->dmap, > > > > + memremap(mem- > > > > >bus.offset, bus_size, > > > > + > > > > MEMREMAP_WB)); > > > The comments in set_vaddr suggest that this is meant for > > > system-memory. Does that actually matter or is it just about not > > > losing the __iomem annotation on platforms where it matters? > > > > Yes, it's the latter. dma_buf_map() is relatively new and the > > author > > probably didn't think about the case of cached iomem, which is used > > by, > > for example, vmwgfx. > > > > > Apparently cached device local is a thing. Also should this not > > > be > > > wrapped in CONFIG_X86? > > > > Both dma_buf_map() and memremap are generic, I think, I guess > > memremap > > would return NULL if it's not supported. > > It looks like memremap just wraps ioremap_cache, but since it also > discards the __iomem annotation should we be doing that universally? > Also not sure if ioremap_cache is universally supported, so wrapping > in CONFIG_X86 and falling back to plain ioremap() might be needed? Or > at least that looks like roughly what the previous code was doing? > Not > too sure tbh. > I think the long term goal is to use memremap all over the place, to just not have to bother with the __iomem annotation. But to do that io- mapping.h needs to support memremap. But for now we need to be strict about __iomem unless we're in arch specific code. That's why that dma_buf_map thing was created, but TTM memcpy was never fully adapted. As for limited arch support for memremap cached, It looks like we only need to or in "backup" mapping modes in the memremap flags, and we'd mimic the previous behaviour. /Thomas > > > > /Thomas > > > >
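Concretely, the "OR in backup mapping modes" idea could look like the fragment below (illustration only, not a posted hunk). memremap() tries the requested modes in WB, WT, WC order, so supplying all three lets the cached branch of ttm_kmap_iter_linear_io_init() degrade gracefully on architectures without a cacheable ioremap:

        } else if (mem->bus.caching == ttm_cached) {
                /* Ask for WB but let memremap() fall back to WT or WC
                 * where a cacheable mapping isn't supported, roughly
                 * mimicking the old ioremap-based behaviour. The
                 * returned pointer carries no __iomem tag.
                 */
                dma_buf_map_set_vaddr(&iter_io->dmap,
                                      memremap(mem->bus.offset, bus_size,
                                               MEMREMAP_WB | MEMREMAP_WT |
                                               MEMREMAP_WC));
        }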
Am 25.05.21 um 12:07 schrieb Thomas Hellström: > On Tue, 2021-05-25 at 10:58 +0100, Matthew Auld wrote: >> On Tue, 25 May 2021 at 10:32, Thomas Hellström >> <thomas.hellstrom@linux.intel.com> wrote: >>> >>> On 5/25/21 11:18 AM, Matthew Auld wrote: >>>> On Fri, 21 May 2021 at 16:33, Thomas Hellström >>>> <thomas.hellstrom@linux.intel.com> wrote: >>>>> The internal ttm_bo_util memcpy uses ioremap functionality, and >>>>> while it >>>>> probably might be possible to use it for copying in- and out of >>>>> sglist represented io memory, using io_mem_reserve() / >>>>> io_mem_free() >>>>> callbacks, that would cause problems with fault(). >>>>> Instead, implement a method mapping page-by-page using >>>>> kmap_local() >>>>> semantics. As an additional benefit we then avoid the >>>>> occasional global >>>>> TLB flushes of ioremap() and consuming ioremap space, >>>>> elimination of a >>>>> critical point of failure and with a slight change of semantics >>>>> we could >>>>> also push the memcpy out async for testing and async driver >>>>> development >>>>> purposes. >>>>> >>>>> A special linear iomem iterator is introduced internally to >>>>> mimic the >>>>> old ioremap behaviour for code-paths that can't immediately be >>>>> ported >>>>> over. This adds to the code size and should be considered a >>>>> temporary >>>>> solution. >>>>> >>>>> Looking at the code we have a lot of checks for iomap tagged >>>>> pointers. >>>>> Ideally we should extend the core memremap functions to also >>>>> accept >>>>> uncached memory and kmap_local functionality. Then we could >>>>> strip a >>>>> lot of code. >>>>> >>>>> Cc: Christian König <christian.koenig@amd.com> >>>>> Signed-off-by: Thomas Hellström < >>>>> thomas.hellstrom@linux.intel.com> >>>>> --- >>>>> v3: >>>>> - Split up in various TTM files and addressed review comments >>>>> by >>>>> Christian König. Tested and fixed legacy iomap memcpy path >>>>> on i915. >>>>> --- >>>>> drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++---------- >>>>> --------- >>>>> drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ >>>>> drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ >>>>> drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ >>>>> include/drm/ttm/ttm_bo_driver.h | 28 +++ >>>>> include/drm/ttm/ttm_caching.h | 2 + >>>>> include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ >>>>> include/drm/ttm/ttm_resource.h | 61 +++++++ >>>>> include/drm/ttm/ttm_tt.h | 16 ++ >>>>> 9 files changed, 508 insertions(+), 181 deletions(-) >>>>> create mode 100644 include/drm/ttm/ttm_kmap_iter.h >>>>> >>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>> b/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>> index ae8b61460724..912cbe8e60a2 100644 >>>>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>> @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device >>>>> *bdev, >>>>> mem->bus.addr = NULL; >>>>> } >>>>> >>>>> -static int ttm_resource_ioremap(struct ttm_device *bdev, >>>>> - struct ttm_resource *mem, >>>>> - void **virtual) >>>>> +/** >>>>> + * ttm_move_memcpy - Helper to perform a memcpy ttm move >>>>> operation. >>>>> + * @bo: The struct ttm_buffer_object. >>>>> + * @new_mem: The struct ttm_resource we're moving to (copy >>>>> destination). >>>>> + * @new_iter: A struct ttm_kmap_iter representing the >>>>> destination resource. >>>>> + * @src_iter: A struct ttm_kmap_iter representing the source >>>>> resource. >>>>> + * >>>>> + * This function is intended to be able to move out async >>>>> under a >>>>> + * dma-fence if desired. 
>>>>> + */ >>>>> +void ttm_move_memcpy(struct ttm_buffer_object *bo, >>>>> + struct ttm_resource *dst_mem, >>>>> + struct ttm_kmap_iter *dst_iter, >>>>> + struct ttm_kmap_iter *src_iter) >>>>> { >>>>> - int ret; >>>>> - void *addr; >>>>> - >>>>> - *virtual = NULL; >>>>> - ret = ttm_mem_io_reserve(bdev, mem); >>>>> - if (ret || !mem->bus.is_iomem) >>>>> - return ret; >>>>> + const struct ttm_kmap_iter_ops *dst_ops = dst_iter- >>>>>> ops; >>>>> + const struct ttm_kmap_iter_ops *src_ops = src_iter- >>>>>> ops; >>>>> + struct ttm_tt *ttm = bo->ttm; >>>>> + struct dma_buf_map src_map, dst_map; >>>>> + pgoff_t i; >>>>> >>>>> - if (mem->bus.addr) { >>>>> - addr = mem->bus.addr; >>>>> - } else { >>>>> - size_t bus_size = (size_t)mem->num_pages << >>>>> PAGE_SHIFT; >>>>> + /* Single TTM move. NOP */ >>>>> + if (dst_ops->maps_tt && src_ops->maps_tt) >>>>> + return; >>>>> >>>>> - if (mem->bus.caching == ttm_write_combined) >>>>> - addr = ioremap_wc(mem->bus.offset, >>>>> bus_size); >>>>> -#ifdef CONFIG_X86 >>>>> - else if (mem->bus.caching == ttm_cached) >>>>> - addr = ioremap_cache(mem->bus.offset, >>>>> bus_size); >>>>> -#endif >>>>> - else >>>>> - addr = ioremap(mem->bus.offset, >>>>> bus_size); >>>>> - if (!addr) { >>>>> - ttm_mem_io_free(bdev, mem); >>>>> - return -ENOMEM; >>>>> + /* Don't move nonexistent data. Clear destination >>>>> instead. */ >>>>> + if (src_ops->maps_tt && (!ttm || >>>>> !ttm_tt_is_populated(ttm))) { >>>>> + if (ttm && !(ttm->page_flags & >>>>> TTM_PAGE_FLAG_ZERO_ALLOC)) >>>>> + return; >>>>> + >>>>> + for (i = 0; i < dst_mem->num_pages; ++i) { >>>>> + dst_ops->map_local(dst_iter, &dst_map, >>>>> i); >>>>> + if (dst_map.is_iomem) >>>>> + memset_io(dst_map.vaddr_iomem, >>>>> 0, PAGE_SIZE); >>>>> + else >>>>> + memset(dst_map.vaddr, 0, >>>>> PAGE_SIZE); >>>>> + if (dst_ops->unmap_local) >>>>> + dst_ops->unmap_local(dst_iter, >>>>> &dst_map); >>>>> } >>>>> + return; >>>>> } >>>>> - *virtual = addr; >>>>> - return 0; >>>>> -} >>>>> - >>>>> -static void ttm_resource_iounmap(struct ttm_device *bdev, >>>>> - struct ttm_resource *mem, >>>>> - void *virtual) >>>>> -{ >>>>> - if (virtual && mem->bus.addr == NULL) >>>>> - iounmap(virtual); >>>>> - ttm_mem_io_free(bdev, mem); >>>>> -} >>>>> - >>>>> -static int ttm_copy_io_page(void *dst, void *src, unsigned >>>>> long page) >>>>> -{ >>>>> - uint32_t *dstP = >>>>> - (uint32_t *) ((unsigned long)dst + (page << >>>>> PAGE_SHIFT)); >>>>> - uint32_t *srcP = >>>>> - (uint32_t *) ((unsigned long)src + (page << >>>>> PAGE_SHIFT)); >>>>> - >>>>> - int i; >>>>> - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) >>>>> - iowrite32(ioread32(srcP++), dstP++); >>>>> - return 0; >>>>> -} >>>>> - >>>>> -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, >>>>> - unsigned long page, >>>>> - pgprot_t prot) >>>>> -{ >>>>> - struct page *d = ttm->pages[page]; >>>>> - void *dst; >>>>> - >>>>> - if (!d) >>>>> - return -ENOMEM; >>>>> - >>>>> - src = (void *)((unsigned long)src + (page << >>>>> PAGE_SHIFT)); >>>>> - dst = kmap_atomic_prot(d, prot); >>>>> - if (!dst) >>>>> - return -ENOMEM; >>>>> - >>>>> - memcpy_fromio(dst, src, PAGE_SIZE); >>>>> - >>>>> - kunmap_atomic(dst); >>>>> - >>>>> - return 0; >>>>> -} >>>>> - >>>>> -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, >>>>> - unsigned long page, >>>>> - pgprot_t prot) >>>>> -{ >>>>> - struct page *s = ttm->pages[page]; >>>>> - void *src; >>>>> - >>>>> - if (!s) >>>>> - return -ENOMEM; >>>>> - >>>>> - dst = (void *)((unsigned long)dst + (page << >>>>> PAGE_SHIFT)); >>>>> - src = 
kmap_atomic_prot(s, prot); >>>>> - if (!src) >>>>> - return -ENOMEM; >>>>> >>>>> - memcpy_toio(dst, src, PAGE_SIZE); >>>>> - >>>>> - kunmap_atomic(src); >>>>> + for (i = 0; i < dst_mem->num_pages; ++i) { >>>>> + dst_ops->map_local(dst_iter, &dst_map, i); >>>>> + src_ops->map_local(src_iter, &src_map, i); >>>>> + >>>>> + if (!src_map.is_iomem && !dst_map.is_iomem) { >>>>> + memcpy(dst_map.vaddr, src_map.vaddr, >>>>> PAGE_SIZE); >>>>> + } else if (!src_map.is_iomem) { >>>>> + dma_buf_map_memcpy_to(&dst_map, >>>>> src_map.vaddr, >>>>> + PAGE_SIZE); >>>>> + } else if (!dst_map.is_iomem) { >>>>> + memcpy_fromio(dst_map.vaddr, >>>>> src_map.vaddr_iomem, >>>>> + PAGE_SIZE); >>>>> + } else { >>>>> + int j; >>>>> + u32 __iomem *src = src_map.vaddr_iomem; >>>>> + u32 __iomem *dst = dst_map.vaddr_iomem; >>>>> >>>>> - return 0; >>>>> + for (j = 0; j < (PAGE_SIZE >> 2); ++j) >>>> IMO PAGE_SIZE / sizeof(u32) is easier to understand. >>> OK, will fix. >>> >>> >>>>> + iowrite32(ioread32(src++), >>>>> dst++); >>>>> + } >>>>> + if (src_ops->unmap_local) >>>>> + src_ops->unmap_local(src_iter, >>>>> &src_map); >>>>> + if (dst_ops->unmap_local) >>>>> + dst_ops->unmap_local(dst_iter, >>>>> &dst_map); >>>>> + } >>>>> } >>>>> +EXPORT_SYMBOL(ttm_move_memcpy); >>>>> >>>>> int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, >>>>> struct ttm_operation_ctx *ctx, >>>>> - struct ttm_resource *new_mem) >>>>> + struct ttm_resource *dst_mem) >>>>> { >>>>> struct ttm_device *bdev = bo->bdev; >>>>> - struct ttm_resource_manager *man = >>>>> ttm_manager_type(bdev, new_mem->mem_type); >>>>> + struct ttm_resource_manager *dst_man = >>>>> + ttm_manager_type(bo->bdev, dst_mem->mem_type); >>>>> struct ttm_tt *ttm = bo->ttm; >>>>> - struct ttm_resource *old_mem = &bo->mem; >>>>> - struct ttm_resource old_copy = *old_mem; >>>>> - void *old_iomap; >>>>> - void *new_iomap; >>>>> + struct ttm_resource *src_mem = &bo->mem; >>>>> + struct ttm_resource_manager *src_man = >>>>> + ttm_manager_type(bdev, src_mem->mem_type); >>>>> + struct ttm_resource src_copy = *src_mem; >>>>> + union { >>>>> + struct ttm_kmap_iter_tt tt; >>>>> + struct ttm_kmap_iter_linear_io io; >>>>> + } _dst_iter, _src_iter; >>>>> + struct ttm_kmap_iter *dst_iter, *src_iter; >>>>> int ret; >>>>> - unsigned long i; >>>>> >>>>> - ret = ttm_bo_wait_ctx(bo, ctx); >>>>> - if (ret) >>>>> - return ret; >>>>> - >>>>> - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); >>>>> - if (ret) >>>>> - return ret; >>>>> - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); >>>>> - if (ret) >>>>> - goto out; >>>>> - >>>>> - /* >>>>> - * Single TTM move. NOP. >>>>> - */ >>>>> - if (old_iomap == NULL && new_iomap == NULL) >>>>> - goto out2; >>>>> - >>>>> - /* >>>>> - * Don't move nonexistent data. Clear destination >>>>> instead. >>>>> - */ >>>>> - if (old_iomap == NULL && >>>>> - (ttm == NULL || (!ttm_tt_is_populated(ttm) && >>>>> - !(ttm->page_flags & >>>>> TTM_PAGE_FLAG_SWAPPED)))) { >>>>> - memset_io(new_iomap, 0, new_mem- >>>>>> num_pages*PAGE_SIZE); >>>>> - goto out2; >>>>> - } >>>>> - >>>>> - /* >>>>> - * TTM might be null for moves within the same region. 
>>>>> - */ >>>>> - if (ttm) { >>>>> + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) >>>>> || >>>>> + dst_man->use_tt)) { >>>>> ret = ttm_tt_populate(bdev, ttm, ctx); >>>>> if (ret) >>>>> - goto out1; >>>>> + return ret; >>>>> } >>>>> >>>>> - for (i = 0; i < new_mem->num_pages; ++i) { >>>>> - if (old_iomap == NULL) { >>>>> - pgprot_t prot = ttm_io_prot(bo, >>>>> old_mem, PAGE_KERNEL); >>>>> - ret = ttm_copy_ttm_io_page(ttm, >>>>> new_iomap, i, >>>>> - prot); >>>>> - } else if (new_iomap == NULL) { >>>>> - pgprot_t prot = ttm_io_prot(bo, >>>>> new_mem, PAGE_KERNEL); >>>>> - ret = ttm_copy_io_ttm_page(ttm, >>>>> old_iomap, i, >>>>> - prot); >>>>> - } else { >>>>> - ret = ttm_copy_io_page(new_iomap, >>>>> old_iomap, i); >>>>> - } >>>>> - if (ret) >>>>> - goto out1; >>>>> + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, >>>>> bdev, dst_mem); >>>>> + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) >>>>> + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, >>>>> bo->ttm); >>>>> + if (IS_ERR(dst_iter)) >>>>> + return PTR_ERR(dst_iter); >>>>> + >>>>> + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, >>>>> bdev, src_mem); >>>>> + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) >>>>> + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, >>>>> bo->ttm); >>>>> + if (IS_ERR(src_iter)) { >>>>> + ret = PTR_ERR(src_iter); >>>>> + goto out_src_iter; >>>>> } >>>>> - mb(); >>>>> -out2: >>>>> - old_copy = *old_mem; >>>>> >>>>> - ttm_bo_assign_mem(bo, new_mem); >>>>> - >>>>> - if (!man->use_tt) >>>>> - ttm_bo_tt_destroy(bo); >>>>> + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); >>>>> + src_copy = *src_mem; >>>>> + ttm_bo_move_sync_cleanup(bo, dst_mem); >>>>> >>>>> -out1: >>>>> - ttm_resource_iounmap(bdev, old_mem, new_iomap); >>>>> -out: >>>>> - ttm_resource_iounmap(bdev, &old_copy, old_iomap); >>>>> + if (!src_iter->ops->maps_tt) >>>>> + ttm_kmap_iter_linear_io_fini(&_src_iter.io, >>>>> bdev, &src_copy); >>>>> +out_src_iter: >>>>> + if (!dst_iter->ops->maps_tt) >>>>> + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, >>>>> bdev, dst_mem); >>>>> >>>>> - /* >>>>> - * On error, keep the mm node! >>>>> - */ >>>>> - if (!ret) >>>>> - ttm_resource_free(bo, &old_copy); >>>>> return ret; >>>>> } >>>>> EXPORT_SYMBOL(ttm_bo_move_memcpy); >>>>> @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct >>>>> ttm_buffer_object *bo, struct ttm_resource *res, >>>>> man = ttm_manager_type(bo->bdev, res->mem_type); >>>>> caching = man->use_tt ? 
bo->ttm->caching : res- >>>>>> bus.caching; >>>>> - /* Cached mappings need no adjustment */ >>>>> - if (caching == ttm_cached) >>>>> - return tmp; >>>>> - >>>>> -#if defined(__i386__) || defined(__x86_64__) >>>>> - if (caching == ttm_write_combined) >>>>> - tmp = pgprot_writecombine(tmp); >>>>> - else if (boot_cpu_data.x86 > 3) >>>>> - tmp = pgprot_noncached(tmp); >>>>> -#endif >>>>> -#if defined(__ia64__) || defined(__arm__) || >>>>> defined(__aarch64__) || \ >>>>> - defined(__powerpc__) || defined(__mips__) >>>>> - if (caching == ttm_write_combined) >>>>> - tmp = pgprot_writecombine(tmp); >>>>> - else >>>>> - tmp = pgprot_noncached(tmp); >>>>> -#endif >>>>> -#if defined(__sparc__) >>>>> - tmp = pgprot_noncached(tmp); >>>>> -#endif >>>>> - return tmp; >>>>> + return ttm_prot_from_caching(caching, tmp); >>>>> } >>>>> EXPORT_SYMBOL(ttm_io_prot); >>>>> >>>>> diff --git a/drivers/gpu/drm/ttm/ttm_module.c >>>>> b/drivers/gpu/drm/ttm/ttm_module.c >>>>> index 56b0efdba1a9..997c458f68a9 100644 >>>>> --- a/drivers/gpu/drm/ttm/ttm_module.c >>>>> +++ b/drivers/gpu/drm/ttm/ttm_module.c >>>>> @@ -31,12 +31,47 @@ >>>>> */ >>>>> #include <linux/module.h> >>>>> #include <linux/device.h> >>>>> +#include <linux/pgtable.h> >>>>> #include <linux/sched.h> >>>>> #include <linux/debugfs.h> >>>>> #include <drm/drm_sysfs.h> >>>>> +#include <drm/ttm/ttm_caching.h> >>>>> >>>>> #include "ttm_module.h" >>>>> >>>>> +/** >>>>> + * ttm_prot_from_caching - Modify the page protection >>>>> according to the >>>>> + * ttm cacing mode >>>>> + * @caching: The ttm caching mode >>>>> + * @tmp: The original page protection >>>>> + * >>>>> + * Return: The modified page protection >>>>> + */ >>>>> +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, >>>>> pgprot_t tmp) >>>>> +{ >>>>> + /* Cached mappings need no adjustment */ >>>>> + if (caching == ttm_cached) >>>>> + return tmp; >>>>> + >>>>> +#if defined(__i386__) || defined(__x86_64__) >>>>> + if (caching == ttm_write_combined) >>>>> + tmp = pgprot_writecombine(tmp); >>>>> + else if (boot_cpu_data.x86 > 3) >>>>> + tmp = pgprot_noncached(tmp); >>>>> +#endif >>>>> +#if defined(__ia64__) || defined(__arm__) || >>>>> defined(__aarch64__) || \ >>>>> + defined(__powerpc__) || defined(__mips__) >>>>> + if (caching == ttm_write_combined) >>>>> + tmp = pgprot_writecombine(tmp); >>>>> + else >>>>> + tmp = pgprot_noncached(tmp); >>>>> +#endif >>>>> +#if defined(__sparc__) >>>>> + tmp = pgprot_noncached(tmp); >>>>> +#endif >>>>> + return tmp; >>>>> +} >>>>> + >>>>> struct dentry *ttm_debugfs_root; >>>>> >>>>> static int __init ttm_init(void) >>>>> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c >>>>> b/drivers/gpu/drm/ttm/ttm_resource.c >>>>> index 59e2b7157e41..e05ae7e3d477 100644 >>>>> --- a/drivers/gpu/drm/ttm/ttm_resource.c >>>>> +++ b/drivers/gpu/drm/ttm/ttm_resource.c >>>>> @@ -22,6 +22,10 @@ >>>>> * Authors: Christian König >>>>> */ >>>>> >>>>> +#include <linux/dma-buf-map.h> >>>>> +#include <linux/io-mapping.h> >>>>> +#include <linux/scatterlist.h> >>>>> + >>>>> #include <drm/ttm/ttm_resource.h> >>>>> #include <drm/ttm/ttm_bo_driver.h> >>>>> >>>>> @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct >>>>> ttm_resource_manager *man, >>>>> man->func->debug(man, p); >>>>> } >>>>> EXPORT_SYMBOL(ttm_resource_manager_debug); >>>>> + >>>>> +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter >>>>> *iter, >>>>> + struct dma_buf_map >>>>> *dmap, >>>>> + pgoff_t i) >>>>> +{ >>>>> + struct ttm_kmap_iter_iomap *iter_io = >>>>> + container_of(iter, typeof(*iter_io), 
base); >>>>> + void __iomem *addr; >>>>> + >>>>> +retry: >>>>> + while (i >= iter_io->cache.end) { >>>>> + iter_io->cache.sg = iter_io->cache.sg ? >>>>> + sg_next(iter_io->cache.sg) : iter_io- >>>>>> st->sgl; >>>>> + iter_io->cache.i = iter_io->cache.end; >>>>> + iter_io->cache.end += sg_dma_len(iter_io- >>>>>> cache.sg) >> >>>>> + PAGE_SHIFT; >>>>> + iter_io->cache.offs = sg_dma_address(iter_io- >>>>>> cache.sg) - >>>>> + iter_io->start; >>>>> + } >>>>> + >>>>> + if (i < iter_io->cache.i) { >>>>> + iter_io->cache.end = 0; >>>>> + iter_io->cache.sg = NULL; >>>>> + goto retry; >>>>> + } >>>>> + >>>>> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io- >>>>>> cache.offs + >>>>> + (((resource_size_t)i - >>>>> iter_io->cache.i) >>>>> + << PAGE_SHIFT)); >>>>> + dma_buf_map_set_vaddr_iomem(dmap, addr); >>>>> +} >>>>> + >>>>> +static void ttm_kmap_iter_iomap_unmap_local(struct >>>>> ttm_kmap_iter *iter, >>>>> + struct dma_buf_map >>>>> *map) >>>>> +{ >>>>> + io_mapping_unmap_local(map->vaddr_iomem); >>>>> +} >>>>> + >>>>> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { >>>>> + .map_local = ttm_kmap_iter_iomap_map_local, >>>>> + .unmap_local = ttm_kmap_iter_iomap_unmap_local, >>>>> + .maps_tt = false, >>>>> +}; >>>>> + >>>>> +/** >>>>> + * ttm_kmap_iter_iomap_init - Initialize a struct >>>>> ttm_kmap_iter_iomap >>>>> + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. >>>>> + * @iomap: The struct io_mapping representing the underlying >>>>> linear io_memory. >>>>> + * @st: sg_table into @iomap, representing the memory of the >>>>> struct >>>>> + * ttm_resource. >>>>> + * @start: Offset that needs to be subtracted from @st to make >>>>> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. >>>>> + * >>>>> + * Return: Pointer to the embedded struct ttm_kmap_iter. >>>>> + */ >>>>> +struct ttm_kmap_iter * >>>>> +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, >>>>> + struct io_mapping *iomap, >>>>> + struct sg_table *st, >>>>> + resource_size_t start) >>>>> +{ >>>>> + iter_io->base.ops = &ttm_kmap_iter_io_ops; >>>>> + iter_io->iomap = iomap; >>>>> + iter_io->st = st; >>>>> + iter_io->start = start; >>>>> + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); >>>>> + >>>>> + return &iter_io->base; >>>>> +} >>>>> +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); >>>>> + >>>>> +/** >>>>> + * DOC: Linear io iterator >>>>> + * >>>>> + * This code should die in the not too near future. Best would >>>>> be if we could >>>>> + * make io-mapping use memremap for all io memory, and have >>>>> memremap >>>>> + * implement a kmap_local functionality. We could then strip a >>>>> huge amount of >>>>> + * code. These linear io iterators are implemented to mimic >>>>> old functionality, >>>>> + * and they don't use kmap_local semantics at all internally. >>>>> Rather ioremap or >>>>> + * friends, and at least on 32-bit they add global TLB flushes >>>>> and points >>>>> + * of failure. 
>>>>> + */ >>>>> + >>>>> +static void ttm_kmap_iter_linear_io_map_local(struct >>>>> ttm_kmap_iter *iter, >>>>> + struct >>>>> dma_buf_map *dmap, >>>>> + pgoff_t i) >>>>> +{ >>>>> + struct ttm_kmap_iter_linear_io *iter_io = >>>>> + container_of(iter, typeof(*iter_io), base); >>>>> + >>>>> + *dmap = iter_io->dmap; >>>>> + dma_buf_map_incr(dmap, i * PAGE_SIZE); >>>>> +} >>>>> + >>>>> +static const struct ttm_kmap_iter_ops >>>>> ttm_kmap_iter_linear_io_ops = { >>>>> + .map_local = ttm_kmap_iter_linear_io_map_local, >>>>> + .maps_tt = false, >>>>> +}; >>>>> + >>>>> +struct ttm_kmap_iter * >>>>> +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io >>>>> *iter_io, >>>>> + struct ttm_device *bdev, >>>>> + struct ttm_resource *mem) >>>>> +{ >>>>> + int ret; >>>>> + >>>>> + ret = ttm_mem_io_reserve(bdev, mem); >>>>> + if (ret) >>>>> + goto out_err; >>>>> + if (!mem->bus.is_iomem) { >>>>> + ret = -EINVAL; >>>>> + goto out_io_free; >>>>> + } >>>>> + >>>>> + if (mem->bus.addr) { >>>>> + dma_buf_map_set_vaddr(&iter_io->dmap, mem- >>>>>> bus.addr); >>>>> + iter_io->needs_unmap = false; >>>>> + } else { >>>>> + size_t bus_size = (size_t)mem->num_pages << >>>>> PAGE_SHIFT; >>>>> + >>>>> + iter_io->needs_unmap = true; >>>>> + if (mem->bus.caching == ttm_write_combined) >>>>> + dma_buf_map_set_vaddr_iomem(&iter_io- >>>>>> dmap, >>>>> + >>>>> ioremap_wc(mem->bus.offset, >>>>> + >>>>> bus_size)); >>>>> + else if (mem->bus.caching == ttm_cached) >>>>> + dma_buf_map_set_vaddr(&iter_io->dmap, >>>>> + memremap(mem- >>>>>> bus.offset, bus_size, >>>>> + >>>>> MEMREMAP_WB)); >>>> The comments in set_vaddr suggest that this is meant for >>>> system-memory. Does that actually matter or is it just about not >>>> losing the __iomem annotation on platforms where it matters? >>> Yes, it's the latter. dma_buf_map() is relatively new and the >>> author >>> probably didn't think about the case of cached iomem, which is used >>> by, >>> for example, vmwgfx. >>> >>>> Apparently cached device local is a thing. Also should this not >>>> be >>>> wrapped in CONFIG_X86? >>> Both dma_buf_map() and memremap are generic, I think, I guess >>> memremap >>> would return NULL if it's not supported. >> It looks like memremap just wraps ioremap_cache, but since it also >> discards the __iomem annotation should we be doing that universally? >> Also not sure if ioremap_cache is universally supported, so wrapping >> in CONFIG_X86 and falling back to plain ioremap() might be needed? Or >> at least that looks like roughly what the previous code was doing? >> Not >> too sure tbh. >> > I think the long term goal is to use memremap all over the place, to > just not have to bother with the __iomem annotation. But to do that io- > mapping.h needs to support memremap. But for now we need to be strict > about __iomem unless we're in arch specific code. That's why that > dma_buf_map thing was created, but TTM memcpy was never fully adapted. I don't think that this will work. __iomem annotation is there because we have architectures where you need to use special CPU instructions for iomem access. That won't go away just because we use memremap(). Christian. > > As for limited arch support for memremap cached, It looks like we only > need to or in "backup" mapping modes in the memremap flags, and we'd > mimic the previous behaviour. > > /Thomas > > >>> /Thomas >>> >>> >
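To make that point concrete, a tiny sketch of the access rule the annotation enforces (the function and bus_offset are made up for illustration; error checking omitted):

static u32 iomem_vs_memremap_sketch(resource_size_t bus_offset)
{
        void __iomem *mmio = ioremap(bus_offset, PAGE_SIZE);
        void *mapped = memremap(bus_offset, PAGE_SIZE, MEMREMAP_WB);
        u32 a, b;

        /* Correct for io memory on every architecture: the accessor can
         * expand to whatever special instructions the arch requires.
         */
        a = ioread32(mmio);

        /* Compiles fine and sparse no longer warns, but a plain load is
         * wrong wherever device memory needs those special instructions;
         * dropping the __iomem tag doesn't change the hardware.
         */
        b = *(u32 *)mapped;

        memunmap(mapped);
        iounmap(mmio);
        return a ^ b;
}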
On 5/25/21 5:48 PM, Christian König wrote: > > > Am 25.05.21 um 12:07 schrieb Thomas Hellström: >> On Tue, 2021-05-25 at 10:58 +0100, Matthew Auld wrote: >>> On Tue, 25 May 2021 at 10:32, Thomas Hellström >>> <thomas.hellstrom@linux.intel.com> wrote: >>>> >>>> On 5/25/21 11:18 AM, Matthew Auld wrote: >>>>> On Fri, 21 May 2021 at 16:33, Thomas Hellström >>>>> <thomas.hellstrom@linux.intel.com> wrote: >>>>>> The internal ttm_bo_util memcpy uses ioremap functionality, and >>>>>> while it >>>>>> probably might be possible to use it for copying in- and out of >>>>>> sglist represented io memory, using io_mem_reserve() / >>>>>> io_mem_free() >>>>>> callbacks, that would cause problems with fault(). >>>>>> Instead, implement a method mapping page-by-page using >>>>>> kmap_local() >>>>>> semantics. As an additional benefit we then avoid the >>>>>> occasional global >>>>>> TLB flushes of ioremap() and consuming ioremap space, >>>>>> elimination of a >>>>>> critical point of failure and with a slight change of semantics >>>>>> we could >>>>>> also push the memcpy out async for testing and async driver >>>>>> development >>>>>> purposes. >>>>>> >>>>>> A special linear iomem iterator is introduced internally to >>>>>> mimic the >>>>>> old ioremap behaviour for code-paths that can't immediately be >>>>>> ported >>>>>> over. This adds to the code size and should be considered a >>>>>> temporary >>>>>> solution. >>>>>> >>>>>> Looking at the code we have a lot of checks for iomap tagged >>>>>> pointers. >>>>>> Ideally we should extend the core memremap functions to also >>>>>> accept >>>>>> uncached memory and kmap_local functionality. Then we could >>>>>> strip a >>>>>> lot of code. >>>>>> >>>>>> Cc: Christian König <christian.koenig@amd.com> >>>>>> Signed-off-by: Thomas Hellström < >>>>>> thomas.hellstrom@linux.intel.com> >>>>>> --- >>>>>> v3: >>>>>> - Split up in various TTM files and addressed review comments >>>>>> by >>>>>> Christian König. Tested and fixed legacy iomap memcpy path >>>>>> on i915. >>>>>> --- >>>>>> drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++---------- >>>>>> --------- >>>>>> drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ >>>>>> drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ >>>>>> drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ >>>>>> include/drm/ttm/ttm_bo_driver.h | 28 +++ >>>>>> include/drm/ttm/ttm_caching.h | 2 + >>>>>> include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ >>>>>> include/drm/ttm/ttm_resource.h | 61 +++++++ >>>>>> include/drm/ttm/ttm_tt.h | 16 ++ >>>>>> 9 files changed, 508 insertions(+), 181 deletions(-) >>>>>> create mode 100644 include/drm/ttm/ttm_kmap_iter.h >>>>>> >>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>>> b/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>>> index ae8b61460724..912cbe8e60a2 100644 >>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c >>>>>> @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device >>>>>> *bdev, >>>>>> mem->bus.addr = NULL; >>>>>> } >>>>>> >>>>>> -static int ttm_resource_ioremap(struct ttm_device *bdev, >>>>>> - struct ttm_resource *mem, >>>>>> - void **virtual) >>>>>> +/** >>>>>> + * ttm_move_memcpy - Helper to perform a memcpy ttm move >>>>>> operation. >>>>>> + * @bo: The struct ttm_buffer_object. >>>>>> + * @new_mem: The struct ttm_resource we're moving to (copy >>>>>> destination). >>>>>> + * @new_iter: A struct ttm_kmap_iter representing the >>>>>> destination resource. >>>>>> + * @src_iter: A struct ttm_kmap_iter representing the source >>>>>> resource. 
>>>>>> + * >>>>>> + * This function is intended to be able to move out async >>>>>> under a >>>>>> + * dma-fence if desired. >>>>>> + */ >>>>>> +void ttm_move_memcpy(struct ttm_buffer_object *bo, >>>>>> + struct ttm_resource *dst_mem, >>>>>> + struct ttm_kmap_iter *dst_iter, >>>>>> + struct ttm_kmap_iter *src_iter) >>>>>> { >>>>>> - int ret; >>>>>> - void *addr; >>>>>> - >>>>>> - *virtual = NULL; >>>>>> - ret = ttm_mem_io_reserve(bdev, mem); >>>>>> - if (ret || !mem->bus.is_iomem) >>>>>> - return ret; >>>>>> + const struct ttm_kmap_iter_ops *dst_ops = dst_iter- >>>>>>> ops; >>>>>> + const struct ttm_kmap_iter_ops *src_ops = src_iter- >>>>>>> ops; >>>>>> + struct ttm_tt *ttm = bo->ttm; >>>>>> + struct dma_buf_map src_map, dst_map; >>>>>> + pgoff_t i; >>>>>> >>>>>> - if (mem->bus.addr) { >>>>>> - addr = mem->bus.addr; >>>>>> - } else { >>>>>> - size_t bus_size = (size_t)mem->num_pages << >>>>>> PAGE_SHIFT; >>>>>> + /* Single TTM move. NOP */ >>>>>> + if (dst_ops->maps_tt && src_ops->maps_tt) >>>>>> + return; >>>>>> >>>>>> - if (mem->bus.caching == ttm_write_combined) >>>>>> - addr = ioremap_wc(mem->bus.offset, >>>>>> bus_size); >>>>>> -#ifdef CONFIG_X86 >>>>>> - else if (mem->bus.caching == ttm_cached) >>>>>> - addr = ioremap_cache(mem->bus.offset, >>>>>> bus_size); >>>>>> -#endif >>>>>> - else >>>>>> - addr = ioremap(mem->bus.offset, >>>>>> bus_size); >>>>>> - if (!addr) { >>>>>> - ttm_mem_io_free(bdev, mem); >>>>>> - return -ENOMEM; >>>>>> + /* Don't move nonexistent data. Clear destination >>>>>> instead. */ >>>>>> + if (src_ops->maps_tt && (!ttm || >>>>>> !ttm_tt_is_populated(ttm))) { >>>>>> + if (ttm && !(ttm->page_flags & >>>>>> TTM_PAGE_FLAG_ZERO_ALLOC)) >>>>>> + return; >>>>>> + >>>>>> + for (i = 0; i < dst_mem->num_pages; ++i) { >>>>>> + dst_ops->map_local(dst_iter, &dst_map, >>>>>> i); >>>>>> + if (dst_map.is_iomem) >>>>>> + memset_io(dst_map.vaddr_iomem, >>>>>> 0, PAGE_SIZE); >>>>>> + else >>>>>> + memset(dst_map.vaddr, 0, >>>>>> PAGE_SIZE); >>>>>> + if (dst_ops->unmap_local) >>>>>> + dst_ops->unmap_local(dst_iter, >>>>>> &dst_map); >>>>>> } >>>>>> + return; >>>>>> } >>>>>> - *virtual = addr; >>>>>> - return 0; >>>>>> -} >>>>>> - >>>>>> -static void ttm_resource_iounmap(struct ttm_device *bdev, >>>>>> - struct ttm_resource *mem, >>>>>> - void *virtual) >>>>>> -{ >>>>>> - if (virtual && mem->bus.addr == NULL) >>>>>> - iounmap(virtual); >>>>>> - ttm_mem_io_free(bdev, mem); >>>>>> -} >>>>>> - >>>>>> -static int ttm_copy_io_page(void *dst, void *src, unsigned >>>>>> long page) >>>>>> -{ >>>>>> - uint32_t *dstP = >>>>>> - (uint32_t *) ((unsigned long)dst + (page << >>>>>> PAGE_SHIFT)); >>>>>> - uint32_t *srcP = >>>>>> - (uint32_t *) ((unsigned long)src + (page << >>>>>> PAGE_SHIFT)); >>>>>> - >>>>>> - int i; >>>>>> - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) >>>>>> - iowrite32(ioread32(srcP++), dstP++); >>>>>> - return 0; >>>>>> -} >>>>>> - >>>>>> -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, >>>>>> - unsigned long page, >>>>>> - pgprot_t prot) >>>>>> -{ >>>>>> - struct page *d = ttm->pages[page]; >>>>>> - void *dst; >>>>>> - >>>>>> - if (!d) >>>>>> - return -ENOMEM; >>>>>> - >>>>>> - src = (void *)((unsigned long)src + (page << >>>>>> PAGE_SHIFT)); >>>>>> - dst = kmap_atomic_prot(d, prot); >>>>>> - if (!dst) >>>>>> - return -ENOMEM; >>>>>> - >>>>>> - memcpy_fromio(dst, src, PAGE_SIZE); >>>>>> - >>>>>> - kunmap_atomic(dst); >>>>>> - >>>>>> - return 0; >>>>>> -} >>>>>> - >>>>>> -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, >>>>>> - unsigned 
long page, >>>>>> - pgprot_t prot) >>>>>> -{ >>>>>> - struct page *s = ttm->pages[page]; >>>>>> - void *src; >>>>>> - >>>>>> - if (!s) >>>>>> - return -ENOMEM; >>>>>> - >>>>>> - dst = (void *)((unsigned long)dst + (page << >>>>>> PAGE_SHIFT)); >>>>>> - src = kmap_atomic_prot(s, prot); >>>>>> - if (!src) >>>>>> - return -ENOMEM; >>>>>> >>>>>> - memcpy_toio(dst, src, PAGE_SIZE); >>>>>> - >>>>>> - kunmap_atomic(src); >>>>>> + for (i = 0; i < dst_mem->num_pages; ++i) { >>>>>> + dst_ops->map_local(dst_iter, &dst_map, i); >>>>>> + src_ops->map_local(src_iter, &src_map, i); >>>>>> + >>>>>> + if (!src_map.is_iomem && !dst_map.is_iomem) { >>>>>> + memcpy(dst_map.vaddr, src_map.vaddr, >>>>>> PAGE_SIZE); >>>>>> + } else if (!src_map.is_iomem) { >>>>>> + dma_buf_map_memcpy_to(&dst_map, >>>>>> src_map.vaddr, >>>>>> + PAGE_SIZE); >>>>>> + } else if (!dst_map.is_iomem) { >>>>>> + memcpy_fromio(dst_map.vaddr, >>>>>> src_map.vaddr_iomem, >>>>>> + PAGE_SIZE); >>>>>> + } else { >>>>>> + int j; >>>>>> + u32 __iomem *src = src_map.vaddr_iomem; >>>>>> + u32 __iomem *dst = dst_map.vaddr_iomem; >>>>>> >>>>>> - return 0; >>>>>> + for (j = 0; j < (PAGE_SIZE >> 2); ++j) >>>>> IMO PAGE_SIZE / sizeof(u32) is easier to understand. >>>> OK, will fix. >>>> >>>> >>>>>> + iowrite32(ioread32(src++), >>>>>> dst++); >>>>>> + } >>>>>> + if (src_ops->unmap_local) >>>>>> + src_ops->unmap_local(src_iter, >>>>>> &src_map); >>>>>> + if (dst_ops->unmap_local) >>>>>> + dst_ops->unmap_local(dst_iter, >>>>>> &dst_map); >>>>>> + } >>>>>> } >>>>>> +EXPORT_SYMBOL(ttm_move_memcpy); >>>>>> >>>>>> int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, >>>>>> struct ttm_operation_ctx *ctx, >>>>>> - struct ttm_resource *new_mem) >>>>>> + struct ttm_resource *dst_mem) >>>>>> { >>>>>> struct ttm_device *bdev = bo->bdev; >>>>>> - struct ttm_resource_manager *man = >>>>>> ttm_manager_type(bdev, new_mem->mem_type); >>>>>> + struct ttm_resource_manager *dst_man = >>>>>> + ttm_manager_type(bo->bdev, dst_mem->mem_type); >>>>>> struct ttm_tt *ttm = bo->ttm; >>>>>> - struct ttm_resource *old_mem = &bo->mem; >>>>>> - struct ttm_resource old_copy = *old_mem; >>>>>> - void *old_iomap; >>>>>> - void *new_iomap; >>>>>> + struct ttm_resource *src_mem = &bo->mem; >>>>>> + struct ttm_resource_manager *src_man = >>>>>> + ttm_manager_type(bdev, src_mem->mem_type); >>>>>> + struct ttm_resource src_copy = *src_mem; >>>>>> + union { >>>>>> + struct ttm_kmap_iter_tt tt; >>>>>> + struct ttm_kmap_iter_linear_io io; >>>>>> + } _dst_iter, _src_iter; >>>>>> + struct ttm_kmap_iter *dst_iter, *src_iter; >>>>>> int ret; >>>>>> - unsigned long i; >>>>>> >>>>>> - ret = ttm_bo_wait_ctx(bo, ctx); >>>>>> - if (ret) >>>>>> - return ret; >>>>>> - >>>>>> - ret = ttm_resource_ioremap(bdev, old_mem, &old_iomap); >>>>>> - if (ret) >>>>>> - return ret; >>>>>> - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); >>>>>> - if (ret) >>>>>> - goto out; >>>>>> - >>>>>> - /* >>>>>> - * Single TTM move. NOP. >>>>>> - */ >>>>>> - if (old_iomap == NULL && new_iomap == NULL) >>>>>> - goto out2; >>>>>> - >>>>>> - /* >>>>>> - * Don't move nonexistent data. Clear destination >>>>>> instead. >>>>>> - */ >>>>>> - if (old_iomap == NULL && >>>>>> - (ttm == NULL || (!ttm_tt_is_populated(ttm) && >>>>>> - !(ttm->page_flags & >>>>>> TTM_PAGE_FLAG_SWAPPED)))) { >>>>>> - memset_io(new_iomap, 0, new_mem- >>>>>>> num_pages*PAGE_SIZE); >>>>>> - goto out2; >>>>>> - } >>>>>> - >>>>>> - /* >>>>>> - * TTM might be null for moves within the same region. 
>>>>>> - */ >>>>>> - if (ttm) { >>>>>> + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) >>>>>> || >>>>>> + dst_man->use_tt)) { >>>>>> ret = ttm_tt_populate(bdev, ttm, ctx); >>>>>> if (ret) >>>>>> - goto out1; >>>>>> + return ret; >>>>>> } >>>>>> >>>>>> - for (i = 0; i < new_mem->num_pages; ++i) { >>>>>> - if (old_iomap == NULL) { >>>>>> - pgprot_t prot = ttm_io_prot(bo, >>>>>> old_mem, PAGE_KERNEL); >>>>>> - ret = ttm_copy_ttm_io_page(ttm, >>>>>> new_iomap, i, >>>>>> - prot); >>>>>> - } else if (new_iomap == NULL) { >>>>>> - pgprot_t prot = ttm_io_prot(bo, >>>>>> new_mem, PAGE_KERNEL); >>>>>> - ret = ttm_copy_io_ttm_page(ttm, >>>>>> old_iomap, i, >>>>>> - prot); >>>>>> - } else { >>>>>> - ret = ttm_copy_io_page(new_iomap, >>>>>> old_iomap, i); >>>>>> - } >>>>>> - if (ret) >>>>>> - goto out1; >>>>>> + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, >>>>>> bdev, dst_mem); >>>>>> + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) >>>>>> + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, >>>>>> bo->ttm); >>>>>> + if (IS_ERR(dst_iter)) >>>>>> + return PTR_ERR(dst_iter); >>>>>> + >>>>>> + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, >>>>>> bdev, src_mem); >>>>>> + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) >>>>>> + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, >>>>>> bo->ttm); >>>>>> + if (IS_ERR(src_iter)) { >>>>>> + ret = PTR_ERR(src_iter); >>>>>> + goto out_src_iter; >>>>>> } >>>>>> - mb(); >>>>>> -out2: >>>>>> - old_copy = *old_mem; >>>>>> >>>>>> - ttm_bo_assign_mem(bo, new_mem); >>>>>> - >>>>>> - if (!man->use_tt) >>>>>> - ttm_bo_tt_destroy(bo); >>>>>> + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); >>>>>> + src_copy = *src_mem; >>>>>> + ttm_bo_move_sync_cleanup(bo, dst_mem); >>>>>> >>>>>> -out1: >>>>>> - ttm_resource_iounmap(bdev, old_mem, new_iomap); >>>>>> -out: >>>>>> - ttm_resource_iounmap(bdev, &old_copy, old_iomap); >>>>>> + if (!src_iter->ops->maps_tt) >>>>>> + ttm_kmap_iter_linear_io_fini(&_src_iter.io, >>>>>> bdev, &src_copy); >>>>>> +out_src_iter: >>>>>> + if (!dst_iter->ops->maps_tt) >>>>>> + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, >>>>>> bdev, dst_mem); >>>>>> >>>>>> - /* >>>>>> - * On error, keep the mm node! >>>>>> - */ >>>>>> - if (!ret) >>>>>> - ttm_resource_free(bo, &old_copy); >>>>>> return ret; >>>>>> } >>>>>> EXPORT_SYMBOL(ttm_bo_move_memcpy); >>>>>> @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct >>>>>> ttm_buffer_object *bo, struct ttm_resource *res, >>>>>> man = ttm_manager_type(bo->bdev, res->mem_type); >>>>>> caching = man->use_tt ? 
bo->ttm->caching : res- >>>>>>> bus.caching; >>>>>> - /* Cached mappings need no adjustment */ >>>>>> - if (caching == ttm_cached) >>>>>> - return tmp; >>>>>> - >>>>>> -#if defined(__i386__) || defined(__x86_64__) >>>>>> - if (caching == ttm_write_combined) >>>>>> - tmp = pgprot_writecombine(tmp); >>>>>> - else if (boot_cpu_data.x86 > 3) >>>>>> - tmp = pgprot_noncached(tmp); >>>>>> -#endif >>>>>> -#if defined(__ia64__) || defined(__arm__) || >>>>>> defined(__aarch64__) || \ >>>>>> - defined(__powerpc__) || defined(__mips__) >>>>>> - if (caching == ttm_write_combined) >>>>>> - tmp = pgprot_writecombine(tmp); >>>>>> - else >>>>>> - tmp = pgprot_noncached(tmp); >>>>>> -#endif >>>>>> -#if defined(__sparc__) >>>>>> - tmp = pgprot_noncached(tmp); >>>>>> -#endif >>>>>> - return tmp; >>>>>> + return ttm_prot_from_caching(caching, tmp); >>>>>> } >>>>>> EXPORT_SYMBOL(ttm_io_prot); >>>>>> >>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_module.c >>>>>> b/drivers/gpu/drm/ttm/ttm_module.c >>>>>> index 56b0efdba1a9..997c458f68a9 100644 >>>>>> --- a/drivers/gpu/drm/ttm/ttm_module.c >>>>>> +++ b/drivers/gpu/drm/ttm/ttm_module.c >>>>>> @@ -31,12 +31,47 @@ >>>>>> */ >>>>>> #include <linux/module.h> >>>>>> #include <linux/device.h> >>>>>> +#include <linux/pgtable.h> >>>>>> #include <linux/sched.h> >>>>>> #include <linux/debugfs.h> >>>>>> #include <drm/drm_sysfs.h> >>>>>> +#include <drm/ttm/ttm_caching.h> >>>>>> >>>>>> #include "ttm_module.h" >>>>>> >>>>>> +/** >>>>>> + * ttm_prot_from_caching - Modify the page protection >>>>>> according to the >>>>>> + * ttm cacing mode >>>>>> + * @caching: The ttm caching mode >>>>>> + * @tmp: The original page protection >>>>>> + * >>>>>> + * Return: The modified page protection >>>>>> + */ >>>>>> +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, >>>>>> pgprot_t tmp) >>>>>> +{ >>>>>> + /* Cached mappings need no adjustment */ >>>>>> + if (caching == ttm_cached) >>>>>> + return tmp; >>>>>> + >>>>>> +#if defined(__i386__) || defined(__x86_64__) >>>>>> + if (caching == ttm_write_combined) >>>>>> + tmp = pgprot_writecombine(tmp); >>>>>> + else if (boot_cpu_data.x86 > 3) >>>>>> + tmp = pgprot_noncached(tmp); >>>>>> +#endif >>>>>> +#if defined(__ia64__) || defined(__arm__) || >>>>>> defined(__aarch64__) || \ >>>>>> + defined(__powerpc__) || defined(__mips__) >>>>>> + if (caching == ttm_write_combined) >>>>>> + tmp = pgprot_writecombine(tmp); >>>>>> + else >>>>>> + tmp = pgprot_noncached(tmp); >>>>>> +#endif >>>>>> +#if defined(__sparc__) >>>>>> + tmp = pgprot_noncached(tmp); >>>>>> +#endif >>>>>> + return tmp; >>>>>> +} >>>>>> + >>>>>> struct dentry *ttm_debugfs_root; >>>>>> >>>>>> static int __init ttm_init(void) >>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c >>>>>> b/drivers/gpu/drm/ttm/ttm_resource.c >>>>>> index 59e2b7157e41..e05ae7e3d477 100644 >>>>>> --- a/drivers/gpu/drm/ttm/ttm_resource.c >>>>>> +++ b/drivers/gpu/drm/ttm/ttm_resource.c >>>>>> @@ -22,6 +22,10 @@ >>>>>> * Authors: Christian König >>>>>> */ >>>>>> >>>>>> +#include <linux/dma-buf-map.h> >>>>>> +#include <linux/io-mapping.h> >>>>>> +#include <linux/scatterlist.h> >>>>>> + >>>>>> #include <drm/ttm/ttm_resource.h> >>>>>> #include <drm/ttm/ttm_bo_driver.h> >>>>>> >>>>>> @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct >>>>>> ttm_resource_manager *man, >>>>>> man->func->debug(man, p); >>>>>> } >>>>>> EXPORT_SYMBOL(ttm_resource_manager_debug); >>>>>> + >>>>>> +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter >>>>>> *iter, >>>>>> + struct dma_buf_map >>>>>> *dmap, >>>>>> + 
pgoff_t i) >>>>>> +{ >>>>>> + struct ttm_kmap_iter_iomap *iter_io = >>>>>> + container_of(iter, typeof(*iter_io), base); >>>>>> + void __iomem *addr; >>>>>> + >>>>>> +retry: >>>>>> + while (i >= iter_io->cache.end) { >>>>>> + iter_io->cache.sg = iter_io->cache.sg ? >>>>>> + sg_next(iter_io->cache.sg) : iter_io- >>>>>>> st->sgl; >>>>>> + iter_io->cache.i = iter_io->cache.end; >>>>>> + iter_io->cache.end += sg_dma_len(iter_io- >>>>>>> cache.sg) >> >>>>>> + PAGE_SHIFT; >>>>>> + iter_io->cache.offs = sg_dma_address(iter_io- >>>>>>> cache.sg) - >>>>>> + iter_io->start; >>>>>> + } >>>>>> + >>>>>> + if (i < iter_io->cache.i) { >>>>>> + iter_io->cache.end = 0; >>>>>> + iter_io->cache.sg = NULL; >>>>>> + goto retry; >>>>>> + } >>>>>> + >>>>>> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io- >>>>>>> cache.offs + >>>>>> + (((resource_size_t)i - >>>>>> iter_io->cache.i) >>>>>> + << PAGE_SHIFT)); >>>>>> + dma_buf_map_set_vaddr_iomem(dmap, addr); >>>>>> +} >>>>>> + >>>>>> +static void ttm_kmap_iter_iomap_unmap_local(struct >>>>>> ttm_kmap_iter *iter, >>>>>> + struct dma_buf_map >>>>>> *map) >>>>>> +{ >>>>>> + io_mapping_unmap_local(map->vaddr_iomem); >>>>>> +} >>>>>> + >>>>>> +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { >>>>>> + .map_local = ttm_kmap_iter_iomap_map_local, >>>>>> + .unmap_local = ttm_kmap_iter_iomap_unmap_local, >>>>>> + .maps_tt = false, >>>>>> +}; >>>>>> + >>>>>> +/** >>>>>> + * ttm_kmap_iter_iomap_init - Initialize a struct >>>>>> ttm_kmap_iter_iomap >>>>>> + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. >>>>>> + * @iomap: The struct io_mapping representing the underlying >>>>>> linear io_memory. >>>>>> + * @st: sg_table into @iomap, representing the memory of the >>>>>> struct >>>>>> + * ttm_resource. >>>>>> + * @start: Offset that needs to be subtracted from @st to make >>>>>> + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. >>>>>> + * >>>>>> + * Return: Pointer to the embedded struct ttm_kmap_iter. >>>>>> + */ >>>>>> +struct ttm_kmap_iter * >>>>>> +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, >>>>>> + struct io_mapping *iomap, >>>>>> + struct sg_table *st, >>>>>> + resource_size_t start) >>>>>> +{ >>>>>> + iter_io->base.ops = &ttm_kmap_iter_io_ops; >>>>>> + iter_io->iomap = iomap; >>>>>> + iter_io->st = st; >>>>>> + iter_io->start = start; >>>>>> + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); >>>>>> + >>>>>> + return &iter_io->base; >>>>>> +} >>>>>> +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); >>>>>> + >>>>>> +/** >>>>>> + * DOC: Linear io iterator >>>>>> + * >>>>>> + * This code should die in the not too near future. Best would >>>>>> be if we could >>>>>> + * make io-mapping use memremap for all io memory, and have >>>>>> memremap >>>>>> + * implement a kmap_local functionality. We could then strip a >>>>>> huge amount of >>>>>> + * code. These linear io iterators are implemented to mimic >>>>>> old functionality, >>>>>> + * and they don't use kmap_local semantics at all internally. >>>>>> Rather ioremap or >>>>>> + * friends, and at least on 32-bit they add global TLB flushes >>>>>> and points >>>>>> + * of failure. 
>>>>>> + */ >>>>>> + >>>>>> +static void ttm_kmap_iter_linear_io_map_local(struct >>>>>> ttm_kmap_iter *iter, >>>>>> + struct >>>>>> dma_buf_map *dmap, >>>>>> + pgoff_t i) >>>>>> +{ >>>>>> + struct ttm_kmap_iter_linear_io *iter_io = >>>>>> + container_of(iter, typeof(*iter_io), base); >>>>>> + >>>>>> + *dmap = iter_io->dmap; >>>>>> + dma_buf_map_incr(dmap, i * PAGE_SIZE); >>>>>> +} >>>>>> + >>>>>> +static const struct ttm_kmap_iter_ops >>>>>> ttm_kmap_iter_linear_io_ops = { >>>>>> + .map_local = ttm_kmap_iter_linear_io_map_local, >>>>>> + .maps_tt = false, >>>>>> +}; >>>>>> + >>>>>> +struct ttm_kmap_iter * >>>>>> +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io >>>>>> *iter_io, >>>>>> + struct ttm_device *bdev, >>>>>> + struct ttm_resource *mem) >>>>>> +{ >>>>>> + int ret; >>>>>> + >>>>>> + ret = ttm_mem_io_reserve(bdev, mem); >>>>>> + if (ret) >>>>>> + goto out_err; >>>>>> + if (!mem->bus.is_iomem) { >>>>>> + ret = -EINVAL; >>>>>> + goto out_io_free; >>>>>> + } >>>>>> + >>>>>> + if (mem->bus.addr) { >>>>>> + dma_buf_map_set_vaddr(&iter_io->dmap, mem- >>>>>>> bus.addr); >>>>>> + iter_io->needs_unmap = false; >>>>>> + } else { >>>>>> + size_t bus_size = (size_t)mem->num_pages << >>>>>> PAGE_SHIFT; >>>>>> + >>>>>> + iter_io->needs_unmap = true; >>>>>> + if (mem->bus.caching == ttm_write_combined) >>>>>> + dma_buf_map_set_vaddr_iomem(&iter_io- >>>>>>> dmap, >>>>>> + >>>>>> ioremap_wc(mem->bus.offset, >>>>>> + >>>>>> bus_size)); >>>>>> + else if (mem->bus.caching == ttm_cached) >>>>>> + dma_buf_map_set_vaddr(&iter_io->dmap, >>>>>> + memremap(mem- >>>>>>> bus.offset, bus_size, >>>>>> + >>>>>> MEMREMAP_WB)); >>>>> The comments in set_vaddr suggest that this is meant for >>>>> system-memory. Does that actually matter or is it just about not >>>>> losing the __iomem annotation on platforms where it matters? >>>> Yes, it's the latter. dma_buf_map() is relatively new and the >>>> author >>>> probably didn't think about the case of cached iomem, which is used >>>> by, >>>> for example, vmwgfx. >>>> >>>>> Apparently cached device local is a thing. Also should this not >>>>> be >>>>> wrapped in CONFIG_X86? >>>> Both dma_buf_map() and memremap are generic, I think, I guess >>>> memremap >>>> would return NULL if it's not supported. >>> It looks like memremap just wraps ioremap_cache, but since it also >>> discards the __iomem annotation should we be doing that universally? >>> Also not sure if ioremap_cache is universally supported, so wrapping >>> in CONFIG_X86 and falling back to plain ioremap() might be needed? Or >>> at least that looks like roughly what the previous code was doing? >>> Not >>> too sure tbh. >>> >> I think the long term goal is to use memremap all over the place, to >> just not have to bother with the __iomem annotation. But to do that io- >> mapping.h needs to support memremap. But for now we need to be strict >> about __iomem unless we're in arch specific code. That's why that >> dma_buf_map thing was created, but TTM memcpy was never fully adapted. > > I don't think that this will work. __iomem annotation is there because > we have architectures where you need to use special CPU instructions > for iomem access. > > That won't go away just because we use memremap(). That's true, but can we ever support those with TTM, given that we allow user-space mmaping that transparently may change to an iomap? Given that, and what's written here https://lwn.net/Articles/653585/ To me it sounds like if an architecture can't support memremap, we can't support it with TTM either. 
In any case, for this particular patch, to avoid potential regressions, is it OK if I just add an ioremap() fallback in case the memremap() fails? /Thomas
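For illustration only, a minimal sketch of what such a fallback inside ttm_kmap_iter_linear_io_init() could look like, assuming the dma_buf_map helpers already used by this patch; the helper name is made up and this is not the committed code. The existing dma_buf_map_is_null() check in the caller would still catch a failed ioremap():

	/* Hypothetical sketch of the proposed fallback; not part of this patch. */
	static void ttm_kmap_iter_linear_io_map_cached(struct ttm_kmap_iter_linear_io *iter_io,
						       struct ttm_resource *mem,
						       size_t bus_size)
	{
		void *vaddr = memremap(mem->bus.offset, bus_size, MEMREMAP_WB);

		if (vaddr)
			/* The arch supports cached kernel mappings of io memory. */
			dma_buf_map_set_vaddr(&iter_io->dmap, vaddr);
		else
			/* Retry with ioremap() and keep the __iomem annotation. */
			dma_buf_map_set_vaddr_iomem(&iter_io->dmap,
						    ioremap(mem->bus.offset, bus_size));
	}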
On 26.05.21 at 09:39, Thomas Hellström wrote: > [SNIP] >>> I think the long term goal is to use memremap all over the place, to >>> just not have to bother with the __iomem annotation. But to do that io- >>> mapping.h needs to support memremap. But for now we need to be strict >>> about __iomem unless we're in arch specific code. That's why that >>> dma_buf_map thing was created, but TTM memcpy was never fully adapted. >> >> I don't think that this will work. __iomem annotation is there >> because we have architectures where you need to use special CPU >> instructions for iomem access. >> >> That won't go away just because we use memremap(). > > That's true, but can we ever support those with TTM, given that we > allow user-space mmaping that transparently may change to an iomap? > Given that, and what's written here > > https://lwn.net/Articles/653585/ > > > To me it sounds like if an architecture can't support memremap, we > can't support it with TTM either. That was also my argument, but this is unfortunately not true. We already have architectures where the __iomem approach is mandatory for kernel mappings, but which work fine for userspace (don't ask me how that works, idk). That's the reason why we had to fix the UVD firmware upload in the kernel: commit ba0b2275a6781b2f3919d931d63329b5548f6d5f Author: Christian König <christian.koenig@amd.com> Date: Tue Aug 23 11:00:17 2016 +0200 drm/amdgpu: use memcpy_to/fromio for UVD fw upload > > In any case for this particular patch, to avoid potential regressions, > OK if I just add an ioremap() in case the memremap fails? Well, because of the issues outlined above I would actually prefer if we can keep the __iomem annotation around. Christian. > > /Thomas > > >> >> Christian. >> >>> >>> As for limited arch support for memremap cached, It looks like we only >>> need to or in "backup" mapping modes in the memremap flags, and we'd >>> mimic the previous behaviour. >>> >>> /Thomas >>> >>> >>>>> /Thomas >>>>> >>>>> >>> >>
On Wed, 2021-05-26 at 12:45 +0200, Christian König wrote: > Am 26.05.21 um 09:39 schrieb Thomas Hellström: > > [SNIP] > > > > I think the long term goal is to use memremap all over the > > > > place, to > > > > just not have to bother with the __iomem annotation. But to do > > > > that io- > > > > mapping.h needs to support memremap. But for now we need to be > > > > strict > > > > about __iomem unless we're in arch specific code. That's why > > > > that > > > > dma_buf_map thing was created, but TTM memcpy was never fully > > > > adapted. > > > > > > I don't think that this will work. __iomem annotation is there > > > because we have architectures where you need to use special CPU > > > instructions for iomem access. > > > > > > That won't go away just because we use memremap(). > > > > That's true, but can we ever support those with TTM, given that we > > allow user-space mmaping that transparently may change to an iomap? > > Given that, and what's written here > > > > > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flwn.net%2FArticles%2F653585%2F&data=04%7C01%7Cchristian.koenig%40amd.com%7C1cdcfe9d20e740308c9e08d92019785b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637576116034492654%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=e2BFGQJcElwVxrvHcnALDWscHN7ernLekGvXHqWBcwY%3D&reserved=0 > > > > > > > > To me it sounds like if an architecture can't support memremap, we > > can't support it with TTM either. > > That was also my argument, but this is unfortunately not true. > > We already have architectures where the __iomem approach is mandatory > for kernel mappings, but work fine for userspace (don't ask me how > that > works, idk). Ugh. :/ > > That's the reason why we had to fix the UVD firmware upload in the > kernel: > > commit ba0b2275a6781b2f3919d931d63329b5548f6d5f > Author: Christian König <christian.koenig@amd.com> > Date: Tue Aug 23 11:00:17 2016 +0200 > > drm/amdgpu: use memcpy_to/fromio for UVD fw upload > > > > > In any case for this particular patch, to avoid potential > > regressions, > > OK if I just add an ioremap() in case the memremap fails? > > Well because of the issues outlined above I would actually prefer if > we > can keep the __iomem annotation around. Well, we'd do that. Since we use the dma_buf_map unconditionally. So what would happen in the above case, would be: - memremap would fail. (Otherwise I'd be terribly confused) - we retry with ioremap and the dma-buf-map member is_iomem would thus be set - memcpy would do the right thing, based on is_iomem. /Thomas > > Christian. > > > > > /Thomas > > > > > > > > > > Christian. > > > > > > > > > > > As for limited arch support for memremap cached, It looks like > > > > we only > > > > need to or in "backup" mapping modes in the memremap flags, and > > > > we'd > > > > mimic the previous behaviour. > > > > > > > > /Thomas > > > > > > > > > > > > > > /Thomas > > > > > > > > > > > > > > > > > > > >
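Condensed from the per-page branch in ttm_move_memcpy() in this patch, this is roughly what "memcpy would do the right thing" means above: the is_iomem flags of the two dma_buf_maps pick the accessor, and the io -> io case keeps the 32-bit io accessor loop (spelled PAGE_SIZE / sizeof(u32) per the review comment earlier in the thread):

	if (!src_map.is_iomem && !dst_map.is_iomem) {
		/* system -> system */
		memcpy(dst_map.vaddr, src_map.vaddr, PAGE_SIZE);
	} else if (!src_map.is_iomem) {
		/* system -> io */
		dma_buf_map_memcpy_to(&dst_map, src_map.vaddr, PAGE_SIZE);
	} else if (!dst_map.is_iomem) {
		/* io -> system */
		memcpy_fromio(dst_map.vaddr, src_map.vaddr_iomem, PAGE_SIZE);
	} else {
		/* io -> io: fall back to 32-bit io accessors */
		u32 __iomem *src = src_map.vaddr_iomem;
		u32 __iomem *dst = dst_map.vaddr_iomem;
		int j;

		for (j = 0; j < PAGE_SIZE / sizeof(u32); ++j)
			iowrite32(ioread32(src++), dst++);
	}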
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index ae8b61460724..912cbe8e60a2 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -72,190 +72,126 @@ void ttm_mem_io_free(struct ttm_device *bdev, mem->bus.addr = NULL; } -static int ttm_resource_ioremap(struct ttm_device *bdev, - struct ttm_resource *mem, - void **virtual) +/** + * ttm_move_memcpy - Helper to perform a memcpy ttm move operation. + * @bo: The struct ttm_buffer_object. + * @new_mem: The struct ttm_resource we're moving to (copy destination). + * @new_iter: A struct ttm_kmap_iter representing the destination resource. + * @src_iter: A struct ttm_kmap_iter representing the source resource. + * + * This function is intended to be able to move out async under a + * dma-fence if desired. + */ +void ttm_move_memcpy(struct ttm_buffer_object *bo, + struct ttm_resource *dst_mem, + struct ttm_kmap_iter *dst_iter, + struct ttm_kmap_iter *src_iter) { - int ret; - void *addr; - - *virtual = NULL; - ret = ttm_mem_io_reserve(bdev, mem); - if (ret || !mem->bus.is_iomem) - return ret; + const struct ttm_kmap_iter_ops *dst_ops = dst_iter->ops; + const struct ttm_kmap_iter_ops *src_ops = src_iter->ops; + struct ttm_tt *ttm = bo->ttm; + struct dma_buf_map src_map, dst_map; + pgoff_t i; - if (mem->bus.addr) { - addr = mem->bus.addr; - } else { - size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; + /* Single TTM move. NOP */ + if (dst_ops->maps_tt && src_ops->maps_tt) + return; - if (mem->bus.caching == ttm_write_combined) - addr = ioremap_wc(mem->bus.offset, bus_size); -#ifdef CONFIG_X86 - else if (mem->bus.caching == ttm_cached) - addr = ioremap_cache(mem->bus.offset, bus_size); -#endif - else - addr = ioremap(mem->bus.offset, bus_size); - if (!addr) { - ttm_mem_io_free(bdev, mem); - return -ENOMEM; + /* Don't move nonexistent data. Clear destination instead. 
*/ + if (src_ops->maps_tt && (!ttm || !ttm_tt_is_populated(ttm))) { + if (ttm && !(ttm->page_flags & TTM_PAGE_FLAG_ZERO_ALLOC)) + return; + + for (i = 0; i < dst_mem->num_pages; ++i) { + dst_ops->map_local(dst_iter, &dst_map, i); + if (dst_map.is_iomem) + memset_io(dst_map.vaddr_iomem, 0, PAGE_SIZE); + else + memset(dst_map.vaddr, 0, PAGE_SIZE); + if (dst_ops->unmap_local) + dst_ops->unmap_local(dst_iter, &dst_map); } + return; } - *virtual = addr; - return 0; -} - -static void ttm_resource_iounmap(struct ttm_device *bdev, - struct ttm_resource *mem, - void *virtual) -{ - if (virtual && mem->bus.addr == NULL) - iounmap(virtual); - ttm_mem_io_free(bdev, mem); -} - -static int ttm_copy_io_page(void *dst, void *src, unsigned long page) -{ - uint32_t *dstP = - (uint32_t *) ((unsigned long)dst + (page << PAGE_SHIFT)); - uint32_t *srcP = - (uint32_t *) ((unsigned long)src + (page << PAGE_SHIFT)); - - int i; - for (i = 0; i < PAGE_SIZE / sizeof(uint32_t); ++i) - iowrite32(ioread32(srcP++), dstP++); - return 0; -} - -static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, - unsigned long page, - pgprot_t prot) -{ - struct page *d = ttm->pages[page]; - void *dst; - - if (!d) - return -ENOMEM; - - src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); - dst = kmap_atomic_prot(d, prot); - if (!dst) - return -ENOMEM; - - memcpy_fromio(dst, src, PAGE_SIZE); - - kunmap_atomic(dst); - - return 0; -} - -static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, - unsigned long page, - pgprot_t prot) -{ - struct page *s = ttm->pages[page]; - void *src; - - if (!s) - return -ENOMEM; - - dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); - src = kmap_atomic_prot(s, prot); - if (!src) - return -ENOMEM; - memcpy_toio(dst, src, PAGE_SIZE); - - kunmap_atomic(src); + for (i = 0; i < dst_mem->num_pages; ++i) { + dst_ops->map_local(dst_iter, &dst_map, i); + src_ops->map_local(src_iter, &src_map, i); + + if (!src_map.is_iomem && !dst_map.is_iomem) { + memcpy(dst_map.vaddr, src_map.vaddr, PAGE_SIZE); + } else if (!src_map.is_iomem) { + dma_buf_map_memcpy_to(&dst_map, src_map.vaddr, + PAGE_SIZE); + } else if (!dst_map.is_iomem) { + memcpy_fromio(dst_map.vaddr, src_map.vaddr_iomem, + PAGE_SIZE); + } else { + int j; + u32 __iomem *src = src_map.vaddr_iomem; + u32 __iomem *dst = dst_map.vaddr_iomem; - return 0; + for (j = 0; j < (PAGE_SIZE >> 2); ++j) + iowrite32(ioread32(src++), dst++); + } + if (src_ops->unmap_local) + src_ops->unmap_local(src_iter, &src_map); + if (dst_ops->unmap_local) + dst_ops->unmap_local(dst_iter, &dst_map); + } } +EXPORT_SYMBOL(ttm_move_memcpy); int ttm_bo_move_memcpy(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx, - struct ttm_resource *new_mem) + struct ttm_resource *dst_mem) { struct ttm_device *bdev = bo->bdev; - struct ttm_resource_manager *man = ttm_manager_type(bdev, new_mem->mem_type); + struct ttm_resource_manager *dst_man = + ttm_manager_type(bo->bdev, dst_mem->mem_type); struct ttm_tt *ttm = bo->ttm; - struct ttm_resource *old_mem = &bo->mem; - struct ttm_resource old_copy = *old_mem; - void *old_iomap; - void *new_iomap; + struct ttm_resource *src_mem = &bo->mem; + struct ttm_resource_manager *src_man = + ttm_manager_type(bdev, src_mem->mem_type); + struct ttm_resource src_copy = *src_mem; + union { + struct ttm_kmap_iter_tt tt; + struct ttm_kmap_iter_linear_io io; + } _dst_iter, _src_iter; + struct ttm_kmap_iter *dst_iter, *src_iter; int ret; - unsigned long i; - ret = ttm_bo_wait_ctx(bo, ctx); - if (ret) - return ret; - - ret = 
ttm_resource_ioremap(bdev, old_mem, &old_iomap); - if (ret) - return ret; - ret = ttm_resource_ioremap(bdev, new_mem, &new_iomap); - if (ret) - goto out; - - /* - * Single TTM move. NOP. - */ - if (old_iomap == NULL && new_iomap == NULL) - goto out2; - - /* - * Don't move nonexistent data. Clear destination instead. - */ - if (old_iomap == NULL && - (ttm == NULL || (!ttm_tt_is_populated(ttm) && - !(ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)))) { - memset_io(new_iomap, 0, new_mem->num_pages*PAGE_SIZE); - goto out2; - } - - /* - * TTM might be null for moves within the same region. - */ - if (ttm) { + if (ttm && ((ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) || + dst_man->use_tt)) { ret = ttm_tt_populate(bdev, ttm, ctx); if (ret) - goto out1; + return ret; } - for (i = 0; i < new_mem->num_pages; ++i) { - if (old_iomap == NULL) { - pgprot_t prot = ttm_io_prot(bo, old_mem, PAGE_KERNEL); - ret = ttm_copy_ttm_io_page(ttm, new_iomap, i, - prot); - } else if (new_iomap == NULL) { - pgprot_t prot = ttm_io_prot(bo, new_mem, PAGE_KERNEL); - ret = ttm_copy_io_ttm_page(ttm, old_iomap, i, - prot); - } else { - ret = ttm_copy_io_page(new_iomap, old_iomap, i); - } - if (ret) - goto out1; + dst_iter = ttm_kmap_iter_linear_io_init(&_dst_iter.io, bdev, dst_mem); + if (PTR_ERR(dst_iter) == -EINVAL && dst_man->use_tt) + dst_iter = ttm_kmap_iter_tt_init(&_dst_iter.tt, bo->ttm); + if (IS_ERR(dst_iter)) + return PTR_ERR(dst_iter); + + src_iter = ttm_kmap_iter_linear_io_init(&_src_iter.io, bdev, src_mem); + if (PTR_ERR(src_iter) == -EINVAL && src_man->use_tt) + src_iter = ttm_kmap_iter_tt_init(&_src_iter.tt, bo->ttm); + if (IS_ERR(src_iter)) { + ret = PTR_ERR(src_iter); + goto out_src_iter; } - mb(); -out2: - old_copy = *old_mem; - ttm_bo_assign_mem(bo, new_mem); - - if (!man->use_tt) - ttm_bo_tt_destroy(bo); + ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter); + src_copy = *src_mem; + ttm_bo_move_sync_cleanup(bo, dst_mem); -out1: - ttm_resource_iounmap(bdev, old_mem, new_iomap); -out: - ttm_resource_iounmap(bdev, &old_copy, old_iomap); + if (!src_iter->ops->maps_tt) + ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, &src_copy); +out_src_iter: + if (!dst_iter->ops->maps_tt) + ttm_kmap_iter_linear_io_fini(&_dst_iter.io, bdev, dst_mem); - /* - * On error, keep the mm node! - */ - if (!ret) - ttm_resource_free(bo, &old_copy); return ret; } EXPORT_SYMBOL(ttm_bo_move_memcpy); @@ -336,27 +272,7 @@ pgprot_t ttm_io_prot(struct ttm_buffer_object *bo, struct ttm_resource *res, man = ttm_manager_type(bo->bdev, res->mem_type); caching = man->use_tt ? 
bo->ttm->caching : res->bus.caching; - /* Cached mappings need no adjustment */ - if (caching == ttm_cached) - return tmp; - -#if defined(__i386__) || defined(__x86_64__) - if (caching == ttm_write_combined) - tmp = pgprot_writecombine(tmp); - else if (boot_cpu_data.x86 > 3) - tmp = pgprot_noncached(tmp); -#endif -#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ - defined(__powerpc__) || defined(__mips__) - if (caching == ttm_write_combined) - tmp = pgprot_writecombine(tmp); - else - tmp = pgprot_noncached(tmp); -#endif -#if defined(__sparc__) - tmp = pgprot_noncached(tmp); -#endif - return tmp; + return ttm_prot_from_caching(caching, tmp); } EXPORT_SYMBOL(ttm_io_prot); diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c index 56b0efdba1a9..997c458f68a9 100644 --- a/drivers/gpu/drm/ttm/ttm_module.c +++ b/drivers/gpu/drm/ttm/ttm_module.c @@ -31,12 +31,47 @@ */ #include <linux/module.h> #include <linux/device.h> +#include <linux/pgtable.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <drm/drm_sysfs.h> +#include <drm/ttm/ttm_caching.h> #include "ttm_module.h" +/** + * ttm_prot_from_caching - Modify the page protection according to the + * ttm cacing mode + * @caching: The ttm caching mode + * @tmp: The original page protection + * + * Return: The modified page protection + */ +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp) +{ + /* Cached mappings need no adjustment */ + if (caching == ttm_cached) + return tmp; + +#if defined(__i386__) || defined(__x86_64__) + if (caching == ttm_write_combined) + tmp = pgprot_writecombine(tmp); + else if (boot_cpu_data.x86 > 3) + tmp = pgprot_noncached(tmp); +#endif +#if defined(__ia64__) || defined(__arm__) || defined(__aarch64__) || \ + defined(__powerpc__) || defined(__mips__) + if (caching == ttm_write_combined) + tmp = pgprot_writecombine(tmp); + else + tmp = pgprot_noncached(tmp); +#endif +#if defined(__sparc__) + tmp = pgprot_noncached(tmp); +#endif + return tmp; +} + struct dentry *ttm_debugfs_root; static int __init ttm_init(void) diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 59e2b7157e41..e05ae7e3d477 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -22,6 +22,10 @@ * Authors: Christian König */ +#include <linux/dma-buf-map.h> +#include <linux/io-mapping.h> +#include <linux/scatterlist.h> + #include <drm/ttm/ttm_resource.h> #include <drm/ttm/ttm_bo_driver.h> @@ -147,3 +151,165 @@ void ttm_resource_manager_debug(struct ttm_resource_manager *man, man->func->debug(man, p); } EXPORT_SYMBOL(ttm_resource_manager_debug); + +static void ttm_kmap_iter_iomap_map_local(struct ttm_kmap_iter *iter, + struct dma_buf_map *dmap, + pgoff_t i) +{ + struct ttm_kmap_iter_iomap *iter_io = + container_of(iter, typeof(*iter_io), base); + void __iomem *addr; + +retry: + while (i >= iter_io->cache.end) { + iter_io->cache.sg = iter_io->cache.sg ? 
+ sg_next(iter_io->cache.sg) : iter_io->st->sgl; + iter_io->cache.i = iter_io->cache.end; + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >> + PAGE_SHIFT; + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) - + iter_io->start; + } + + if (i < iter_io->cache.i) { + iter_io->cache.end = 0; + iter_io->cache.sg = NULL; + goto retry; + } + + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs + + (((resource_size_t)i - iter_io->cache.i) + << PAGE_SHIFT)); + dma_buf_map_set_vaddr_iomem(dmap, addr); +} + +static void ttm_kmap_iter_iomap_unmap_local(struct ttm_kmap_iter *iter, + struct dma_buf_map *map) +{ + io_mapping_unmap_local(map->vaddr_iomem); +} + +static const struct ttm_kmap_iter_ops ttm_kmap_iter_io_ops = { + .map_local = ttm_kmap_iter_iomap_map_local, + .unmap_local = ttm_kmap_iter_iomap_unmap_local, + .maps_tt = false, +}; + +/** + * ttm_kmap_iter_iomap_init - Initialize a struct ttm_kmap_iter_iomap + * @iter_io: The struct ttm_kmap_iter_iomap to initialize. + * @iomap: The struct io_mapping representing the underlying linear io_memory. + * @st: sg_table into @iomap, representing the memory of the struct + * ttm_resource. + * @start: Offset that needs to be subtracted from @st to make + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. + * + * Return: Pointer to the embedded struct ttm_kmap_iter. + */ +struct ttm_kmap_iter * +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, + struct io_mapping *iomap, + struct sg_table *st, + resource_size_t start) +{ + iter_io->base.ops = &ttm_kmap_iter_io_ops; + iter_io->iomap = iomap; + iter_io->st = st; + iter_io->start = start; + memset(&iter_io->cache, 0, sizeof(iter_io->cache)); + + return &iter_io->base; +} +EXPORT_SYMBOL(ttm_kmap_iter_iomap_init); + +/** + * DOC: Linear io iterator + * + * This code should die in the not too near future. Best would be if we could + * make io-mapping use memremap for all io memory, and have memremap + * implement a kmap_local functionality. We could then strip a huge amount of + * code. These linear io iterators are implemented to mimic old functionality, + * and they don't use kmap_local semantics at all internally. Rather ioremap or + * friends, and at least on 32-bit they add global TLB flushes and points + * of failure. 
+ */ + +static void ttm_kmap_iter_linear_io_map_local(struct ttm_kmap_iter *iter, + struct dma_buf_map *dmap, + pgoff_t i) +{ + struct ttm_kmap_iter_linear_io *iter_io = + container_of(iter, typeof(*iter_io), base); + + *dmap = iter_io->dmap; + dma_buf_map_incr(dmap, i * PAGE_SIZE); +} + +static const struct ttm_kmap_iter_ops ttm_kmap_iter_linear_io_ops = { + .map_local = ttm_kmap_iter_linear_io_map_local, + .maps_tt = false, +}; + +struct ttm_kmap_iter * +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io *iter_io, + struct ttm_device *bdev, + struct ttm_resource *mem) +{ + int ret; + + ret = ttm_mem_io_reserve(bdev, mem); + if (ret) + goto out_err; + if (!mem->bus.is_iomem) { + ret = -EINVAL; + goto out_io_free; + } + + if (mem->bus.addr) { + dma_buf_map_set_vaddr(&iter_io->dmap, mem->bus.addr); + iter_io->needs_unmap = false; + } else { + size_t bus_size = (size_t)mem->num_pages << PAGE_SHIFT; + + iter_io->needs_unmap = true; + if (mem->bus.caching == ttm_write_combined) + dma_buf_map_set_vaddr_iomem(&iter_io->dmap, + ioremap_wc(mem->bus.offset, + bus_size)); + else if (mem->bus.caching == ttm_cached) + dma_buf_map_set_vaddr(&iter_io->dmap, + memremap(mem->bus.offset, bus_size, + MEMREMAP_WB)); + else + dma_buf_map_set_vaddr_iomem(&iter_io->dmap, + ioremap(mem->bus.offset, + bus_size)); + if (dma_buf_map_is_null(&iter_io->dmap)) { + ret = -ENOMEM; + goto out_io_free; + } + } + + iter_io->base.ops = &ttm_kmap_iter_linear_io_ops; + return &iter_io->base; + +out_io_free: + ttm_mem_io_free(bdev, mem); +out_err: + return ERR_PTR(ret); +} + +void +ttm_kmap_iter_linear_io_fini(struct ttm_kmap_iter_linear_io *iter_io, + struct ttm_device *bdev, + struct ttm_resource *mem) +{ + if (iter_io->needs_unmap && dma_buf_map_is_set(&iter_io->dmap)) { + if (iter_io->dmap.is_iomem) + iounmap(iter_io->dmap.vaddr_iomem); + else + memunmap(iter_io->dmap.vaddr); + } + + ttm_mem_io_free(bdev, mem); +} diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 539e0232cb3b..0e41227116b1 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -433,3 +433,45 @@ void ttm_tt_mgr_init(unsigned long num_pages, unsigned long num_dma32_pages) if (!ttm_dma32_pages_limit) ttm_dma32_pages_limit = num_dma32_pages; } + +static void ttm_kmap_iter_tt_map_local(struct ttm_kmap_iter *iter, + struct dma_buf_map *dmap, + pgoff_t i) +{ + struct ttm_kmap_iter_tt *iter_tt = + container_of(iter, typeof(*iter_tt), base); + + dma_buf_map_set_vaddr(dmap, kmap_local_page_prot(iter_tt->tt->pages[i], + iter_tt->prot)); +} + +static void ttm_kmap_iter_tt_unmap_local(struct ttm_kmap_iter *iter, + struct dma_buf_map *map) +{ + kunmap_local(map->vaddr); +} + +static const struct ttm_kmap_iter_ops ttm_kmap_iter_tt_ops = { + .map_local = ttm_kmap_iter_tt_map_local, + .unmap_local = ttm_kmap_iter_tt_unmap_local, + .maps_tt = true, +}; + +/** + * ttm_kmap_iter_tt_init - Initialize a struct ttm_kmap_iter_tt + * @iter_tt: The struct ttm_kmap_iter_tt to initialize. + * @tt: Struct ttm_tt holding page pointers of the struct ttm_resource. + * + * Return: Pointer to the embedded struct ttm_kmap_iter. 
+ */ +struct ttm_kmap_iter * +ttm_kmap_iter_tt_init(struct ttm_kmap_iter_tt *iter_tt, + struct ttm_tt *tt) +{ + iter_tt->base.ops = &ttm_kmap_iter_tt_ops; + iter_tt->tt = tt; + iter_tt->prot = ttm_prot_from_caching(tt->caching, PAGE_KERNEL); + + return &iter_tt->base; +} +EXPORT_SYMBOL(ttm_kmap_iter_tt_init); diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index dbccac957f8f..61fa648ee9e4 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -40,6 +40,7 @@ #include <drm/ttm/ttm_device.h> #include "ttm_bo_api.h" +#include "ttm_kmap_iter.h" #include "ttm_placement.h" #include "ttm_tt.h" #include "ttm_pool.h" @@ -272,6 +273,23 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, bool pipeline, struct ttm_resource *new_mem); +/** + * ttm_bo_move_accel_cleanup. + * + * @bo: A pointer to a struct ttm_buffer_object. + * @new_mem: struct ttm_resource indicating where to move. + * + * Special case of ttm_bo_move_accel_cleanup where the bo is guaranteed + * by the caller to be idle. Typically used after memcpy buffer moves. + */ +static inline void ttm_bo_move_sync_cleanup(struct ttm_buffer_object *bo, + struct ttm_resource *new_mem) +{ + int ret = ttm_bo_move_accel_cleanup(bo, NULL, true, false, new_mem); + + WARN_ON(ret); +} + /** * ttm_bo_pipeline_gutting. * @@ -332,4 +350,14 @@ int ttm_range_man_init(struct ttm_device *bdev, int ttm_range_man_fini(struct ttm_device *bdev, unsigned type); +void ttm_move_memcpy(struct ttm_buffer_object *bo, + struct ttm_resource *new_mem, + struct ttm_kmap_iter *dst_iter, + struct ttm_kmap_iter *src_iter); + +struct ttm_kmap_iter * +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, + struct io_mapping *iomap, + struct sg_table *st, + resource_size_t start); #endif diff --git a/include/drm/ttm/ttm_caching.h b/include/drm/ttm/ttm_caching.h index a0b4a49fa432..3c9dd65f5aaf 100644 --- a/include/drm/ttm/ttm_caching.h +++ b/include/drm/ttm/ttm_caching.h @@ -33,4 +33,6 @@ enum ttm_caching { ttm_cached }; +pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp); + #endif diff --git a/include/drm/ttm/ttm_kmap_iter.h b/include/drm/ttm/ttm_kmap_iter.h new file mode 100644 index 000000000000..8bb00fd39d6c --- /dev/null +++ b/include/drm/ttm/ttm_kmap_iter.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ +#ifndef __TTM_KMAP_ITER_H__ +#define __TTM_KMAP_ITER_H__ + +#include <linux/types.h> + +struct ttm_kmap_iter; +struct dma_buf_map; + +/** + * struct ttm_kmap_iter_ops - Ops structure for a struct + * ttm_kmap_iter. + * @maps_tt: Whether the iterator maps TT memory directly, as opposed + * mapping a TT through an aperture. Both these modes have + * struct ttm_resource_manager::use_tt set, but the latter typically + * returns is_iomem == true from ttm_mem_io_reserve. + */ +struct ttm_kmap_iter_ops { + /** + * kmap_local() - Map a PAGE_SIZE part of the resource using + * kmap_local semantics. + * @res_iter: Pointer to the struct ttm_kmap_iter representing + * the resource. + * @dmap: The struct dma_buf_map holding the virtual address after + * the operation. + * @i: The location within the resource to map. PAGE_SIZE granularity. + */ + void (*map_local)(struct ttm_kmap_iter *res_iter, + struct dma_buf_map *dmap, pgoff_t i); + /** + * unmap_local() - Unmap a PAGE_SIZE part of the resource previously + * mapped using kmap_local. + * @res_iter: Pointer to the struct ttm_kmap_iter representing + * the resource. 
+ * @dmap: The struct dma_buf_map holding the virtual address after + * the operation. + */ + void (*unmap_local)(struct ttm_kmap_iter *res_iter, + struct dma_buf_map *dmap); + bool maps_tt; +}; + +/** + * struct ttm_kmap_iter - Iterator for kmap_local type operations on a + * resource. + * @ops: Pointer to the operations struct. + * + * This struct is intended to be embedded in a resource-specific specialization + * implementing operations for the resource. + * + * Nothing stops us from extending the operations to vmap, vmap_pfn etc, + * replacing some or parts of the ttm_bo_util. cpu-map functionality. + */ +struct ttm_kmap_iter { + const struct ttm_kmap_iter_ops *ops; +}; + +#endif /* __TTM_KMAP_ITER_H__ */ diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index 890b9d369519..b8dc0bdb0da5 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -27,9 +27,11 @@ #include <linux/types.h> #include <linux/mutex.h> +#include <linux/dma-buf-map.h> #include <linux/dma-fence.h> #include <drm/drm_print.h> #include <drm/ttm/ttm_caching.h> +#include <drm/ttm/ttm_kmap_iter.h> #define TTM_MAX_BO_PRIORITY 4U @@ -38,6 +40,10 @@ struct ttm_resource_manager; struct ttm_resource; struct ttm_place; struct ttm_buffer_object; +struct dma_buf_map; +struct io_mapping; +struct sg_table; +struct scatterlist; struct ttm_resource_manager_func { /** @@ -176,6 +182,45 @@ struct ttm_resource { struct ttm_bus_placement bus; }; +/** + * struct ttm_kmap_iter_iomap - Specialization for a struct io_mapping + + * struct sg_table backed struct ttm_resource. + * @base: Embedded struct ttm_kmap_iter providing the usage interface. + * @iomap: struct io_mapping representing the underlying linear io_memory. + * @st: sg_table into @iomap, representing the memory of the struct ttm_resource. + * @start: Offset that needs to be subtracted from @st to make + * sg_dma_address(st->sgl) - @start == 0 for @iomap start. + * @cache: Scatterlist traversal cache for fast lookups. + * @cache.sg: Pointer to the currently cached scatterlist segment. + * @cache.i: First index of @sg. PAGE_SIZE granularity. + * @cache.end: Last index + 1 of @sg. PAGE_SIZE granularity. + * @cache.offs: First offset into @iomap of @sg. PAGE_SIZE granularity. 
+ */ +struct ttm_kmap_iter_iomap { + struct ttm_kmap_iter base; + struct io_mapping *iomap; + struct sg_table *st; + resource_size_t start; + struct { + struct scatterlist *sg; + pgoff_t i; + pgoff_t end; + pgoff_t offs; + } cache; +}; + +/** + * struct ttm_kmap_iter_linear_io - Iterator specialization for linear io + * @base: The base iterator + * @dmap: Points to the starting address of the region + * @needs_unmap: Whether we need to unmap on fini + */ +struct ttm_kmap_iter_linear_io { + struct ttm_kmap_iter base; + struct dma_buf_map dmap; + bool needs_unmap; +}; + /** * ttm_resource_manager_set_used * @@ -237,4 +282,20 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev, void ttm_resource_manager_debug(struct ttm_resource_manager *man, struct drm_printer *p); +struct ttm_kmap_iter * +ttm_kmap_iter_iomap_init(struct ttm_kmap_iter_iomap *iter_io, + struct io_mapping *iomap, + struct sg_table *st, + resource_size_t start); + +struct ttm_kmap_iter_linear_io; + +struct ttm_kmap_iter * +ttm_kmap_iter_linear_io_init(struct ttm_kmap_iter_linear_io *iter_io, + struct ttm_device *bdev, + struct ttm_resource *mem); + +void ttm_kmap_iter_linear_io_fini(struct ttm_kmap_iter_linear_io *iter_io, + struct ttm_device *bdev, + struct ttm_resource *mem); #endif diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h index 134d09ef7766..3102059db726 100644 --- a/include/drm/ttm/ttm_tt.h +++ b/include/drm/ttm/ttm_tt.h @@ -29,6 +29,7 @@ #include <linux/types.h> #include <drm/ttm/ttm_caching.h> +#include <drm/ttm/ttm_kmap_iter.h> struct ttm_bo_device; struct ttm_tt; @@ -69,6 +70,18 @@ struct ttm_tt { enum ttm_caching caching; }; +/** + * struct ttm_kmap_iter_tt - Specialization of a mappig iterator for a tt. + * @base: Embedded struct ttm_kmap_iter providing the usage interface + * @tt: Cached struct ttm_tt. + * @prot: Cached page protection for mapping. + */ +struct ttm_kmap_iter_tt { + struct ttm_kmap_iter base; + struct ttm_tt *tt; + pgprot_t prot; +}; + static inline bool ttm_tt_is_populated(struct ttm_tt *tt) { return tt->page_flags & TTM_PAGE_FLAG_PRIV_POPULATED; @@ -159,6 +172,9 @@ void ttm_tt_unpopulate(struct ttm_device *bdev, struct ttm_tt *ttm); void ttm_tt_mgr_init(unsigned long num_pages, unsigned long num_dma32_pages); +struct ttm_kmap_iter *ttm_kmap_iter_tt_init(struct ttm_kmap_iter_tt *iter_tt, + struct ttm_tt *tt); + #if IS_ENABLED(CONFIG_AGP) #include <linux/agp_backend.h>
The internal ttm_bo_util memcpy uses ioremap functionality, and while it probably might be possible to use it for copying in- and out of sglist represented io memory, using io_mem_reserve() / io_mem_free() callbacks, that would cause problems with fault(). Instead, implement a method mapping page-by-page using kmap_local() semantics. As an additional benefit we then avoid the occasional global TLB flushes of ioremap() and consuming ioremap space, elimination of a critical point of failure and with a slight change of semantics we could also push the memcpy out async for testing and async driver development purposes. A special linear iomem iterator is introduced internally to mimic the old ioremap behaviour for code-paths that can't immediately be ported over. This adds to the code size and should be considered a temporary solution. Looking at the code we have a lot of checks for iomap tagged pointers. Ideally we should extend the core memremap functions to also accept uncached memory and kmap_local functionality. Then we could strip a lot of code. Cc: Christian König <christian.koenig@amd.com> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> --- v3: - Split up in various TTM files and addressed review comments by Christian König. Tested and fixed legacy iomap memcpy path on i915. --- drivers/gpu/drm/ttm/ttm_bo_util.c | 278 ++++++++++------------------- drivers/gpu/drm/ttm/ttm_module.c | 35 ++++ drivers/gpu/drm/ttm/ttm_resource.c | 166 +++++++++++++++++ drivers/gpu/drm/ttm/ttm_tt.c | 42 +++++ include/drm/ttm/ttm_bo_driver.h | 28 +++ include/drm/ttm/ttm_caching.h | 2 + include/drm/ttm/ttm_kmap_iter.h | 61 +++++++ include/drm/ttm/ttm_resource.h | 61 +++++++ include/drm/ttm/ttm_tt.h | 16 ++ 9 files changed, 508 insertions(+), 181 deletions(-) create mode 100644 include/drm/ttm/ttm_kmap_iter.h
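As a rough usage illustration of the new iterators: a driver whose VRAM is represented as sg-list io memory might wire up a system-to-VRAM copy roughly as below. The driver-side function name, the io_mapping and the sg_table setup are hypothetical, and the ttm_tt is assumed to be already populated and the io space reserved; only the ttm_kmap_iter_*_init() and ttm_move_memcpy() calls come from this patch.

	/* Hypothetical driver-side sketch; only the ttm_* helpers are from this patch. */
	static void my_drv_copy_sysmem_to_vram(struct ttm_buffer_object *bo,
					       struct ttm_resource *dst_mem,
					       struct io_mapping *vram_iomap,
					       struct sg_table *dst_st,
					       resource_size_t vram_start)
	{
		struct ttm_kmap_iter_iomap dst_io;
		struct ttm_kmap_iter_tt src_tt;
		struct ttm_kmap_iter *dst_iter, *src_iter;

		/* Destination: VRAM pages described by an sg_table into the io_mapping. */
		dst_iter = ttm_kmap_iter_iomap_init(&dst_io, vram_iomap, dst_st, vram_start);
		/* Source: system memory pages held by the bo's populated ttm_tt. */
		src_iter = ttm_kmap_iter_tt_init(&src_tt, bo->ttm);

		/* Page-by-page copy using kmap_local() semantics, no ioremap of the whole BO. */
		ttm_move_memcpy(bo, dst_mem, dst_iter, src_iter);
	}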