| Message ID | 1404324218-4743-6-git-send-email-lauraa@codeaurora.org (mailing list archive) |
|---|---|
| State | New, archived |
On Wed, Jul 02, 2014 at 11:03:38AM -0700, Laura Abbott wrote: [...] > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c [...] > +static struct gen_pool *atomic_pool; > + > +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K > +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; There doesn't seem to be much use for this since it can't be overridden via init_dma_coherent_pool_size like on ARM. > +static int __free_from_pool(void *start, size_t size) > +{ > + if (!__in_atomic_pool(start, size)) > + return 0; > + > + gen_pool_free(atomic_pool, (unsigned long)start, size); > + > + return 1; > +} > + > + There's a gratuitous blank line here. > static void *__dma_alloc_coherent(struct device *dev, size_t size, > dma_addr_t *dma_handle, gfp_t flags, > struct dma_attrs *attrs) > @@ -53,7 +103,8 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, > if (IS_ENABLED(CONFIG_ZONE_DMA) && > dev->coherent_dma_mask <= DMA_BIT_MASK(32)) > flags |= GFP_DMA; > - if (IS_ENABLED(CONFIG_DMA_CMA)) { > + > + if (!(flags & __GFP_WAIT) && IS_ENABLED(CONFIG_DMA_CMA)) { I think the diff would be more readable here if this wasn't introducing a blank line and kept the IS_ENABLED() first. > struct page *page; > > size = PAGE_ALIGN(size); > @@ -73,50 +124,56 @@ static void __dma_free_coherent(struct device *dev, size_t size, > void *vaddr, dma_addr_t dma_handle, > struct dma_attrs *attrs) > { > + bool freed; > + phys_addr_t paddr = dma_to_phys(dev, dma_handle); > + > if (dev == NULL) { > WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); > return; > } > > - if (IS_ENABLED(CONFIG_DMA_CMA)) { > - phys_addr_t paddr = dma_to_phys(dev, dma_handle); > > - dma_release_from_contiguous(dev, The above leaves an unnecessary blank line in place. > ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs); > if (!ptr) > goto no_mem; > - map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA); > - if (!map) > - goto no_map; > > /* remove any dirty cache lines on the kernel alias */ > __dma_flush_range(ptr, ptr + size); > > + Adds an unnecessary blank line. > @@ -332,6 +391,67 @@ static struct notifier_block amba_bus_nb = { > > extern int swiotlb_late_init_with_default_size(size_t default_size); > > +static int __init atomic_pool_init(void) > +{ > + pgprot_t prot = __pgprot(PROT_NORMAL_NC); > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > + struct page *page; > + void *addr; > + > + There's another gratuitous blank line here... > + if (dev_get_cma_area(NULL)) > + page = dma_alloc_from_contiguous(NULL, nr_pages, > + get_order(atomic_pool_size)); > + else > + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > + > + and here. > + if (page) { > + int ret; > + > + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); > + if (!atomic_pool) > + goto free_page; > + > + addr = dma_common_contiguous_remap(page, atomic_pool_size, > + VM_USERMAP, prot, atomic_pool_init); > + > + if (!addr) > + goto destroy_genpool; > + > + memset(addr, 0, atomic_pool_size); > + __dma_flush_range(addr, addr + atomic_pool_size); > + > + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, > + page_to_phys(page), > + atomic_pool_size, -1); > + if (ret) > + goto remove_mapping; > + > + gen_pool_set_algo(atomic_pool, > + gen_pool_first_fit_order_align, NULL); > + > + pr_info("DMA: preallocated %zd KiB pool for atomic allocations\n", I think this should be "%zu" because atomic_pool_size is a size_t, not a ssize_t.
> + atomic_pool_size / 1024); > + return 0; > + } > + goto out; > + > +remove_mapping: > + dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); > +destroy_genpool: > + gen_pool_destroy(atomic_pool); > + atomic_pool == NULL; This probably doesn't belong here. > +free_page: > + if (!dma_release_from_contiguous(NULL, page, nr_pages)) > + __free_pages(page, get_order(atomic_pool_size)); You use get_order(atomic_pool_size) a lot, perhaps it should be a temporary variable? > +out: > + pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n", > + atomic_pool_size / 1024); Print in decimal rather than hexadecimal? Thierry
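For concreteness, here is a sketch (not the posted patch, and with the middle of the function elided) of how the init/error path might look with Thierry's comments applied: a single `pool_order` temporary (an illustrative name) replacing the repeated get_order(atomic_pool_size), "%zu" for the size_t, decimal sizes in both messages, and a real assignment instead of the stray "==".

```c
static int __init atomic_pool_init(void)
{
	unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
	unsigned int pool_order = get_order(atomic_pool_size);
	struct page *page;
	void *addr;

	if (dev_get_cma_area(NULL))
		page = dma_alloc_from_contiguous(NULL, nr_pages, pool_order);
	else
		page = alloc_pages(GFP_KERNEL, pool_order);
	if (!page)
		goto out;

	/*
	 * ... remap, memset/flush and gen_pool_add_virt() as in the patch;
	 * the gotos that reach remove_mapping/destroy_genpool live here ...
	 */

	pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
		atomic_pool_size / 1024);
	return 0;

remove_mapping:
	dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
destroy_genpool:
	gen_pool_destroy(atomic_pool);
	atomic_pool = NULL;		/* assignment, not the stray "==" comparison */
free_page:
	if (!dma_release_from_contiguous(NULL, page, nr_pages))
		__free_pages(page, pool_order);
out:
	pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
	       atomic_pool_size / 1024);
	return -ENOMEM;
}
```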
On Wed, Jul 02, 2014 at 07:03:38PM +0100, Laura Abbott wrote: > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c > index 4164c5a..a2487f1 100644 > --- a/arch/arm64/mm/dma-mapping.c > +++ b/arch/arm64/mm/dma-mapping.c [...] > static void *__dma_alloc_coherent(struct device *dev, size_t size, > dma_addr_t *dma_handle, gfp_t flags, > struct dma_attrs *attrs) > @@ -53,7 +103,8 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, > if (IS_ENABLED(CONFIG_ZONE_DMA) && > dev->coherent_dma_mask <= DMA_BIT_MASK(32)) > flags |= GFP_DMA; > - if (IS_ENABLED(CONFIG_DMA_CMA)) { > + > + if (!(flags & __GFP_WAIT) && IS_ENABLED(CONFIG_DMA_CMA)) { > struct page *page; > > size = PAGE_ALIGN(size); I think that's the wrong condition here. You want to use CMA if (flags & __GFP_WAIT). CMA does not support atomic allocations so it can fall back to swiotlb_alloc_coherent(). > @@ -73,50 +124,56 @@ static void __dma_free_coherent(struct device *dev, size_t size, > void *vaddr, dma_addr_t dma_handle, > struct dma_attrs *attrs) > { > + bool freed; > + phys_addr_t paddr = dma_to_phys(dev, dma_handle); > + > if (dev == NULL) { > WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); > return; > } > > - if (IS_ENABLED(CONFIG_DMA_CMA)) { > - phys_addr_t paddr = dma_to_phys(dev, dma_handle); > > - dma_release_from_contiguous(dev, > + freed = dma_release_from_contiguous(dev, > phys_to_page(paddr), > size >> PAGE_SHIFT); > - } else { > + if (!freed) > swiotlb_free_coherent(dev, size, vaddr, dma_handle); > - } > } Is __dma_free_coherent() ever called in atomic context? If yes, the dma_release_from_contiguous() may not like it since it tries to acquire a mutex. But since we don't have the gfp flags here, we don't have an easy way to know what to call. So the initial idea of always calling __alloc_from_pool() for both coherent/non-coherent cases would work better (but still with a single shared pool, see below). > static void *__dma_alloc_noncoherent(struct device *dev, size_t size, > dma_addr_t *dma_handle, gfp_t flags, > struct dma_attrs *attrs) > { > - struct page *page, **map; > + struct page *page; > void *ptr, *coherent_ptr; > - int order, i; > > size = PAGE_ALIGN(size); > - order = get_order(size); > + > + if (!(flags & __GFP_WAIT)) { > + struct page *page = NULL; > + void *addr = __alloc_from_pool(size, &page); > + > + if (addr) > + *dma_handle = phys_to_dma(dev, page_to_phys(page)); > + > + return addr; > + > + } If we do the above for the __dma_alloc_coherent() case, we could use the same pool but instead of returning addr it could just return page_address(page). The downside of sharing the pool is that you need cache flushing for every allocation (which we already do for the non-atomic case). > @@ -332,6 +391,67 @@ static struct notifier_block amba_bus_nb = { > > extern int swiotlb_late_init_with_default_size(size_t default_size); > > +static int __init atomic_pool_init(void) > +{ > + pgprot_t prot = __pgprot(PROT_NORMAL_NC); > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > + struct page *page; > + void *addr; > + > + > + if (dev_get_cma_area(NULL)) Is it worth using this condition for other places where we check IS_ENABLED(CONFIG_DMA_CMA) (maybe as a separate patch). 
> + page = dma_alloc_from_contiguous(NULL, nr_pages, > + get_order(atomic_pool_size)); > + else > + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); One problem here is that the atomic pool wouldn't be able to honour GFP_DMA (in the latest kernel, CMA is by default in ZONE_DMA). You should probably pass GFP_KERNEL|GFP_DMA here. You could also use the swiotlb_alloc_coherent() which, with a NULL dev, assumes 32-bit DMA mask but it still expects GFP_DMA to be passed. > + if (page) { > + int ret; > + > + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); > + if (!atomic_pool) > + goto free_page; > + > + addr = dma_common_contiguous_remap(page, atomic_pool_size, > + VM_USERMAP, prot, atomic_pool_init); > + > + if (!addr) > + goto destroy_genpool; > + > + memset(addr, 0, atomic_pool_size); > + __dma_flush_range(addr, addr + atomic_pool_size); If you add the flushing in the __dma_alloc_noncoherent(), it won't be needed here (of course, more efficient here but it would not work if we share the pool). > +postcore_initcall(atomic_pool_init); Why not arch_initcall? Or even better, we could have a common DMA init function that calls swiotlb_late_init() and atomic_pool_init() (in this order if you decide to use swiotlb allocation above).
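A sketch of the two changes Catalin asks for here, assuming the structure of the posted patch (illustrative, not code from the thread): take the CMA path only when the caller may block, and allocate the pool pages with GFP_DMA so they land in ZONE_DMA.

```c
	/* __dma_alloc_coherent(): use CMA only for blocking requests; atomic
	 * ones fall through to swiotlb_alloc_coherent(). */
	if ((flags & __GFP_WAIT) && IS_ENABLED(CONFIG_DMA_CMA)) {
		struct page *page;

		size = PAGE_ALIGN(size);
		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
						 get_order(size));
		if (!page)
			return NULL;

		*dma_handle = phys_to_dma(dev, page_to_phys(page));
		return page_address(page);
	}
	return swiotlb_alloc_coherent(dev, size, dma_handle, flags);

	/* atomic_pool_init(): honour ZONE_DMA so devices with 32-bit masks can
	 * use the pool (CMA already defaults to ZONE_DMA on arm64). */
	page = alloc_pages(GFP_KERNEL | GFP_DMA, get_order(atomic_pool_size));
```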
On 7/4/2014 6:35 AM, Thierry Reding wrote: > On Wed, Jul 02, 2014 at 11:03:38AM -0700, Laura Abbott wrote: > [...] >> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c > [...] >> +static struct gen_pool *atomic_pool; >> + >> +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K >> +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; > > There doesn't seem to be much use for this since it can't be overridden > via init_dma_coherent_pool_size like on ARM. > There is still the command line option coherent_pool=<size> though [...] >> + if (page) { >> + int ret; >> + >> + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); >> + if (!atomic_pool) >> + goto free_page; >> + >> + addr = dma_common_contiguous_remap(page, atomic_pool_size, >> + VM_USERMAP, prot, atomic_pool_init); >> + >> + if (!addr) >> + goto destroy_genpool; >> + >> + memset(addr, 0, atomic_pool_size); >> + __dma_flush_range(addr, addr + atomic_pool_size); >> + >> + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, >> + page_to_phys(page), >> + atomic_pool_size, -1); >> + if (ret) >> + goto remove_mapping; >> + >> + gen_pool_set_algo(atomic_pool, >> + gen_pool_first_fit_order_align, NULL); >> + >> + pr_info("DMA: preallocated %zd KiB pool for atomic allocations\n", > > I think this should be "%zu" because atomic_pool_size is a size_t, not a > ssize_t. > Yes, will fix. >> + atomic_pool_size / 1024); >> + return 0; >> + } >> + goto out; >> + >> +remove_mapping: >> + dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); >> +destroy_genpool: >> + gen_pool_destroy(atomic_pool); >> + atomic_pool == NULL; > > This probably doesn't belong here. > Dastardly typo. >> +free_page: >> + if (!dma_release_from_contiguous(NULL, page, nr_pages)) >> + __free_pages(page, get_order(atomic_pool_size)); > > You use get_order(atomic_pool_size) a lot, perhaps it should be a > temporary variable? > Yes, three usages is probably enough. >> +out: >> + pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n", >> + atomic_pool_size / 1024); > > Print in decimal rather than hexadecimal? > I actually prefer hexadecimal but I should at least be consistent between error and non-error paths. > Thierry > Thanks, Laura
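For reference, the boot-time override Laura mentions is the early_param() hook already present in the patch; passing e.g. coherent_pool=1M on the kernel command line resizes the pool through memparse():

```c
static int __init early_coherent_pool(char *p)
{
	/* memparse() understands K/M/G suffixes, so "coherent_pool=1M"
	 * sets atomic_pool_size to 0x100000 */
	atomic_pool_size = memparse(p, &p);
	return 0;
}
early_param("coherent_pool", early_coherent_pool);
```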
On 7/18/2014 6:43 AM, Catalin Marinas wrote: > On Wed, Jul 02, 2014 at 07:03:38PM +0100, Laura Abbott wrote: >> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c >> index 4164c5a..a2487f1 100644 >> --- a/arch/arm64/mm/dma-mapping.c >> +++ b/arch/arm64/mm/dma-mapping.c > [...] >> static void *__dma_alloc_coherent(struct device *dev, size_t size, >> dma_addr_t *dma_handle, gfp_t flags, >> struct dma_attrs *attrs) >> @@ -53,7 +103,8 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, >> if (IS_ENABLED(CONFIG_ZONE_DMA) && >> dev->coherent_dma_mask <= DMA_BIT_MASK(32)) >> flags |= GFP_DMA; >> - if (IS_ENABLED(CONFIG_DMA_CMA)) { >> + >> + if (!(flags & __GFP_WAIT) && IS_ENABLED(CONFIG_DMA_CMA)) { >> struct page *page; >> >> size = PAGE_ALIGN(size); > > I think that's the wrong condition here. You want to use CMA if > (flags & __GFP_WAIT). CMA does not support atomic allocations so it can > fall back to swiotlb_alloc_coherent(). > >> @@ -73,50 +124,56 @@ static void __dma_free_coherent(struct device *dev, size_t size, >> void *vaddr, dma_addr_t dma_handle, >> struct dma_attrs *attrs) >> { >> + bool freed; >> + phys_addr_t paddr = dma_to_phys(dev, dma_handle); >> + >> if (dev == NULL) { >> WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); >> return; >> } >> >> - if (IS_ENABLED(CONFIG_DMA_CMA)) { >> - phys_addr_t paddr = dma_to_phys(dev, dma_handle); >> >> - dma_release_from_contiguous(dev, >> + freed = dma_release_from_contiguous(dev, >> phys_to_page(paddr), >> size >> PAGE_SHIFT); >> - } else { >> + if (!freed) >> swiotlb_free_coherent(dev, size, vaddr, dma_handle); >> - } >> } > > Is __dma_free_coherent() ever called in atomic context? If yes, the > dma_release_from_contiguous() may not like it since it tries to acquire > a mutex. But since we don't have the gfp flags here, we don't have an > easy way to know what to call. > > So the initial idea of always calling __alloc_from_pool() for both > coherent/non-coherent cases would work better (but still with a single > shared pool, see below). > We should be okay __dma_free_coherent -> dma_release_from_contiguous -> cma_release which bounds checks the CMA region before taking any mutexes unless I missed something. The existing behavior on arm is to not allow non-atomic allocations to be freed atomic context when CMA is enabled so we'd be giving arm64 more leeway there. Is being able to free non-atomic allocations in atomic context really necessary? >> static void *__dma_alloc_noncoherent(struct device *dev, size_t size, >> dma_addr_t *dma_handle, gfp_t flags, >> struct dma_attrs *attrs) >> { >> - struct page *page, **map; >> + struct page *page; >> void *ptr, *coherent_ptr; >> - int order, i; >> >> size = PAGE_ALIGN(size); >> - order = get_order(size); >> + >> + if (!(flags & __GFP_WAIT)) { >> + struct page *page = NULL; >> + void *addr = __alloc_from_pool(size, &page); >> + >> + if (addr) >> + *dma_handle = phys_to_dma(dev, page_to_phys(page)); >> + >> + return addr; >> + >> + } > > If we do the above for the __dma_alloc_coherent() case, we could use the > same pool but instead of returning addr it could just return > page_address(page). The downside of sharing the pool is that you need > cache flushing for every allocation (which we already do for the > non-atomic case). 
> >> @@ -332,6 +391,67 @@ static struct notifier_block amba_bus_nb = { >> >> extern int swiotlb_late_init_with_default_size(size_t default_size); >> >> +static int __init atomic_pool_init(void) >> +{ >> + pgprot_t prot = __pgprot(PROT_NORMAL_NC); >> + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; >> + struct page *page; >> + void *addr; >> + >> + >> + if (dev_get_cma_area(NULL)) > > Is it worth using this condition for other places where we check > IS_ENABLED(CONFIG_DMA_CMA) (maybe as a separate patch). > Yes, it would be good to match arm in that respect. >> + page = dma_alloc_from_contiguous(NULL, nr_pages, >> + get_order(atomic_pool_size)); >> + else >> + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > > One problem here is that the atomic pool wouldn't be able to honour > GFP_DMA (in the latest kernel, CMA is by default in ZONE_DMA). You > should probably pass GFP_KERNEL|GFP_DMA here. You could also use the > swiotlb_alloc_coherent() which, with a NULL dev, assumes 32-bit DMA mask > but it still expects GFP_DMA to be passed. > I think I missed updating this to GFP_DMA. The only advantage I would see to using swiotlb_alloc_coherent vs. alloc_pages directly would be to allow the fallback to using a bounce buffer if __get_free_pages failed. I'll keep this as alloc_pages for now; it can be changed later if there is a particular need for swiotlb behavior. >> + if (page) { >> + int ret; >> + >> + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); >> + if (!atomic_pool) >> + goto free_page; >> + >> + addr = dma_common_contiguous_remap(page, atomic_pool_size, >> + VM_USERMAP, prot, atomic_pool_init); >> + >> + if (!addr) >> + goto destroy_genpool; >> + >> + memset(addr, 0, atomic_pool_size); >> + __dma_flush_range(addr, addr + atomic_pool_size); > > If you add the flushing in the __dma_alloc_noncoherent(), it won't be > needed here (of course, more efficient here but it would not work if we > share the pool). > >> +postcore_initcall(atomic_pool_init); > > Why not arch_initcall? Or even better, we could have a common DMA init > function that calls swiotlb_late_init() and atomic_pool_init() (in this > order if you decide to use swiotlb allocation above). > Good point. I'll combine the two. Thanks, Laura
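If Catalin's shared-pool suggestion (quoted above) were adopted, the atomic branch of __dma_alloc_coherent() might look roughly like this hypothetical fragment, with coherent callers handed the cacheable linear address while the gen_pool bookkeeping stays shared with the non-coherent path:

```c
	/* Hypothetical sketch, not part of the posted patch. */
	if (!(flags & __GFP_WAIT)) {
		struct page *page = NULL;
		void *addr = __alloc_from_pool(size, &page);

		if (!addr)
			return NULL;

		*dma_handle = phys_to_dma(dev, page_to_phys(page));
		/* coherent callers get the kernel linear alias, not the NC remap */
		return page_address(page);
	}
```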
On Mon, Jul 21, 2014 at 11:36:49PM +0100, Laura Abbott wrote: > On 7/18/2014 6:43 AM, Catalin Marinas wrote: > > On Wed, Jul 02, 2014 at 07:03:38PM +0100, Laura Abbott wrote: > >> @@ -73,50 +124,56 @@ static void __dma_free_coherent(struct device *dev, size_t size, > >> void *vaddr, dma_addr_t dma_handle, > >> struct dma_attrs *attrs) > >> { > >> + bool freed; > >> + phys_addr_t paddr = dma_to_phys(dev, dma_handle); > >> + > >> if (dev == NULL) { > >> WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); > >> return; > >> } > >> > >> - if (IS_ENABLED(CONFIG_DMA_CMA)) { > >> - phys_addr_t paddr = dma_to_phys(dev, dma_handle); > >> > >> - dma_release_from_contiguous(dev, > >> + freed = dma_release_from_contiguous(dev, > >> phys_to_page(paddr), > >> size >> PAGE_SHIFT); > >> - } else { > >> + if (!freed) > >> swiotlb_free_coherent(dev, size, vaddr, dma_handle); > >> - } > >> } > > > > Is __dma_free_coherent() ever called in atomic context? If yes, the > > dma_release_from_contiguous() may not like it since it tries to acquire > > a mutex. But since we don't have the gfp flags here, we don't have an > > easy way to know what to call. > > > > So the initial idea of always calling __alloc_from_pool() for both > > coherent/non-coherent cases would work better (but still with a single > > shared pool, see below). > > We should be okay > > __dma_free_coherent -> dma_release_from_contiguous -> cma_release which > bounds checks the CMA region before taking any mutexes unless I missed > something. Ah, good point. I missed the pfn range check in dma_release_from_contiguous. > The existing behavior on arm is to not allow non-atomic allocations to be > freed atomic context when CMA is enabled so we'd be giving arm64 more > leeway there. Is being able to free non-atomic allocations in atomic > context really necessary? No. I was worried that an atomic coherent allocation (falling back to swiotlb) would trigger some CMA mutex in atomic context on the freeing path. But you are right, it shouldn't happen. > >> + page = dma_alloc_from_contiguous(NULL, nr_pages, > >> + get_order(atomic_pool_size)); > >> + else > >> + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > > > > One problem here is that the atomic pool wouldn't be able to honour > > GFP_DMA (in the latest kernel, CMA is by default in ZONE_DMA). You > > should probably pass GFP_KERNEL|GFP_DMA here. You could also use the > > swiotlb_alloc_coherent() which, with a NULL dev, assumes 32-bit DMA mask > > but it still expects GFP_DMA to be passed. > > > > I think I missed updating this to GFP_DMA. The only advantage I would see > to using swiotlb_alloc_coherent vs. alloc_pages directly would be to > allow the fallback to using a bounce buffer if __get_free_pages failed. > I'll keep this as alloc_pages for now; it can be changed later if there > is a particular need for swiotlb behavior. That's fine. Since we don't have a device at this point, I don't see how swiotlb could fall back to the bounce buffer. Thanks.
On Wednesday 02 July 2014, Laura Abbott wrote: > + pgprot_t prot = __pgprot(PROT_NORMAL_NC); > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > + struct page *page; > + void *addr; > + > + > + if (dev_get_cma_area(NULL)) > + page = dma_alloc_from_contiguous(NULL, nr_pages, > + get_order(atomic_pool_size)); > + else > + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > + > + > + if (page) { > + int ret; > + > + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); > + if (!atomic_pool) > + goto free_page; > + > + addr = dma_common_contiguous_remap(page, atomic_pool_size, > + VM_USERMAP, prot, atomic_pool_init); > + I just stumbled over this thread and noticed the code here: When you do alloc_pages() above, you actually get pages that are already mapped into the linear kernel mapping as cacheable pages. Your new dma_common_contiguous_remap tries to map them as noncacheable. This seems broken because it allows the CPU to treat both mappings as cacheable, and that won't be coherent with device DMA. > + if (!addr) > + goto destroy_genpool; > + > + memset(addr, 0, atomic_pool_size); > + __dma_flush_range(addr, addr + atomic_pool_size); It also seems weird to flush the cache on a virtual address of an uncacheable mapping. Is that well-defined? In the CMA case, the original mapping should already be uncached here, so you don't need to flush it. In the alloc_pages() case, I think you need to unmap the pages from the linear mapping instead. Arnd
On Tue, Jul 22, 2014 at 07:06:44PM +0100, Arnd Bergmann wrote: > On Wednesday 02 July 2014, Laura Abbott wrote: > > + pgprot_t prot = __pgprot(PROT_NORMAL_NC); > > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > > + struct page *page; > > + void *addr; > > + > > + > > + if (dev_get_cma_area(NULL)) > > + page = dma_alloc_from_contiguous(NULL, nr_pages, > > + get_order(atomic_pool_size)); > > + else > > + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > > + > > + > > + if (page) { > > + int ret; > > + > > + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); > > + if (!atomic_pool) > > + goto free_page; > > + > > + addr = dma_common_contiguous_remap(page, atomic_pool_size, > > + VM_USERMAP, prot, atomic_pool_init); > > + > > I just stumbled over this thread and noticed the code here: When you do > alloc_pages() above, you actually get pages that are already mapped into > the linear kernel mapping as cacheable pages. Your new > dma_common_contiguous_remap tries to map them as noncacheable. This > seems broken because it allows the CPU to treat both mappings as > cacheable, and that won't be coherent with device DMA. It does *not* allow the CPU to treat both as cacheable. It treats the non-cacheable mapping as non-cacheable (and the cacheable one as cacheable). The only requirements the ARM ARM makes in this situation (B2.9 point 5 in the ARMv8 ARM): - Before writing to a location not using the Write-Back attribute, software must invalidate, or clean, a location from the caches if any agent might have written to the location with the Write-Back attribute. This avoids the possibility of overwriting the location with stale data. - After writing to a location with the Write-Back attribute, software must clean the location from the caches, to make the write visible to external memory. - Before reading the location with a cacheable attribute, software must invalidate the location from the caches, to ensure that any value held in the caches reflects the last value made visible in external memory. So we as long as the CPU accesses such memory only via the non-cacheable mapping, the only requirement is to flush the cache so that there are no dirty lines that could be evicted. (if the mismatched attributes were for example Normal vs Device, the Device guarantees would be lost but in the cacheable vs non-cacheable it's not too bad; same for ARMv7). > > + if (!addr) > > + goto destroy_genpool; > > + > > + memset(addr, 0, atomic_pool_size); > > + __dma_flush_range(addr, addr + atomic_pool_size); > > It also seems weird to flush the cache on a virtual address of > an uncacheable mapping. Is that well-defined? Yes. According to D5.8.1 (Data and unified caches), "if cache maintenance is performed on a memory location, the effect of that cache maintenance is visible to all aliases of that physical memory location. These properties are consistent with implementing all caches that can handle data accesses as Physically-indexed, physically-tagged (PIPT) caches". > In the CMA case, the > original mapping should already be uncached here, so you don't need > to flush it. I don't think it is non-cacheable already, at least not for arm64 (CMA can be used on coherent architectures as well). > In the alloc_pages() case, I think you need to unmap > the pages from the linear mapping instead. Even if unmapped, it would not remove dirty cache lines (which are associated with physical addresses anyway). 
But we don't need to worry about unmapping anyway, see above (that's unless we find some architecture implementation where having such cacheable/non-cacheable aliases is not efficient enough, the efficiency is not guaranteed by the ARM ARM, just the correct behaviour).
On 7/22/2014 2:03 PM, Catalin Marinas wrote: > On Tue, Jul 22, 2014 at 07:06:44PM +0100, Arnd Bergmann wrote: [...] >>> + if (!addr) >>> + goto destroy_genpool; >>> + >>> + memset(addr, 0, atomic_pool_size); >>> + __dma_flush_range(addr, addr + atomic_pool_size); >> >> It also seems weird to flush the cache on a virtual address of >> an uncacheable mapping. Is that well-defined? > > Yes. According to D5.8.1 (Data and unified caches), "if cache > maintenance is performed on a memory location, the effect of that cache > maintenance is visible to all aliases of that physical memory location. > These properties are consistent with implementing all caches that can > handle data accesses as Physically-indexed, physically-tagged (PIPT) > caches". > This was actually unintentional on my part. I'm going to clean this up to flush via the existing cached mapping to make it clearer what's going on. >> In the CMA case, the >> original mapping should already be uncached here, so you don't need >> to flush it. > > I don't think it is non-cacheable already, at least not for arm64 (CMA > can be used on coherent architectures as well). > Memory allocated via dma_alloc_from_contiguous is not guaranteed to be uncached. On arm, we allocate the page of memory and then remap it as appropriate. >> In the alloc_pages() case, I think you need to unmap >> the pages from the linear mapping instead. > > Even if unmapped, it would not remove dirty cache lines (which are > associated with physical addresses anyway). But we don't need to worry > about unmapping anyway, see above (that's unless we find some > architecture implementation where having such cacheable/non-cacheable > aliases is not efficient enough, the efficiency is not guaranteed by the > ARM ARM, just the correct behaviour). > Let's hope that never happens. Thanks, Laura
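The ordering Laura describes (and that the ARM ARM rules quoted earlier require) is roughly: populate and clean the buffer through the existing cacheable linear mapping, then touch only the non-cacheable alias afterwards. A sketch under those assumptions (`nc_addr` is just an illustrative name; GFP_DMA per the earlier discussion), not the eventual patch:

```c
	page = alloc_pages(GFP_KERNEL | GFP_DMA, get_order(atomic_pool_size));
	addr = page_address(page);		/* cacheable linear alias */

	memset(addr, 0, atomic_pool_size);	/* write via the cacheable alias */
	__dma_flush_range(addr, addr + atomic_pool_size); /* no dirty lines left to evict */

	/* from here on, touch the pool only through the non-cacheable alias */
	nc_addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
					      __pgprot(PROT_NORMAL_NC),
					      atomic_pool_init);
```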
On Tuesday 22 July 2014 22:03:52 Catalin Marinas wrote: > On Tue, Jul 22, 2014 at 07:06:44PM +0100, Arnd Bergmann wrote: > > On Wednesday 02 July 2014, Laura Abbott wrote: > > > + pgprot_t prot = __pgprot(PROT_NORMAL_NC); > > > + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; > > > + struct page *page; > > > + void *addr; > > > + > > > + > > > + if (dev_get_cma_area(NULL)) > > > + page = dma_alloc_from_contiguous(NULL, nr_pages, > > > + get_order(atomic_pool_size)); > > > + else > > > + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); > > > + > > > + > > > + if (page) { > > > + int ret; > > > + > > > + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); > > > + if (!atomic_pool) > > > + goto free_page; > > > + > > > + addr = dma_common_contiguous_remap(page, atomic_pool_size, > > > + VM_USERMAP, prot, atomic_pool_init); > > > + > > > > I just stumbled over this thread and noticed the code here: When you do > > alloc_pages() above, you actually get pages that are already mapped into > > the linear kernel mapping as cacheable pages. Your new > > dma_common_contiguous_remap tries to map them as noncacheable. This > > seems broken because it allows the CPU to treat both mappings as > > cacheable, and that won't be coherent with device DMA. > > It does *not* allow the CPU to treat both as cacheable. It treats the > non-cacheable mapping as non-cacheable (and the cacheable one as > cacheable). The only requirements the ARM ARM makes in this situation > (B2.9 point 5 in the ARMv8 ARM): > > - Before writing to a location not using the Write-Back attribute, > software must invalidate, or clean, a location from the caches if any > agent might have written to the location with the Write-Back > attribute. This avoids the possibility of overwriting the location > with stale data. > - After writing to a location with the Write-Back attribute, software > must clean the location from the caches, to make the write visible to > external memory. > - Before reading the location with a cacheable attribute, software must > invalidate the location from the caches, to ensure that any value held > in the caches reflects the last value made visible in external memory. > > So we as long as the CPU accesses such memory only via the non-cacheable > mapping, the only requirement is to flush the cache so that there are no > dirty lines that could be evicted. Ok, thanks for the explanation. > (if the mismatched attributes were for example Normal vs Device, the > Device guarantees would be lost but in the cacheable vs non-cacheable > it's not too bad; same for ARMv7). Right, that's probably what I misremembered. > > > + if (!addr) > > > + goto destroy_genpool; > > > + > > > + memset(addr, 0, atomic_pool_size); > > > + __dma_flush_range(addr, addr + atomic_pool_size); > > > > It also seems weird to flush the cache on a virtual address of > > an uncacheable mapping. Is that well-defined? > > Yes. According to D5.8.1 (Data and unified caches), "if cache > maintenance is performed on a memory location, the effect of that cache > maintenance is visible to all aliases of that physical memory location. > These properties are consistent with implementing all caches that can > handle data accesses as Physically-indexed, physically-tagged (PIPT) > caches". Interesting. > > In the CMA case, the > > original mapping should already be uncached here, so you don't need > > to flush it. > > I don't think it is non-cacheable already, at least not for arm64 (CMA > can be used on coherent architectures as well).
Ok, I see it now. Sorry for all the confusion on my part. Arnd
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a474de34..cd402b9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -15,6 +15,7 @@ config ARM64 select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) select DCACHE_WORD_ACCESS + select GENERIC_ALLOCATOR select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CPU_AUTOPROBE diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 4164c5a..a2487f1 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -27,6 +27,7 @@ #include <linux/vmalloc.h> #include <linux/swiotlb.h> #include <linux/amba/bus.h> +#include <linux/genalloc.h> #include <asm/cacheflush.h> @@ -41,6 +42,55 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, return prot; } +static struct gen_pool *atomic_pool; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +static void *__alloc_from_pool(size_t size, struct page **ret_page) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = phys_to_page(phys); + ptr = (void *)val; + } + + return ptr; +} + +static bool __in_atomic_pool(void *start, size_t size) +{ + return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); +} + +static int __free_from_pool(void *start, size_t size) +{ + if (!__in_atomic_pool(start, size)) + return 0; + + gen_pool_free(atomic_pool, (unsigned long)start, size); + + return 1; +} + + static void *__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) @@ -53,7 +103,8 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (IS_ENABLED(CONFIG_DMA_CMA)) { + + if (!(flags & __GFP_WAIT) && IS_ENABLED(CONFIG_DMA_CMA)) { struct page *page; size = PAGE_ALIGN(size); @@ -73,50 +124,56 @@ static void __dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs) { + bool freed; + phys_addr_t paddr = dma_to_phys(dev, dma_handle); + if (dev == NULL) { WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); return; } - if (IS_ENABLED(CONFIG_DMA_CMA)) { - phys_addr_t paddr = dma_to_phys(dev, dma_handle); - dma_release_from_contiguous(dev, + freed = dma_release_from_contiguous(dev, phys_to_page(paddr), size >> PAGE_SHIFT); - } else { + if (!freed) swiotlb_free_coherent(dev, size, vaddr, dma_handle); - } } static void *__dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) { - struct page *page, **map; + struct page *page; void *ptr, *coherent_ptr; - int order, i; size = PAGE_ALIGN(size); - order = get_order(size); + + if (!(flags & __GFP_WAIT)) { + struct page *page = NULL; + void *addr = __alloc_from_pool(size, &page); + + if (addr) + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + + return addr; + + } ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs); if (!ptr) goto no_mem; - map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA); - if (!map) - 
goto no_map; /* remove any dirty cache lines on the kernel alias */ __dma_flush_range(ptr, ptr + size); + /* create a coherent mapping */ page = virt_to_page(ptr); - for (i = 0; i < (size >> PAGE_SHIFT); i++) - map[i] = page + i; - coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, - __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); - kfree(map); + coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP, + __get_dma_pgprot(attrs, + __pgprot(PROT_NORMAL_NC), false), + NULL); if (!coherent_ptr) goto no_map; @@ -135,6 +192,8 @@ static void __dma_free_noncoherent(struct device *dev, size_t size, { void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle)); + if (__free_from_pool(vaddr, size)) + return; vunmap(vaddr); __dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs); } @@ -332,6 +391,67 @@ static struct notifier_block amba_bus_nb = { extern int swiotlb_late_init_with_default_size(size_t default_size); +static int __init atomic_pool_init(void) +{ + pgprot_t prot = __pgprot(PROT_NORMAL_NC); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + get_order(atomic_pool_size)); + else + page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size)); + + + if (page) { + int ret; + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, + VM_USERMAP, prot, atomic_pool_init); + + if (!addr) + goto destroy_genpool; + + memset(addr, 0, atomic_pool_size); + __dma_flush_range(addr, addr + atomic_pool_size); + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), + atomic_pool_size, -1); + if (ret) + goto remove_mapping; + + gen_pool_set_algo(atomic_pool, + gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zd KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + } + goto out; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool == NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, get_order(atomic_pool_size)); +out: + pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} +postcore_initcall(atomic_pool_init); + static int __init swiotlb_late_init(void) { size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT);
Neither CMA nor noncoherent allocations support atomic allocations. Add a dedicated atomic pool to support this. Signed-off-by: Laura Abbott <lauraa@codeaurora.org> --- arch/arm64/Kconfig | 1 + arch/arm64/mm/dma-mapping.c | 154 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 138 insertions(+), 17 deletions(-)
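As a usage illustration (hypothetical driver code; `foo_dev`, `foo_irq` and the `bounce` fields are invented for the example), the situation the pool exists for is a coherent allocation made where sleeping is not allowed, such as an interrupt handler:

```c
#include <linux/dma-mapping.h>
#include <linux/interrupt.h>
#include <linux/sizes.h>

struct foo_dev {
	struct device *dev;
	void *bounce;
	dma_addr_t bounce_dma;
};

static irqreturn_t foo_irq(int irq, void *data)
{
	struct foo_dev *foo = data;

	/* GFP_ATOMIC lacks __GFP_WAIT, so with this patch the request is
	 * served from the preallocated gen_pool instead of CMA or a
	 * sleeping vmap()-based remap. */
	foo->bounce = dma_alloc_coherent(foo->dev, SZ_4K, &foo->bounce_dma,
					 GFP_ATOMIC);
	if (!foo->bounce)
		return IRQ_NONE;

	/* ... hand foo->bounce_dma to the device; the buffer is released
	 * later, from process context, with dma_free_coherent() ... */

	return IRQ_HANDLED;
}
```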