Message ID | BANLkTimfxrcKWLZJmVC+1zL27th7j4uzEA@mail.gmail.com (mailing list archive)
---|---
State | New, archived
On Tue, Jun 21, 2011 at 11:26:27AM +0200, Per Forlin wrote:
> Here are the results.
It looks like this patch is either a no-op or slightly worse. As
people have been telling me that dsb is rather expensive, and this
patch results in fewer dsbs, I'm finding these results hard to believe.
It seems to be saying that dsb is an effective no-op on your platform.
So either people are wrong about dsb being expensive, the patch is
wrong, or there's something wrong with these results/test method.
You do have an error in the ported patch, as that hasn't updated the
v7 cache cleaning code to remove the dsb() there, but that would only
affect the write tests.
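To make the expectation concrete: with the patch (quoted in full at the bottom of this page), the per-range cache-maintenance routines no longer drain the write buffer themselves; the DMA mapping layer issues one barrier per mapping operation instead. Below is a minimal sketch of that shape, assuming a hypothetical clean_range() as a stand-in for dmac_map_area() and friends; it is not code from the patch.

```c
#include <linux/dma-mapping.h>	/* for the __dma_sync() added by the patch */
#include <linux/types.h>

/* Hypothetical stand-in for the per-CPU clean operation (dmac_map_area()). */
extern void clean_range(char *start, char *end);

/* Before the patch: every cache operation ends with its own drain/dsb,
 * so mapping n separate ranges costs n barriers. */
static void map_ranges_old(char *buf[], size_t len[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		clean_range(buf[i], buf[i] + len[i]);	/* dsb inside each call */
}

/* After the patch: the cache operations carry no barrier; one dsb is
 * issued via __dma_sync() once all ranges have been cleaned. */
static void map_ranges_new(char *buf[], size_t len[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		clean_range(buf[i], buf[i] + len[i]);	/* no dsb here any more */

	__dma_sync();					/* single dsb for everything */
}
```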
On 23 June 2011 15:37, Russell King - ARM Linux <linux@arm.linux.org.uk> wrote:
> On Tue, Jun 21, 2011 at 11:26:27AM +0200, Per Forlin wrote:
>> Here are the results.
>
> It looks like this patch is either a no-op or slightly worse. As
> people have been telling me that dsb is rather expensive, and this
> patch results in fewer dsbs, I'm finding these results hard to believe.
> It seems to be saying that dsb is an effective no-op on your platform.
>
The result of your patch depends on the number of sg-elements: with your
patch there is only one DSB per sg-list instead of one per element. I can
write a test that measures performance per number of sg-elements in the
list, i.e. a fixed transfer size but a varying number of sg-elements.
That test should give a better understanding of the effect.

I have seen a performance gain when using __raw_writel() instead of
writel(). The writel() test includes both the cost of the DSB and of
outer_sync(), and outer_sync() is the more expensive of the two, I presume.

> So either people are wrong about dsb being expensive, the patch is
> wrong, or there's something wrong with these results/test method.
>
> You do have an error in the ported patch, as that hasn't updated the
> v7 cache cleaning code to remove the dsb() there, but that would only
> affect the write tests.
>
I will fix that mistake and also improve the test cases so they measure
the cost per number of sg-elements. I'll come back with new numbers on
Monday.

Regards,
Per
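The measurement proposed above could look roughly like the sketch below: keep the total transfer size fixed, split it into a varying number of scatterlist elements, and time the dma_map_sg()/dma_unmap_sg() pair. This is only an illustration, not the actual test: the time_map_sg() helper, the caller-supplied device and contiguous buffer, and the equal-chunk split are assumptions made for the example.

```c
#include <linux/dma-mapping.h>
#include <linux/ktime.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/*
 * Map 'total' bytes of 'buf' as 'nents' equally sized sg elements and
 * report how long the map/unmap pair takes.  If fewer dsbs are issued
 * per element, the cost should grow more slowly with 'nents' after the
 * patch than before it.
 */
static void time_map_sg(struct device *dev, char *buf, size_t total, int nents)
{
	struct scatterlist *sgl;
	size_t chunk = total / nents;
	ktime_t start, end;
	int i, mapped;

	sgl = kcalloc(nents, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		return;

	sg_init_table(sgl, nents);
	for (i = 0; i < nents; i++)
		sg_set_buf(&sgl[i], buf + i * chunk, chunk);

	start = ktime_get();
	mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
	if (mapped)
		dma_unmap_sg(dev, sgl, mapped, DMA_TO_DEVICE);
	end = ktime_get();

	dev_info(dev, "%d elements of %zu bytes: map+unmap took %lld ns\n",
		 nents, chunk, ktime_to_ns(ktime_sub(end, start)));

	kfree(sgl);
}
```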
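The writel()/__raw_writel() difference can be timed in the same way. On ARM, writel() issues a write barrier before the store (a dsb, plus outer_sync() when an outer L2 cache is present), whereas __raw_writel() is just the store, so the gap between the two loops below approximates the per-write barrier cost. Again only a sketch: the time_mmio_writes() helper and the 'regs' register pointer are assumptions, and writing zeroes to an arbitrary device register is not meant to be copied literally.

```c
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/ktime.h>

/* Time n MMIO writes with the barrier-carrying accessor and with the raw one. */
static void time_mmio_writes(void __iomem *regs, int n)
{
	ktime_t t0, t1;
	int i;

	t0 = ktime_get();
	for (i = 0; i < n; i++)
		writel(0, regs);		/* barrier (dsb/outer_sync) + store */
	t1 = ktime_get();
	pr_info("writel():       %lld ns\n", ktime_to_ns(ktime_sub(t1, t0)));

	t0 = ktime_get();
	for (i = 0; i < n; i++)
		__raw_writel(0, regs);		/* plain store, no barrier */
	t1 = ktime_get();
	pr_info("__raw_writel(): %lld ns\n", ktime_to_ns(ktime_sub(t1, t0)));
}
```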
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 4fff837..ad14c2b 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -115,6 +115,11 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 		___dma_page_dev_to_cpu(page, off, size, dir);
 }
 
+static inline void __dma_sync(void)
+{
+	dsb();
+}
+
 /*
  * Return whether the given device DMA address mask can be supported
  * properly. For example, if your device can only drive the low 24-bits
@@ -378,6 +383,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
 	BUG_ON(!valid_dma_direction(dir));
 
 	addr = __dma_map_single(dev, cpu_addr, size, dir);
+	__dma_sync();
 	debug_dma_map_page(dev, virt_to_page(cpu_addr),
 			(unsigned long)cpu_addr & ~PAGE_MASK, size,
 			dir, addr, true);
@@ -407,6 +413,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(!valid_dma_direction(dir));
 
 	addr = __dma_map_page(dev, page, offset, size, dir);
+	__dma_sync();
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
 	return addr;
@@ -431,6 +438,7 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 {
 	debug_dma_unmap_page(dev, handle, size, dir, true);
 	__dma_unmap_single(dev, handle, size, dir);
+	__dma_sync();
 }
 
 /**
@@ -452,6 +460,7 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 {
 	debug_dma_unmap_page(dev, handle, size, dir, false);
 	__dma_unmap_page(dev, handle, size, dir);
+	__dma_sync();
 }
 
 /**
@@ -498,6 +507,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 		return;
 
 	__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_sync();
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 1fa6f71..6eeb734 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -179,8 +179,6 @@ fa_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -197,8 +195,6 @@ fa_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -212,8 +208,6 @@ ENTRY(fa_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index f40c696..523c0cb 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -194,7 +194,6 @@ v4wb_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -211,7 +210,6 @@ v4wb_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 73b4a8b..7a842dd 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -239,8 +239,6 @@ v6_dma_inv_range:
 	strlo	r2, [r0]			@ write for ownership
 #endif
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -262,8 +260,6 @@ v6_dma_clean_range:
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -290,8 +286,6 @@ ENTRY(v6_dma_flush_range)
 	strlob	r2, [r0]			@ write for ownership
 #endif
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index d32f02b..18dcef6 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -257,7 +257,6 @@ v7_dma_inv_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_clean_range)
@@ -293,7 +291,6 @@ ENTRY(v7_dma_flush_range)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_flush_range)
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 82a093c..ff85283 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -97,6 +97,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
 	memset(ptr, 0, size);
 	dmac_flush_range(ptr, ptr + size);
 	outer_flush_range(__pa(ptr), __pa(ptr) + size);
+	__dma_sync();
 
 	return page;