Message ID | 5821a1b2eb82847ccbac0945da040518d6f6f16b.1722578375.git.baruch@tkos.co.il (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | dma: support DMA zone starting above 4GB | expand |
On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote: > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > index 3b4be4ca3b08..62b36fda44c9 100644 > --- a/kernel/dma/direct.c > +++ b/kernel/dma/direct.c > @@ -20,7 +20,7 @@ > * it for entirely different regions. In that case the arch code needs to > * override the variable below for dma-direct to work properly. > */ > -unsigned int zone_dma_bits __ro_after_init = 24; > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); u64 here makes sense even if it may be larger than phys_addr_t. It matches the phys_limit type in the swiotlb code. The compilers should no longer complain. > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > index d10613eb0f63..7b04f7575796 100644 > --- a/kernel/dma/pool.c > +++ b/kernel/dma/pool.c > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > /* CMA can't cross zone boundaries, see cma_activate_area() */ > end = cma_get_base(cma) + size - 1; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > - return end <= DMA_BIT_MASK(zone_dma_bits); > + return end <= zone_dma_limit; > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > - return end <= DMA_BIT_MASK(32); > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > return true; > } > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > index 043b0ecd3e8d..bb51bd5335ad 100644 > --- a/kernel/dma/swiotlb.c > +++ b/kernel/dma/swiotlb.c > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > if (!remap) > io_tlb_default_mem.can_grow = true; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > + io_tlb_default_mem.phys_limit = zone_dma_limit; > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > else > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > 
#endif These two look correct to me now and it's the least intrusive (the alternative would have been a zone_dma32_limit). The arch code, however, needs to ensure that zone_dma_limit can always support 32-bit devices even if it is above 4GB (with the relevant dma offsets in place for such devices).
On Fri, 2 Aug 2024 10:37:38 +0100 Catalin Marinas <catalin.marinas@arm.com> wrote: > On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote: > > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > > index 3b4be4ca3b08..62b36fda44c9 100644 > > --- a/kernel/dma/direct.c > > +++ b/kernel/dma/direct.c > > @@ -20,7 +20,7 @@ > > * it for entirely different regions. In that case the arch code needs to > > * override the variable below for dma-direct to work properly. > > */ > > -unsigned int zone_dma_bits __ro_after_init = 24; > > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); > > u64 here makes sense even if it may be larger than phys_addr_t. It > matches the phys_limit type in the swiotlb code. The compilers should no > longer complain. FTR I have never quite understood why phys_limit is u64, but u64 was already used all around the place when I first looked into swiotlb. > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > > index d10613eb0f63..7b04f7575796 100644 > > --- a/kernel/dma/pool.c > > +++ b/kernel/dma/pool.c > > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > > /* CMA can't cross zone boundaries, see cma_activate_area() */ > > end = cma_get_base(cma) + size - 1; > > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > > - return end <= DMA_BIT_MASK(zone_dma_bits); > > + return end <= zone_dma_limit; > > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > > - return end <= DMA_BIT_MASK(32); > > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > > return true; > > } > > > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > > index 043b0ecd3e8d..bb51bd5335ad 100644 > > --- a/kernel/dma/swiotlb.c > > +++ b/kernel/dma/swiotlb.c > > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > > if (!remap) > > io_tlb_default_mem.can_grow = true; > > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > > + 
io_tlb_default_mem.phys_limit = zone_dma_limit; > > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > > else > > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > > #endif > > These two look correct to me now and it's the least intrusive (the > alternative would have been a zone_dma32_limit). The arch code, however, > needs to ensure that zone_dma_limit can always support 32-bit devices > even if it is above 4GB (with the relevant dma offsets in place for such > devices). Just to make sure, the DMA zone (if present) must map to at most 32-bit bus address space (possibly behind a bridge). Is that what you're saying? Petr T
On Fri, 2 Aug 2024 09:03:47 +0300 Baruch Siach <baruch@tkos.co.il> wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> > --- > arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > arch/powerpc/mm/mem.c | 9 ++++----- > arch/s390/mm/init.c | 2 +- > include/linux/dma-direct.h | 2 +- > kernel/dma/direct.c | 4 ++-- > kernel/dma/pool.c | 4 ++-- > kernel/dma/swiotlb.c | 4 ++-- > 7 files changed, 27 insertions(+), 28 deletions(-) > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index 9b5ab6818f7f..c45e2152ca9e 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) > } > > /* > - * Return the maximum physical address for a zone accessible by the given bits > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum > + * Return the maximum physical address for a zone given its limit. > + * If DRAM starts above 32-bit, expand the zone to the maximum > * available memory, otherwise cap it at 32-bit. 
> */ > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > { > - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); > phys_addr_t phys_start = memblock_start_of_DRAM(); > > if (phys_start > U32_MAX) > - zone_mask = PHYS_ADDR_MAX; > - else if (phys_start > zone_mask) > - zone_mask = U32_MAX; > + zone_limit = PHYS_ADDR_MAX; > + else if (phys_start > zone_limit) > + zone_limit = U32_MAX; > > - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; > + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; > } > > static void __init zone_sizes_init(void) > { > unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; > - unsigned int __maybe_unused acpi_zone_dma_bits; > - unsigned int __maybe_unused dt_zone_dma_bits; > - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); > + phys_addr_t __maybe_unused acpi_zone_dma_limit; > + phys_addr_t __maybe_unused dt_zone_dma_limit; > + phys_addr_t __maybe_unused dma32_phys_limit = > + max_zone_phys(DMA_BIT_MASK(32)); > > #ifdef CONFIG_ZONE_DMA > - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); > - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); > - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); > - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); > + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); > + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); > + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); > + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); > max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); > #endif > #ifdef CONFIG_ZONE_DMA32 > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index d325217ab201..342c006cc1b8 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) > * everything else. GFP_DMA32 page allocations automatically fall back to > * ZONE_DMA. 
> * > - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the > + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the > * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU > * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by > * ZONE_DMA. > @@ -252,13 +252,12 @@ void __init paging_init(void) > * powerbooks. > */ > if (IS_ENABLED(CONFIG_PPC32)) > - zone_dma_bits = 30; > + zone_dma_limit = DMA_BIT_MASK(30); > else > - zone_dma_bits = 31; > + zone_dma_limit = DMA_BIT_MASK(31); > > #ifdef CONFIG_ZONE_DMA > - max_zone_pfns[ZONE_DMA] = min(max_low_pfn, > - 1UL << (zone_dma_bits - PAGE_SHIFT)); > + max_zone_pfns[ZONE_DMA] = min(max_low_pfn, zone_dma_limit >> PAGE_SHIFT); No big deal, but this is off by one. DMA_BIT_MASK() returns the highest address that can be represented with the given number of bits, whereas max_zone_pfns[] contains the lowest PFN that is NOT contained in the zone. Rest of the patch looks perfect. Petr T
On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote: > On Fri, 2 Aug 2024 10:37:38 +0100 > Catalin Marinas <catalin.marinas@arm.com> wrote: > > On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote: > > > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > > > index 3b4be4ca3b08..62b36fda44c9 100644 > > > --- a/kernel/dma/direct.c > > > +++ b/kernel/dma/direct.c > > > @@ -20,7 +20,7 @@ > > > * it for entirely different regions. In that case the arch code needs to > > > * override the variable below for dma-direct to work properly. > > > */ > > > -unsigned int zone_dma_bits __ro_after_init = 24; > > > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); > > > > u64 here makes sense even if it may be larger than phys_addr_t. It > > matches the phys_limit type in the swiotlb code. The compilers should no > > longer complain. > > FTR I have never quite understood why phys_limit is u64, but u64 was > already used all around the place when I first looked into swiotlb. > > > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > > > index d10613eb0f63..7b04f7575796 100644 > > > --- a/kernel/dma/pool.c > > > +++ b/kernel/dma/pool.c > > > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > > > /* CMA can't cross zone boundaries, see cma_activate_area() */ > > > end = cma_get_base(cma) + size - 1; > > > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > > > - return end <= DMA_BIT_MASK(zone_dma_bits); > > > + return end <= zone_dma_limit; > > > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > > > - return end <= DMA_BIT_MASK(32); > > > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > > > return true; > > > } > > > > > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > > > index 043b0ecd3e8d..bb51bd5335ad 100644 > > > --- a/kernel/dma/swiotlb.c > > > +++ b/kernel/dma/swiotlb.c > > > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > > > if (!remap) > > > io_tlb_default_mem.can_grow = true; > > > if 
(IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > > > + io_tlb_default_mem.phys_limit = zone_dma_limit; > > > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > > > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > > > else > > > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > > > #endif > > > > These two look correct to me now and it's the least intrusive (the > > alternative would have been a zone_dma32_limit). The arch code, however, > > needs to ensure that zone_dma_limit can always support 32-bit devices > > even if it is above 4GB (with the relevant dma offsets in place for such > > devices). > > Just to make sure, the DMA zone (if present) must map to at most 32-bit > bus address space (possibly behind a bridge). Is that what you're > saying? No exactly. What I'm trying to say is that on arm64 zone_dma_limit can go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address. In such cases, ZONE_DMA32 is empty. TBH, this code is confusing and not entirely suitable for a system where the CPU address offsets are not 0. The device::dma_coherent_mask is about the bus address range and phys_limit is calculated correctly in functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t. DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to be about the CPU address ranges in some cases (in other cases DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus address). On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA should end at 33GB. That's 30-bit mask in bus address terms but something not a power of two for the CPU address, hence the zone_dma_limit introduced here. With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at 4GB CPU address, it doesn't really work for such platforms. 
If there are 32-bit devices with a corresponding CPU address offset, ZONE_DMA32 should end at 36GB on Baruch's platform. But to simplify things, we just ignore this on arm64 and make ZONE_DMA32 empty. In some cases where we have the device structure we could instead do a dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if we really want to address this properly, we'd need to introduce a zone_dma32_limit that's initialised by the arch code. For arm64, I'm happy with just having an empty ZONE_DMA32 on such platforms.
On Wed, 7 Aug 2024 19:14:58 +0100 Catalin Marinas <catalin.marinas@arm.com> wrote: > On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote: > > On Fri, 2 Aug 2024 10:37:38 +0100 > > Catalin Marinas <catalin.marinas@arm.com> wrote: > > > On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote: > > > > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > > > > index 3b4be4ca3b08..62b36fda44c9 100644 > > > > --- a/kernel/dma/direct.c > > > > +++ b/kernel/dma/direct.c > > > > @@ -20,7 +20,7 @@ > > > > * it for entirely different regions. In that case the arch code needs to > > > > * override the variable below for dma-direct to work properly. > > > > */ > > > > -unsigned int zone_dma_bits __ro_after_init = 24; > > > > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); > > > > > > u64 here makes sense even if it may be larger than phys_addr_t. It > > > matches the phys_limit type in the swiotlb code. The compilers should no > > > longer complain. > > > > FTR I have never quite understood why phys_limit is u64, but u64 was > > already used all around the place when I first looked into swiotlb. 
> > > > > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > > > > index d10613eb0f63..7b04f7575796 100644 > > > > --- a/kernel/dma/pool.c > > > > +++ b/kernel/dma/pool.c > > > > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > > > > /* CMA can't cross zone boundaries, see cma_activate_area() */ > > > > end = cma_get_base(cma) + size - 1; > > > > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > > > > - return end <= DMA_BIT_MASK(zone_dma_bits); > > > > + return end <= zone_dma_limit; > > > > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > > > > - return end <= DMA_BIT_MASK(32); > > > > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > > > > return true; > > > > } > > > > > > > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > > > > index 043b0ecd3e8d..bb51bd5335ad 100644 > > > > --- a/kernel/dma/swiotlb.c > > > > +++ b/kernel/dma/swiotlb.c > > > > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > > > > if (!remap) > > > > io_tlb_default_mem.can_grow = true; > > > > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > > > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > > > > + io_tlb_default_mem.phys_limit = zone_dma_limit; > > > > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > > > > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > > > > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > > > > else > > > > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > > > > #endif > > > > > > These two look correct to me now and it's the least intrusive (the > > > alternative would have been a zone_dma32_limit). The arch code, however, > > > needs to ensure that zone_dma_limit can always support 32-bit devices > > > even if it is above 4GB (with the relevant dma offsets in place for such > > > devices). 
> > > > Just to make sure, the DMA zone (if present) must map to at most 32-bit > > bus address space (possibly behind a bridge). Is that what you're > > saying? > > No exactly. What I'm trying to say is that on arm64 zone_dma_limit can > go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address. > In such cases, ZONE_DMA32 is empty. > > TBH, this code is confusing and not entirely suitable for a system where > the CPU address offsets are not 0. The device::dma_coherent_mask is > about the bus address range and phys_limit is calculated correctly in > functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t. > DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to > be about the CPU address ranges in some cases (in other cases > DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus > address). Yes, I know. > On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA > should end at 33GB. That's 30-bit mask in bus address terms but > something not a power of two for the CPU address, hence the > zone_dma_limit introduced here. Yes, I was watching the discussion. > With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at > 4GB CPU address, it doesn't really work for such platforms. If there are > 32-bit devices with a corresponding CPU address offset, ZONE_DMA32 > should end at 36GB on Baruch's platform. But to simplify things, we just > ignore this on arm64 and make ZONE_DMA32 empty. Ah. That makes sense. It also seems to support my theory that Linux memory zones are an obsolete concept and should be replaced by a different mechanism. > In some cases where we have the device structure we could instead do a > dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if > we really want to address this properly, we'd need to introduce a > zone_dma32_limit that's initialised by the arch code. 
For arm64, I'm > happy with just having an empty ZONE_DMA32 on such platforms. The obvious caveat is that zone boundaries are system-wide, but the mapping between bus addresses and CPU addresses depends on the device structure. After all, that's why dma_to_phys takes the device as a parameter... In fact, a system may have multiple busses behind different bridges with a different offset applied by each. FYI I want to make more people aware of these issues at this year's Plumbers, see https://lpc.events/event/18/contributions/1776/ Petr T
On 2024-08-08 10:35 am, Petr Tesařík wrote: > On Wed, 7 Aug 2024 19:14:58 +0100 > Catalin Marinas <catalin.marinas@arm.com> wrote: > >> On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote: >>> On Fri, 2 Aug 2024 10:37:38 +0100 >>> Catalin Marinas <catalin.marinas@arm.com> wrote: >>>> On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote: >>>>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c >>>>> index 3b4be4ca3b08..62b36fda44c9 100644 >>>>> --- a/kernel/dma/direct.c >>>>> +++ b/kernel/dma/direct.c >>>>> @@ -20,7 +20,7 @@ >>>>> * it for entirely different regions. In that case the arch code needs to >>>>> * override the variable below for dma-direct to work properly. >>>>> */ >>>>> -unsigned int zone_dma_bits __ro_after_init = 24; >>>>> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); >>>> >>>> u64 here makes sense even if it may be larger than phys_addr_t. It >>>> matches the phys_limit type in the swiotlb code. The compilers should no >>>> longer complain. >>> >>> FTR I have never quite understood why phys_limit is u64, but u64 was >>> already used all around the place when I first looked into swiotlb. 
>>> >>>>> diff --git a/kernel/dma/pool.c >>>>> index d10613eb0f63..7b04f7575796 100644 >>>>> --- a/kernel/dma/pool.c >>>>> +++ b/kernel/dma/pool.c >>>>> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) >>>>> /* CMA can't cross zone boundaries, see cma_activate_area() */ >>>>> end = cma_get_base(cma) + size - 1; >>>>> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) >>>>> - return end <= DMA_BIT_MASK(zone_dma_bits); >>>>> + return end <= zone_dma_limit; >>>>> if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) >>>>> - return end <= DMA_BIT_MASK(32); >>>>> + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); >>>>> return true; >>>>> } >>>>> >>>>> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c >>>>> index 043b0ecd3e8d..bb51bd5335ad 100644 >>>>> --- a/kernel/dma/swiotlb.c >>>>> +++ b/kernel/dma/swiotlb.c >>>>> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, >>>>> if (!remap) >>>>> io_tlb_default_mem.can_grow = true; >>>>> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) >>>>> - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); >>>>> + io_tlb_default_mem.phys_limit = zone_dma_limit; >>>>> else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) >>>>> - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); >>>>> + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); >>>>> else >>>>> io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); >>>>> #endif >>>> >>>> These two look correct to me now and it's the least intrusive (the >>>> alternative would have been a zone_dma32_limit). The arch code, however, >>>> needs to ensure that zone_dma_limit can always support 32-bit devices >>>> even if it is above 4GB (with the relevant dma offsets in place for such >>>> devices). >>> >>> Just to make sure, the DMA zone (if present) must map to at most 32-bit >>> bus address space (possibly behind a bridge). 
Is that what you're >>> saying? >> >> No exactly. What I'm trying to say is that on arm64 zone_dma_limit can >> go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address. >> In such cases, ZONE_DMA32 is empty. >> >> TBH, this code is confusing and not entirely suitable for a system where >> the CPU address offsets are not 0. The device::dma_coherent_mask is >> about the bus address range and phys_limit is calculated correctly in >> functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t. >> DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to >> be about the CPU address ranges in some cases (in other cases >> DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus >> address). > > Yes, I know. > >> On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA >> should end at 33GB. That's 30-bit mask in bus address terms but >> something not a power of two for the CPU address, hence the >> zone_dma_limit introduced here. > > Yes, I was watching the discussion. > >> With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at >> 4GB CPU address, it doesn't really work for such platforms. If there are >> 32-bit devices with a corresponding CPU address offset, ZONE_DMA32 >> should end at 36GB on Baruch's platform. But to simplify things, we just >> ignore this on arm64 and make ZONE_DMA32 empty. > > Ah. That makes sense. It also seems to support my theory that Linux > memory zones are an obsolete concept and should be replaced by a > different mechanism. > >> In some cases where we have the device structure we could instead do a >> dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if >> we really want to address this properly, we'd need to introduce a >> zone_dma32_limit that's initialised by the arch code. For arm64, I'm >> happy with just having an empty ZONE_DMA32 on such platforms. 
> > The obvious caveat is that zone boundaries are system-wide, but the > mapping between bus addresses and CPU addresses depends on the device > structure. After all, that's why dma_to_phys takes the device as a > parameter... In fact, a system may have multiple busses behind > different bridges with a different offset applied by each. Right, that's why the *_dma_get_max_cpu_address() functions already walk all known bus translations backwards to find the lowest common denominator in the CPU address space. In principle we could also calculate the lowest translated 32-bit DMA address from every >32-bit range in the same way, however that represents enough extra complexity that it doesn't seem worth trying to implement unless and until someone actually has a clear need for it. Thanks, Robin. > > FYI I want to make more people aware of these issues at this year's > Plumbers, see https://lpc.events/event/18/contributions/1776/ > > Petr T
On Thu, Aug 08, 2024 at 11:35:01AM +0200, Petr Tesařík wrote: > On Wed, 7 Aug 2024 19:14:58 +0100 > Catalin Marinas <catalin.marinas@arm.com> wrote: > > With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at > > 4GB CPU address, it doesn't really work for such platforms. If there are > > 32-bit devices with a corresponding CPU address offset, ZONE_DMA32 > > should end at 36GB on Baruch's platform. But to simplify things, we just > > ignore this on arm64 and make ZONE_DMA32 empty. > > Ah. That makes sense. It also seems to support my theory that Linux > memory zones are an obsolete concept and should be replaced by a > different mechanism. I agree, they are too coarse-grained. From an API perspective, what we need is an alloc_pages() that takes a DMA mask or phys address limit, maybe something similar to memblock_alloc_range_nid(). OTOH, an advantage of the zones is that by default you keep the lower memory free by using ZONE_NORMAL as default, you have free lists per zone. Maybe with some alternative data structures we could efficiently search free pages based on phys ranges or bitmasks and get rid of the zones but I haven't put any thoughts into it. We'd still need some boundaries like *_dma_get_max_cpu_address() to at least allocate an swiotlb buffer that's suitable for all devices. > > In some cases where we have the device structure we could instead do a > > dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if > > we really want to address this properly, we'd need to introduce a > > zone_dma32_limit that's initialised by the arch code. For arm64, I'm > > happy with just having an empty ZONE_DMA32 on such platforms. > > The obvious caveat is that zone boundaries are system-wide, but the > mapping between bus addresses and CPU addresses depends on the device > structure. After all, that's why dma_to_phys takes the device as a > parameter... 
In fact, a system may have multiple busses behind > different bridges with a different offset applied by each. Indeed, and as Robin mentioned, the ACPI/DT code already handle this. > FYI I want to make more people aware of these issues at this year's > Plumbers, see https://lpc.events/event/18/contributions/1776/ Looking forward to this. I'll dial in, unfortunately can't make Plumbers in person this year. In the meantime, I think this series is a good compromise ;).
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b5ab6818f7f..c45e2152ca9e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) } /* - * Return the maximum physical address for a zone accessible by the given bits - * limit. If DRAM starts above 32-bit, expand the zone to the maximum + * Return the maximum physical address for a zone given its limit. + * If DRAM starts above 32-bit, expand the zone to the maximum * available memory, otherwise cap it at 32-bit. */ -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) { - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); phys_addr_t phys_start = memblock_start_of_DRAM(); if (phys_start > U32_MAX) - zone_mask = PHYS_ADDR_MAX; - else if (phys_start > zone_mask) - zone_mask = U32_MAX; + zone_limit = PHYS_ADDR_MAX; + else if (phys_start > zone_limit) + zone_limit = U32_MAX; - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; } static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; - unsigned int __maybe_unused acpi_zone_dma_bits; - unsigned int __maybe_unused dt_zone_dma_bits; - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); + phys_addr_t __maybe_unused acpi_zone_dma_limit; + phys_addr_t __maybe_unused dt_zone_dma_limit; + phys_addr_t __maybe_unused dma32_phys_limit = + max_zone_phys(DMA_BIT_MASK(32)); #ifdef CONFIG_ZONE_DMA - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); + zone_dma_limit = min(dt_zone_dma_limit, 
acpi_zone_dma_limit); + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d325217ab201..342c006cc1b8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) * everything else. GFP_DMA32 page allocations automatically fall back to * ZONE_DMA. * - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by * ZONE_DMA. @@ -252,13 +252,12 @@ void __init paging_init(void) * powerbooks. */ if (IS_ENABLED(CONFIG_PPC32)) - zone_dma_bits = 30; + zone_dma_limit = DMA_BIT_MASK(30); else - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = min(max_low_pfn, - 1UL << (zone_dma_bits - PAGE_SHIFT)); + max_zone_pfns[ZONE_DMA] = min(max_low_pfn, zone_dma_limit >> PAGE_SHIFT); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ddcd39ef4346..91fc2b91adfc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -97,7 +97,7 @@ void __init paging_init(void) vmem_map_init(); sparse_init(); - zone_dma_bits = 31; + zone_dma_limit = DMA_BIT_MASK(31); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index edbe13d00776..d7e30d4f7503 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -12,7 +12,7 @@ #include <linux/mem_encrypt.h> #include <linux/swiotlb.h> -extern unsigned int zone_dma_bits; 
+extern u64 zone_dma_limit; /* * Record the mapping of CPU physical to DMA addresses for a given region. diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3b4be4ca3b08..62b36fda44c9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -20,7 +20,7 @@ * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ -unsigned int zone_dma_bits __ro_after_init = 24; +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); + min_mask = min_t(u64, min_mask, zone_dma_limit); return mask >= phys_to_dma_unencrypted(dev, min_mask); } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d10613eb0f63..7b04f7575796 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) /* CMA can't cross zone boundaries, see cma_activate_area() */ end = cma_get_base(cma) + size - 1; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) - return end <= DMA_BIT_MASK(zone_dma_bits); + return end <= zone_dma_limit; if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - return end <= DMA_BIT_MASK(32); + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); return true; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 043b0ecd3e8d..bb51bd5335ad 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); + io_tlb_default_mem.phys_limit = zone_dma_limit; else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) - 
io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif