diff mbox series

[v5,2/3] dma: replace zone_dma_bits by zone_dma_limit

Message ID 5821a1b2eb82847ccbac0945da040518d6f6f16b.1722578375.git.baruch@tkos.co.il (mailing list archive)
State New, archived
Headers show
Series dma: support DMA zone starting above 4GB | expand

Commit Message

Baruch Siach Aug. 2, 2024, 6:03 a.m. UTC
From: Catalin Marinas <catalin.marinas@arm.com>

Hardware DMA limit might not be a power of 2. When RAM range starts above
0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
cannot encode this limit.

Use plain address for DMA zone limit.

Since DMA zone can now potentially span beyond 4GB physical limit of
DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Co-developed-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
---
 arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
 arch/powerpc/mm/mem.c      |  9 ++++-----
 arch/s390/mm/init.c        |  2 +-
 include/linux/dma-direct.h |  2 +-
 kernel/dma/direct.c        |  4 ++--
 kernel/dma/pool.c          |  4 ++--
 kernel/dma/swiotlb.c       |  4 ++--
 7 files changed, 27 insertions(+), 28 deletions(-)

Comments

Catalin Marinas Aug. 2, 2024, 9:37 a.m. UTC | #1
On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote:
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 3b4be4ca3b08..62b36fda44c9 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,7 @@
>   * it for entirely different regions. In that case the arch code needs to
>   * override the variable below for dma-direct to work properly.
>   */
> -unsigned int zone_dma_bits __ro_after_init = 24;
> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);

u64 here makes sense even if it may be larger than phys_addr_t. It
matches the phys_limit type in the swiotlb code. The compilers should no
longer complain.

> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index d10613eb0f63..7b04f7575796 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>  	/* CMA can't cross zone boundaries, see cma_activate_area() */
>  	end = cma_get_base(cma) + size - 1;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> -		return end <= DMA_BIT_MASK(zone_dma_bits);
> +		return end <= zone_dma_limit;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> -		return end <= DMA_BIT_MASK(32);
> +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>  	return true;
>  }
>  
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 043b0ecd3e8d..bb51bd5335ad 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>  	if (!remap)
>  		io_tlb_default_mem.can_grow = true;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> +		io_tlb_default_mem.phys_limit = zone_dma_limit;
>  	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>  	else
>  		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>  #endif

These two look correct to me now and it's the least intrusive (the
alternative would have been a zone_dma32_limit). The arch code, however,
needs to ensure that zone_dma_limit can always support 32-bit devices
even if it is above 4GB (with the relevant dma offsets in place for such
devices).
Petr Tesařík Aug. 7, 2024, 2:19 p.m. UTC | #2
On Fri, 2 Aug 2024 10:37:38 +0100
Catalin Marinas <catalin.marinas@arm.com> wrote:

> On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote:
> > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > index 3b4be4ca3b08..62b36fda44c9 100644
> > --- a/kernel/dma/direct.c
> > +++ b/kernel/dma/direct.c
> > @@ -20,7 +20,7 @@
> >   * it for entirely different regions. In that case the arch code needs to
> >   * override the variable below for dma-direct to work properly.
> >   */
> > -unsigned int zone_dma_bits __ro_after_init = 24;
> > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);  
> 
> u64 here makes sense even if it may be larger than phys_addr_t. It
> matches the phys_limit type in the swiotlb code. The compilers should no
> longer complain.

FTR I have never quite understood why phys_limit is u64, but u64 was
already used all around the place when I first looked into swiotlb.

> > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> > index d10613eb0f63..7b04f7575796 100644
> > --- a/kernel/dma/pool.c
> > +++ b/kernel/dma/pool.c
> > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
> >  	/* CMA can't cross zone boundaries, see cma_activate_area() */
> >  	end = cma_get_base(cma) + size - 1;
> >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> > -		return end <= DMA_BIT_MASK(zone_dma_bits);
> > +		return end <= zone_dma_limit;
> >  	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> > -		return end <= DMA_BIT_MASK(32);
> > +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
> >  	return true;
> >  }
> >  
> > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > index 043b0ecd3e8d..bb51bd5335ad 100644
> > --- a/kernel/dma/swiotlb.c
> > +++ b/kernel/dma/swiotlb.c
> > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
> >  	if (!remap)
> >  		io_tlb_default_mem.can_grow = true;
> >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> > +		io_tlb_default_mem.phys_limit = zone_dma_limit;
> >  	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> > +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
> >  	else
> >  		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
> >  #endif  
> 
> These two look correct to me now and it's the least intrusive (the
> alternative would have been a zone_dma32_limit). The arch code, however,
> needs to ensure that zone_dma_limit can always support 32-bit devices
> even if it is above 4GB (with the relevant dma offsets in place for such
> devices).

Just to make sure, the DMA zone (if present) must map to at most 32-bit
bus address space (possibly behind a bridge). Is that what you're
saying?

Petr T
Petr Tesařík Aug. 7, 2024, 2:30 p.m. UTC | #3
On Fri,  2 Aug 2024 09:03:47 +0300
Baruch Siach <baruch@tkos.co.il> wrote:

> From: Catalin Marinas <catalin.marinas@arm.com>
> 
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
> 
> Use plain address for DMA zone limit.
> 
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
> ---
>  arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>  arch/powerpc/mm/mem.c      |  9 ++++-----
>  arch/s390/mm/init.c        |  2 +-
>  include/linux/dma-direct.h |  2 +-
>  kernel/dma/direct.c        |  4 ++--
>  kernel/dma/pool.c          |  4 ++--
>  kernel/dma/swiotlb.c       |  4 ++--
>  7 files changed, 27 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9b5ab6818f7f..c45e2152ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>  }
>  
>  /*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> + * Return the maximum physical address for a zone given its limit.
> + * If DRAM starts above 32-bit, expand the zone to the maximum
>   * available memory, otherwise cap it at 32-bit.
>   */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>  {
> -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>  	phys_addr_t phys_start = memblock_start_of_DRAM();
>  
>  	if (phys_start > U32_MAX)
> -		zone_mask = PHYS_ADDR_MAX;
> -	else if (phys_start > zone_mask)
> -		zone_mask = U32_MAX;
> +		zone_limit = PHYS_ADDR_MAX;
> +	else if (phys_start > zone_limit)
> +		zone_limit = U32_MAX;
>  
> -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>  }
>  
>  static void __init zone_sizes_init(void)
>  {
>  	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
> -	unsigned int __maybe_unused acpi_zone_dma_bits;
> -	unsigned int __maybe_unused dt_zone_dma_bits;
> -	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> +	phys_addr_t __maybe_unused acpi_zone_dma_limit;
> +	phys_addr_t __maybe_unused dt_zone_dma_limit;
> +	phys_addr_t __maybe_unused dma32_phys_limit =
> +		max_zone_phys(DMA_BIT_MASK(32));
>  
>  #ifdef CONFIG_ZONE_DMA
> -	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> -	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> -	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> -	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> +	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> +	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> +	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> +	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>  	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index d325217ab201..342c006cc1b8 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
>   * everything else. GFP_DMA32 page allocations automatically fall back to
>   * ZONE_DMA.
>   *
> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
>   * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
>   * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
>   * ZONE_DMA.
> @@ -252,13 +252,12 @@ void __init paging_init(void)
>  	 * powerbooks.
>  	 */
>  	if (IS_ENABLED(CONFIG_PPC32))
> -		zone_dma_bits = 30;
> +		zone_dma_limit = DMA_BIT_MASK(30);
>  	else
> -		zone_dma_bits = 31;
> +		zone_dma_limit = DMA_BIT_MASK(31);
>  
>  #ifdef CONFIG_ZONE_DMA
> -	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
> -				      1UL << (zone_dma_bits - PAGE_SHIFT));
> +	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn, zone_dma_limit >> PAGE_SHIFT);

No big deal, but this is off by one. DMA_BIT_MASK() returns the highest
address that can be represented with the given number of bits, whereas
max_zone_pfns[] contains the lowest PFN that is NOT contained in the
zone.

Rest of the patch looks perfect.

Petr T
Catalin Marinas Aug. 7, 2024, 6:14 p.m. UTC | #4
On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote:
> On Fri, 2 Aug 2024 10:37:38 +0100
> Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote:
> > > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > > index 3b4be4ca3b08..62b36fda44c9 100644
> > > --- a/kernel/dma/direct.c
> > > +++ b/kernel/dma/direct.c
> > > @@ -20,7 +20,7 @@
> > >   * it for entirely different regions. In that case the arch code needs to
> > >   * override the variable below for dma-direct to work properly.
> > >   */
> > > -unsigned int zone_dma_bits __ro_after_init = 24;
> > > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);  
> > 
> > u64 here makes sense even if it may be larger than phys_addr_t. It
> > matches the phys_limit type in the swiotlb code. The compilers should no
> > longer complain.
> 
> FTR I have never quite understood why phys_limit is u64, but u64 was
> already used all around the place when I first looked into swiotlb.
> 
> > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> > > index d10613eb0f63..7b04f7575796 100644
> > > --- a/kernel/dma/pool.c
> > > +++ b/kernel/dma/pool.c
> > > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
> > >  	/* CMA can't cross zone boundaries, see cma_activate_area() */
> > >  	end = cma_get_base(cma) + size - 1;
> > >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> > > -		return end <= DMA_BIT_MASK(zone_dma_bits);
> > > +		return end <= zone_dma_limit;
> > >  	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> > > -		return end <= DMA_BIT_MASK(32);
> > > +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
> > >  	return true;
> > >  }
> > >  
> > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > > index 043b0ecd3e8d..bb51bd5335ad 100644
> > > --- a/kernel/dma/swiotlb.c
> > > +++ b/kernel/dma/swiotlb.c
> > > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
> > >  	if (!remap)
> > >  		io_tlb_default_mem.can_grow = true;
> > >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> > > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> > > +		io_tlb_default_mem.phys_limit = zone_dma_limit;
> > >  	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> > > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> > > +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
> > >  	else
> > >  		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
> > >  #endif  
> > 
> > These two look correct to me now and it's the least intrusive (the
> > alternative would have been a zone_dma32_limit). The arch code, however,
> > needs to ensure that zone_dma_limit can always support 32-bit devices
> > even if it is above 4GB (with the relevant dma offsets in place for such
> > devices).
> 
> Just to make sure, the DMA zone (if present) must map to at most 32-bit
> bus address space (possibly behind a bridge). Is that what you're
> saying?

No exactly. What I'm trying to say is that on arm64 zone_dma_limit can
go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address.
In such cases, ZONE_DMA32 is empty.

TBH, this code is confusing and not entirely suitable for a system where
the CPU address offsets are not 0. The device::dma_coherent_mask is
about the bus address range and phys_limit is calculated correctly in
functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t.
DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to
be about the CPU address ranges in some cases (in other cases
DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus
address).

On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA
should end at 33GB. That's 30-bit mask in bus address terms but
something not a power of two for the CPU address, hence the
zone_dma_limit introduced here.

With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at
4GB CPU address, it doesn't really work for such platforms. If there are
32-bit devices with a corresponding CPU address offset, ZONE_DMA32
should end at 36GB on Baruch's platform. But to simplify things, we just
ignore this on arm64 and make ZONE_DMA32 empty.

In some cases where we have the device structure we could instead do a
dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if
we really want to address this properly, we'd need to introduce a
zone_dma32_limit that's initialised by the arch code. For arm64, I'm
happy with just having an empty ZONE_DMA32 on such platforms.
Petr Tesařík Aug. 8, 2024, 9:35 a.m. UTC | #5
On Wed, 7 Aug 2024 19:14:58 +0100
Catalin Marinas <catalin.marinas@arm.com> wrote:

> On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote:
> > On Fri, 2 Aug 2024 10:37:38 +0100
> > Catalin Marinas <catalin.marinas@arm.com> wrote:  
> > > On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote:  
> > > > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > > > index 3b4be4ca3b08..62b36fda44c9 100644
> > > > --- a/kernel/dma/direct.c
> > > > +++ b/kernel/dma/direct.c
> > > > @@ -20,7 +20,7 @@
> > > >   * it for entirely different regions. In that case the arch code needs to
> > > >   * override the variable below for dma-direct to work properly.
> > > >   */
> > > > -unsigned int zone_dma_bits __ro_after_init = 24;
> > > > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);    
> > > 
> > > u64 here makes sense even if it may be larger than phys_addr_t. It
> > > matches the phys_limit type in the swiotlb code. The compilers should no
> > > longer complain.  
> > 
> > FTR I have never quite understood why phys_limit is u64, but u64 was
> > already used all around the place when I first looked into swiotlb.
> >   
> > > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> > > > index d10613eb0f63..7b04f7575796 100644
> > > > --- a/kernel/dma/pool.c
> > > > +++ b/kernel/dma/pool.c
> > > > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
> > > >  	/* CMA can't cross zone boundaries, see cma_activate_area() */
> > > >  	end = cma_get_base(cma) + size - 1;
> > > >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> > > > -		return end <= DMA_BIT_MASK(zone_dma_bits);
> > > > +		return end <= zone_dma_limit;
> > > >  	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> > > > -		return end <= DMA_BIT_MASK(32);
> > > > +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
> > > >  	return true;
> > > >  }
> > > >  
> > > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > > > index 043b0ecd3e8d..bb51bd5335ad 100644
> > > > --- a/kernel/dma/swiotlb.c
> > > > +++ b/kernel/dma/swiotlb.c
> > > > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
> > > >  	if (!remap)
> > > >  		io_tlb_default_mem.can_grow = true;
> > > >  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> > > > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> > > > +		io_tlb_default_mem.phys_limit = zone_dma_limit;
> > > >  	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> > > > -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> > > > +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
> > > >  	else
> > > >  		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
> > > >  #endif    
> > > 
> > > These two look correct to me now and it's the least intrusive (the
> > > alternative would have been a zone_dma32_limit). The arch code, however,
> > > needs to ensure that zone_dma_limit can always support 32-bit devices
> > > even if it is above 4GB (with the relevant dma offsets in place for such
> > > devices).  
> > 
> > Just to make sure, the DMA zone (if present) must map to at most 32-bit
> > bus address space (possibly behind a bridge). Is that what you're
> > saying?  
> 
> No exactly. What I'm trying to say is that on arm64 zone_dma_limit can
> go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address.
> In such cases, ZONE_DMA32 is empty.
> 
> TBH, this code is confusing and not entirely suitable for a system where
> the CPU address offsets are not 0. The device::dma_coherent_mask is
> about the bus address range and phys_limit is calculated correctly in
> functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t.
> DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to
> be about the CPU address ranges in some cases (in other cases
> DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus
> address).

Yes, I know.

> On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA
> should end at 33GB. That's 30-bit mask in bus address terms but
> something not a power of two for the CPU address, hence the
> zone_dma_limit introduced here.

Yes, I was watching the discussion.

> With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at
> 4GB CPU address, it doesn't really work for such platforms. If there are
> 32-bit devices with a corresponding CPU address offset, ZONE_DMA32
> should end at 36GB on Baruch's platform. But to simplify things, we just
> ignore this on arm64 and make ZONE_DMA32 empty.

Ah. That makes sense. It also seems to support my theory that Linux
memory zones are an obsolete concept and should be replaced by a
different mechanism.

> In some cases where we have the device structure we could instead do a
> dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if
> we really want to address this properly, we'd need to introduce a
> zone_dma32_limit that's initialised by the arch code. For arm64, I'm
> happy with just having an empty ZONE_DMA32 on such platforms.

The obvious caveat is that zone boundaries are system-wide, but the
mapping between bus addresses and CPU addresses depends on the device
structure. After all, that's why dma_to_phys takes the device as a
parameter... In fact, a system may have multiple busses behind
different bridges with a different offset applied by each.

FYI I want to make more people aware of these issues at this year's
Plumbers, see https://lpc.events/event/18/contributions/1776/

Petr T
Robin Murphy Aug. 8, 2024, 10:01 a.m. UTC | #6
On 2024-08-08 10:35 am, Petr Tesařík wrote:
> On Wed, 7 Aug 2024 19:14:58 +0100
> Catalin Marinas <catalin.marinas@arm.com> wrote:
> 
>> On Wed, Aug 07, 2024 at 04:19:38PM +0200, Petr Tesařík wrote:
>>> On Fri, 2 Aug 2024 10:37:38 +0100
>>> Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>> On Fri, Aug 02, 2024 at 09:03:47AM +0300, Baruch Siach wrote:
>>>>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>>>>> index 3b4be4ca3b08..62b36fda44c9 100644
>>>>> --- a/kernel/dma/direct.c
>>>>> +++ b/kernel/dma/direct.c
>>>>> @@ -20,7 +20,7 @@
>>>>>    * it for entirely different regions. In that case the arch code needs to
>>>>>    * override the variable below for dma-direct to work properly.
>>>>>    */
>>>>> -unsigned int zone_dma_bits __ro_after_init = 24;
>>>>> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>>>>
>>>> u64 here makes sense even if it may be larger than phys_addr_t. It
>>>> matches the phys_limit type in the swiotlb code. The compilers should no
>>>> longer complain.
>>>
>>> FTR I have never quite understood why phys_limit is u64, but u64 was
>>> already used all around the place when I first looked into swiotlb.
>>>    
>>>>> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
>>>>> index d10613eb0f63..7b04f7575796 100644
>>>>> --- a/kernel/dma/pool.c
>>>>> +++ b/kernel/dma/pool.c
>>>>> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>>>>>   	/* CMA can't cross zone boundaries, see cma_activate_area() */
>>>>>   	end = cma_get_base(cma) + size - 1;
>>>>>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>>>>> -		return end <= DMA_BIT_MASK(zone_dma_bits);
>>>>> +		return end <= zone_dma_limit;
>>>>>   	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>>>>> -		return end <= DMA_BIT_MASK(32);
>>>>> +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>>>>>   	return true;
>>>>>   }
>>>>>   
>>>>> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
>>>>> index 043b0ecd3e8d..bb51bd5335ad 100644
>>>>> --- a/kernel/dma/swiotlb.c
>>>>> +++ b/kernel/dma/swiotlb.c
>>>>> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>>>>>   	if (!remap)
>>>>>   		io_tlb_default_mem.can_grow = true;
>>>>>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
>>>>> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
>>>>> +		io_tlb_default_mem.phys_limit = zone_dma_limit;
>>>>>   	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
>>>>> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
>>>>> +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>>>>>   	else
>>>>>   		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>>>>>   #endif
>>>>
>>>> These two look correct to me now and it's the least intrusive (the
>>>> alternative would have been a zone_dma32_limit). The arch code, however,
>>>> needs to ensure that zone_dma_limit can always support 32-bit devices
>>>> even if it is above 4GB (with the relevant dma offsets in place for such
>>>> devices).
>>>
>>> Just to make sure, the DMA zone (if present) must map to at most 32-bit
>>> bus address space (possibly behind a bridge). Is that what you're
>>> saying?
>>
>> No exactly. What I'm trying to say is that on arm64 zone_dma_limit can
>> go beyond DMA_BIT_MASK(32) when the latter is treated as a CPU address.
>> In such cases, ZONE_DMA32 is empty.
>>
>> TBH, this code is confusing and not entirely suitable for a system where
>> the CPU address offsets are not 0. The device::dma_coherent_mask is
>> about the bus address range and phys_limit is calculated correctly in
>> functions like dma_direct_optimal_gfp_mask(). But that's about it w.r.t.
>> DMA bit masks because zone_dma_bits and DMA_BIT_MASK(32) are assumed to
>> be about the CPU address ranges in some cases (in other cases
>> DMA_BIT_MASK() is used to initialise dma_coherent_mask, so more of a bus
>> address).
> 
> Yes, I know.
> 
>> On the platform Baruch is trying to fix, RAM starts at 32GB and ZONE_DMA
>> should end at 33GB. That's 30-bit mask in bus address terms but
>> something not a power of two for the CPU address, hence the
>> zone_dma_limit introduced here.
> 
> Yes, I was watching the discussion.
> 
>> With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at
>> 4GB CPU address, it doesn't really work for such platforms. If there are
>> 32-bit devices with a corresponding CPU address offset, ZONE_DMA32
>> should end at 36GB on Baruch's platform. But to simplify things, we just
>> ignore this on arm64 and make ZONE_DMA32 empty.
> 
> Ah. That makes sense. It also seems to support my theory that Linux
> memory zones are an obsolete concept and should be replaced by a
> different mechanism.
> 
>> In some cases where we have the device structure we could instead do a
>> dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if
>> we really want to address this properly, we'd need to introduce a
>> zone_dma32_limit that's initialised by the arch code. For arm64, I'm
>> happy with just having an empty ZONE_DMA32 on such platforms.
> 
> The obvious caveat is that zone boundaries are system-wide, but the
> mapping between bus addresses and CPU addresses depends on the device
> structure. After all, that's why dma_to_phys takes the device as a
> parameter... In fact, a system may have multiple busses behind
> different bridges with a different offset applied by each.

Right, that's why the *_dma_get_max_cpu_address() functions already walk 
all known bus translations backwards to find the lowest common 
denominator in the CPU address space. In principle we could also 
calculate the lowest translated 32-bit DMA address from every >32-bit 
range in the same way, however that represents enough extra complexity 
that it doesn't seem worth trying to implement unless and until someone 
actually has a clear need for it.

Thanks,
Robin.

> 
> FYI I want to make more people aware of these issues at this year's
> Plumbers, see https://lpc.events/event/18/contributions/1776/
> 
> Petr T
Catalin Marinas Aug. 8, 2024, 1:46 p.m. UTC | #7
On Thu, Aug 08, 2024 at 11:35:01AM +0200, Petr Tesařík wrote:
> On Wed, 7 Aug 2024 19:14:58 +0100
> Catalin Marinas <catalin.marinas@arm.com> wrote:
> > With ZONE_DMA32, since all the DMA code assumes that ZONE_DMA32 ends at
> > 4GB CPU address, it doesn't really work for such platforms. If there are
> > 32-bit devices with a corresponding CPU address offset, ZONE_DMA32
> > should end at 36GB on Baruch's platform. But to simplify things, we just
> > ignore this on arm64 and make ZONE_DMA32 empty.
> 
> Ah. That makes sense. It also seems to support my theory that Linux
> memory zones are an obsolete concept and should be replaced by a
> different mechanism.

I agree, they are too coarse-grained. From an API perspective, what we
need is an alloc_pages() that takes a DMA mask or phys address limit,
maybe something similar to memblock_alloc_range_nid(). OTOH, an
advantage of the zones is that by default you keep the lower memory free
by using ZONE_NORMAL as default, you have free lists per zone. Maybe
with some alternative data structures we could efficiently search free
pages based on phys ranges or bitmasks and get rid of the zones but I
haven't put any thoughts into it.

We'd still need some boundaries like *_dma_get_max_cpu_address() to at
least allocate an swiotlb buffer that's suitable for all devices.

> > In some cases where we have the device structure we could instead do a
> > dma_to_phys(DMA_BIT_MASK(32)) but not in the two cases above. I guess if
> > we really want to address this properly, we'd need to introduce a
> > zone_dma32_limit that's initialised by the arch code. For arm64, I'm
> > happy with just having an empty ZONE_DMA32 on such platforms.
> 
> The obvious caveat is that zone boundaries are system-wide, but the
> mapping between bus addresses and CPU addresses depends on the device
> structure. After all, that's why dma_to_phys takes the device as a
> parameter... In fact, a system may have multiple busses behind
> different bridges with a different offset applied by each.

Indeed, and as Robin mentioned, the ACPI/DT code already handle this.

> FYI I want to make more people aware of these issues at this year's
> Plumbers, see https://lpc.events/event/18/contributions/1776/

Looking forward to this. I'll dial in, unfortunately can't make Plumbers
in person this year.

In the meantime, I think this series is a good compromise ;).
diff mbox series

Patch

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9b5ab6818f7f..c45e2152ca9e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -115,35 +115,35 @@  static void __init arch_reserve_crashkernel(void)
 }
 
 /*
- * Return the maximum physical address for a zone accessible by the given bits
- * limit. If DRAM starts above 32-bit, expand the zone to the maximum
+ * Return the maximum physical address for a zone given its limit.
+ * If DRAM starts above 32-bit, expand the zone to the maximum
  * available memory, otherwise cap it at 32-bit.
  */
-static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
+static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
 {
-	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
 	phys_addr_t phys_start = memblock_start_of_DRAM();
 
 	if (phys_start > U32_MAX)
-		zone_mask = PHYS_ADDR_MAX;
-	else if (phys_start > zone_mask)
-		zone_mask = U32_MAX;
+		zone_limit = PHYS_ADDR_MAX;
+	else if (phys_start > zone_limit)
+		zone_limit = U32_MAX;
 
-	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
+	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
 }
 
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
-	unsigned int __maybe_unused acpi_zone_dma_bits;
-	unsigned int __maybe_unused dt_zone_dma_bits;
-	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
+	phys_addr_t __maybe_unused acpi_zone_dma_limit;
+	phys_addr_t __maybe_unused dt_zone_dma_limit;
+	phys_addr_t __maybe_unused dma32_phys_limit =
+		max_zone_phys(DMA_BIT_MASK(32));
 
 #ifdef CONFIG_ZONE_DMA
-	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
-	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
-	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
-	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
+	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
+	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
+	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
+	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
 	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
 #endif
 #ifdef CONFIG_ZONE_DMA32
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d325217ab201..342c006cc1b8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -216,7 +216,7 @@  static int __init mark_nonram_nosave(void)
  * everything else. GFP_DMA32 page allocations automatically fall back to
  * ZONE_DMA.
  *
- * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
  * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
  * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
  * ZONE_DMA.
@@ -252,13 +252,12 @@  void __init paging_init(void)
 	 * powerbooks.
 	 */
 	if (IS_ENABLED(CONFIG_PPC32))
-		zone_dma_bits = 30;
+		zone_dma_limit = DMA_BIT_MASK(30);
 	else
-		zone_dma_bits = 31;
+		zone_dma_limit = DMA_BIT_MASK(31);
 
 #ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
-				      1UL << (zone_dma_bits - PAGE_SHIFT));
+	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn, zone_dma_limit >> PAGE_SHIFT);
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ddcd39ef4346..91fc2b91adfc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -97,7 +97,7 @@  void __init paging_init(void)
 
 	vmem_map_init();
 	sparse_init();
-	zone_dma_bits = 31;
+	zone_dma_limit = DMA_BIT_MASK(31);
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index edbe13d00776..d7e30d4f7503 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -12,7 +12,7 @@ 
 #include <linux/mem_encrypt.h>
 #include <linux/swiotlb.h>
 
-extern unsigned int zone_dma_bits;
+extern u64 zone_dma_limit;
 
 /*
  * Record the mapping of CPU physical to DMA addresses for a given region.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 3b4be4ca3b08..62b36fda44c9 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -20,7 +20,7 @@ 
  * it for entirely different regions. In that case the arch code needs to
  * override the variable below for dma-direct to work properly.
  */
-unsigned int zone_dma_bits __ro_after_init = 24;
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
 
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
 		phys_addr_t phys)
@@ -580,7 +580,7 @@  int dma_direct_supported(struct device *dev, u64 mask)
 	 * part of the check.
 	 */
 	if (IS_ENABLED(CONFIG_ZONE_DMA))
-		min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+		min_mask = min_t(u64, min_mask, zone_dma_limit);
 	return mask >= phys_to_dma_unencrypted(dev, min_mask);
 }
 
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d10613eb0f63..7b04f7575796 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,9 +70,9 @@  static bool cma_in_zone(gfp_t gfp)
 	/* CMA can't cross zone boundaries, see cma_activate_area() */
 	end = cma_get_base(cma) + size - 1;
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
-		return end <= DMA_BIT_MASK(zone_dma_bits);
+		return end <= zone_dma_limit;
 	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
-		return end <= DMA_BIT_MASK(32);
+		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
 	return true;
 }
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 043b0ecd3e8d..bb51bd5335ad 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -450,9 +450,9 @@  int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 	if (!remap)
 		io_tlb_default_mem.can_grow = true;
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
-		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+		io_tlb_default_mem.phys_limit = zone_dma_limit;
 	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
-		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
 	else
 		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
 #endif