diff mbox series

[rdma-next,1/2] arm64/io: add memcpy_toio_64

Message ID c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org (mailing list archive)
State New, archived
Headers show
Series Add and use memcpy_toio_64() | expand

Commit Message

Leon Romanovsky Nov. 23, 2023, 7:04 p.m. UTC
From: Jason Gunthorpe <jgg@nvidia.com>

The kernel supports write combining IO memory which is commonly used to
generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism
is pretty tolerant and a simple C loop will suffice to generate a 64 byte
TLP.

However modern ARM64 CPUs are quite sensitive and a compiler generated
loop is not enough to reliably generate a 64 byte TLP. Especially given
the ARM64 issue that writel() does not codegen anything other than "[xN]"
as the address calculation.

These newer CPUs require an orderly consecutive block of stores to work
reliably. This is best done with four STP integer instructions (perhaps
ST64B in future), or a single ST4 vector instruction.

Provide a new generic function memcpy_toio_64() which should reliably
generate the needed instructions for the architecture, assuming address
alignment. As the usual need for this operation is performance sensitive a
fast inline implementation is preferred.

Implement an optimized version on ARM that is a block of 4 STP
instructions.

The generic implementation is just a simple loop. x86-64 (clang 16)
compiles this into an unrolled loop of 16 movq pairs.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 arch/arm64/include/asm/io.h | 20 ++++++++++++++++++++
 include/asm-generic/io.h    | 30 ++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

Comments

Mark Rutland Nov. 24, 2023, 10:16 a.m. UTC | #1
On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> The kernel supports write combining IO memory which is commonly used to
> generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism
> is pretty tolerant and a simple C loop will suffice to generate a 64 byte
> TLP.
> 
> However modern ARM64 CPUs are quite sensitive and a compiler generated
> loop is not enough to reliably generate a 64 byte TLP. Especially given
> the ARM64 issue that writel() does not codegen anything other than "[xN]"
> as the address calculation.
> 
> These newer CPUs require an orderly consecutive block of stores to work
> reliably. This is best done with four STP integer instructions (perhaps
> ST64B in future), or a single ST4 vector instruction.
> 
> Provide a new generic function memcpy_toio_64() which should reliably
> generate the needed instructions for the architecture, assuming address
> alignment. As the usual need for this operation is performance sensitive a
> fast inline implementation is preferred.

There is *no* architectural sequence that is guaranteed to reliably generate a
64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU
*always* merged adjacent stores, we can take an interrupt mid-sequence that
would prevent that).

What's the actual requirement here? Is this just for performance?

Mark.

> Implement an optimized version on ARM that is a block of 4 STP
> instructions.
> 
> The generic implementation is just a simple loop. x86-64 (clang 16)
> compiles this into an unrolled loop of 16 movq pairs.
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arch@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  arch/arm64/include/asm/io.h | 20 ++++++++++++++++++++
>  include/asm-generic/io.h    | 30 ++++++++++++++++++++++++++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> index 3b694511b98f..73ab91913790 100644
> --- a/arch/arm64/include/asm/io.h
> +++ b/arch/arm64/include/asm/io.h
> @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
>  #define memcpy_fromio(a,c,l)	__memcpy_fromio((a),(c),(l))
>  #define memcpy_toio(c,a,l)	__memcpy_toio((c),(a),(l))
>  
> +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from)
> +{
> +	const u64 *from64 = from;
> +
> +	/*
> +	 * Newer ARM core have sensitive write combining buffers, it is
> +	 * important that the stores be contiguous blocks of store instructions.
> +	 * Normal memcpy does not work reliably.
> +	 */
> +	asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
> +		     "stp %x2, %x3, [%8, #16 * 1]\n"
> +		     "stp %x4, %x5, [%8, #16 * 2]\n"
> +		     "stp %x6, %x7, [%8, #16 * 3]\n"
> +		     :
> +		     : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]),
> +		       "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]),
> +		       "rZ"(from64[6]), "rZ"(from64[7]), "r"(to));
> +}
> +#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from)
> +
>  /*
>   * I/O memory mapping functions.
>   */
> diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
> index bac63e874c7b..2d6d60ed2128 100644
> --- a/include/asm-generic/io.h
> +++ b/include/asm-generic/io.h
> @@ -1202,6 +1202,36 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
>  }
>  #endif
>  
> +#ifndef memcpy_toio_64
> +#define memcpy_toio_64 memcpy_toio_64
> +/**
> + * memcpy_toio_64	Copy 64 bytes of data into I/O memory
> + * @dst:		The (I/O memory) destination for the copy
> + * @src:		The (RAM) source for the data
> + * @count:		The number of bytes to copy
> + *
> + * dst and src must be aligned to 8 bytes. This operation copies exactly 64
> + * bytes. It is intended to be used for write combining IO memory. The
> + * architecture should provide an implementation that has a high chance of
> + * generating a single combined transaction.
> + */
> +static inline void memcpy_toio_64(volatile void __iomem *addr,
> +				  const void *buffer)
> +{
> +	unsigned int i = 0;
> +
> +#if BITS_PER_LONG == 64
> +	for (; i != 8; i++)
> +		__raw_writeq(((const u64 *)buffer)[i],
> +			     ((u64 __iomem *)addr) + i);
> +#else
> +	for (; i != 16; i++)
> +		__raw_writel(((const u32 *)buffer)[i],
> +			     ((u32 __iomem *)addr) + i);
> +#endif
> +}
> +#endif
> +
>  extern int devmem_is_allowed(unsigned long pfn);
>  
>  #endif /* __KERNEL__ */
> -- 
> 2.42.0
> 
>
Jason Gunthorpe Nov. 24, 2023, 12:23 p.m. UTC | #2
On Fri, Nov 24, 2023 at 10:16:15AM +0000, Mark Rutland wrote:
> On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > 
> > The kernel supports write combining IO memory which is commonly used to
> > generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism
> > is pretty tolerant and a simple C loop will suffice to generate a 64 byte
> > TLP.
> > 
> > However modern ARM64 CPUs are quite sensitive and a compiler generated
> > loop is not enough to reliably generate a 64 byte TLP. Especially given
> > the ARM64 issue that writel() does not codegen anything other than "[xN]"
> > as the address calculation.
> > 
> > These newer CPUs require an orderly consecutive block of stores to work
> > reliably. This is best done with four STP integer instructions (perhaps
> > ST64B in future), or a single ST4 vector instruction.
> > 
> > Provide a new generic function memcpy_toio_64() which should reliably
> > generate the needed instructions for the architecture, assuming address
> > alignment. As the usual need for this operation is performance sensitive a
> > fast inline implementation is preferred.
> 
> There is *no* architectural sequence that is guaranteed to reliably generate a
> 64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU
> *always* merged adjacent stores, we can take an interrupt mid-sequence that
> would prevent that).

WC is not guaranteed on any arch, that is well known.

The HW has means to handle fragmented TLPs, it just hurts performance
when it happens. "reliable" here means we'd like to see something like
a > 90% chance of the large TLP instead of the < 1% chance with the C
loop.

Future ARM CPUs have the ST64B instruction which does provide the
architectural guarantee, and x86 has a similar guaranteed instruction
now too. 

> What's the actual requirement here? Is this just for performance?

Yes, just performance.

Jason
Robin Murphy Nov. 24, 2023, 12:58 p.m. UTC | #3
On 23/11/2023 7:04 pm, Leon Romanovsky wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> The kernel supports write combining IO memory which is commonly used to
> generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism
> is pretty tolerant and a simple C loop will suffice to generate a 64 byte
> TLP.
> 
> However modern ARM64 CPUs are quite sensitive and a compiler generated
> loop is not enough to reliably generate a 64 byte TLP. Especially given
> the ARM64 issue that writel() does not codegen anything other than "[xN]"
> as the address calculation.
> 
> These newer CPUs require an orderly consecutive block of stores to work
> reliably. This is best done with four STP integer instructions (perhaps
> ST64B in future), or a single ST4 vector instruction.
> 
> Provide a new generic function memcpy_toio_64() which should reliably
> generate the needed instructions for the architecture, assuming address
> alignment. As the usual need for this operation is performance sensitive a
> fast inline implementation is preferred.
> 
> Implement an optimized version on ARM that is a block of 4 STP
> instructions.
> 
> The generic implementation is just a simple loop. x86-64 (clang 16)
> compiles this into an unrolled loop of 16 movq pairs.
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arch@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>   arch/arm64/include/asm/io.h | 20 ++++++++++++++++++++
>   include/asm-generic/io.h    | 30 ++++++++++++++++++++++++++++++
>   2 files changed, 50 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> index 3b694511b98f..73ab91913790 100644
> --- a/arch/arm64/include/asm/io.h
> +++ b/arch/arm64/include/asm/io.h
> @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
>   #define memcpy_fromio(a,c,l)	__memcpy_fromio((a),(c),(l))
>   #define memcpy_toio(c,a,l)	__memcpy_toio((c),(a),(l))
>   
> +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from)
> +{
> +	const u64 *from64 = from;
> +
> +	/*
> +	 * Newer ARM core have sensitive write combining buffers, it is
> +	 * important that the stores be contiguous blocks of store instructions.
> +	 * Normal memcpy does not work reliably.
> +	 */
> +	asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
> +		     "stp %x2, %x3, [%8, #16 * 1]\n"
> +		     "stp %x4, %x5, [%8, #16 * 2]\n"
> +		     "stp %x6, %x7, [%8, #16 * 3]\n"
> +		     :
> +		     : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]),
> +		       "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]),
> +		       "rZ"(from64[6]), "rZ"(from64[7]), "r"(to));

Is this correct for big-endian? LDP/STP are kinda tricksy in that regard.

Thanks,
Robin.

> +}
> +#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from)
> +
>   /*
>    * I/O memory mapping functions.
>    */
> diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
> index bac63e874c7b..2d6d60ed2128 100644
> --- a/include/asm-generic/io.h
> +++ b/include/asm-generic/io.h
> @@ -1202,6 +1202,36 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
>   }
>   #endif
>   
> +#ifndef memcpy_toio_64
> +#define memcpy_toio_64 memcpy_toio_64
> +/**
> + * memcpy_toio_64	Copy 64 bytes of data into I/O memory
> + * @dst:		The (I/O memory) destination for the copy
> + * @src:		The (RAM) source for the data
> + * @count:		The number of bytes to copy
> + *
> + * dst and src must be aligned to 8 bytes. This operation copies exactly 64
> + * bytes. It is intended to be used for write combining IO memory. The
> + * architecture should provide an implementation that has a high chance of
> + * generating a single combined transaction.
> + */
> +static inline void memcpy_toio_64(volatile void __iomem *addr,
> +				  const void *buffer)
> +{
> +	unsigned int i = 0;
> +
> +#if BITS_PER_LONG == 64
> +	for (; i != 8; i++)
> +		__raw_writeq(((const u64 *)buffer)[i],
> +			     ((u64 __iomem *)addr) + i);
> +#else
> +	for (; i != 16; i++)
> +		__raw_writel(((const u32 *)buffer)[i],
> +			     ((u32 __iomem *)addr) + i);
> +#endif
> +}
> +#endif
> +
>   extern int devmem_is_allowed(unsigned long pfn);
>   
>   #endif /* __KERNEL__ */
Jason Gunthorpe Nov. 24, 2023, 1:45 p.m. UTC | #4
On Fri, Nov 24, 2023 at 12:58:11PM +0000, Robin Murphy wrote:
> > diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> > index 3b694511b98f..73ab91913790 100644
> > --- a/arch/arm64/include/asm/io.h
> > +++ b/arch/arm64/include/asm/io.h
> > @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
> >   #define memcpy_fromio(a,c,l)	__memcpy_fromio((a),(c),(l))
> >   #define memcpy_toio(c,a,l)	__memcpy_toio((c),(a),(l))
> > +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from)
> > +{
> > +	const u64 *from64 = from;
> > +
> > +	/*
> > +	 * Newer ARM core have sensitive write combining buffers, it is
> > +	 * important that the stores be contiguous blocks of store instructions.
> > +	 * Normal memcpy does not work reliably.
> > +	 */
> > +	asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
> > +		     "stp %x2, %x3, [%8, #16 * 1]\n"
> > +		     "stp %x4, %x5, [%8, #16 * 2]\n"
> > +		     "stp %x6, %x7, [%8, #16 * 3]\n"
> > +		     :
> > +		     : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]),
> > +		       "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]),
> > +		       "rZ"(from64[6]), "rZ"(from64[7]), "r"(to));
> 
> Is this correct for big-endian? LDP/STP are kinda tricksy in that regard.

Uh.. I didn't think about it at all..

By no means do I have any skill reading the ARM documents, but I think
it is OK, it says:

Mem[address, dbytes, AccType_NORMAL] = data1;
Mem[address+dbytes, dbytes, AccType_NORMAL] = data2;

So I understand that as

Mem[%8, #16 * 0, 8, AccType_NORMAL] = from64[0]
Mem[%8, #16 * 0 + 1 , 8, AccType_NORMAL] = from64[1]
Mem[%8, #16 * 1, 8, AccType_NORMAL] = from64[2]
Mem[%8, #16 * 1 + 1, 8, AccType_NORMAL] = from64[3]
..

Which is the same on BE/LE?

But I don't know the pitfall to watch for here. This is memcpy so we
don't have to swap, the order of the bits in the register doesn't
matter.

Thanks,
Jason
Niklas Schnelle Nov. 24, 2023, 2:10 p.m. UTC | #5
On Thu, 2023-11-23 at 21:04 +0200, Leon Romanovsky wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> The kernel supports write combining IO memory which is commonly used to
> generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism
> is pretty tolerant and a simple C loop will suffice to generate a 64 byte
> TLP.
> 
> However modern ARM64 CPUs are quite sensitive and a compiler generated
> loop is not enough to reliably generate a 64 byte TLP. Especially given
> the ARM64 issue that writel() does not codegen anything other than "[xN]"
> as the address calculation.
> 
> These newer CPUs require an orderly consecutive block of stores to work
> reliably. This is best done with four STP integer instructions (perhaps
> ST64B in future), or a single ST4 vector instruction.
> 
> Provide a new generic function memcpy_toio_64() which should reliably
> generate the needed instructions for the architecture, assuming address
> alignment. As the usual need for this operation is performance sensitive a
> fast inline implementation is preferred.
> 
> Implement an optimized version on ARM that is a block of 4 STP
> instructions.
> 
> The generic implementation is just a simple loop. x86-64 (clang 16)
> compiles this into an unrolled loop of 16 movq pairs.
> 
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arch@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
---8<---
> +#ifndef memcpy_toio_64
> +#define memcpy_toio_64 memcpy_toio_64
> +/**
> + * memcpy_toio_64	Copy 64 bytes of data into I/O memory
> + * @dst:		The (I/O memory) destination for the copy
> + * @src:		The (RAM) source for the data
> + * @count:		The number of bytes to copy
> + *
> + * dst and src must be aligned to 8 bytes. This operation copies exactly 64
> + * bytes. It is intended to be used for write combining IO memory. The
> + * architecture should provide an implementation that has a high chance of
> + * generating a single combined transaction.
> + */
> +static inline void memcpy_toio_64(volatile void __iomem *addr,
> +				  const void *buffer)
> +{
> +	unsigned int i = 0;
> +
> +#if BITS_PER_LONG == 64
> +	for (; i != 8; i++)
> +		__raw_writeq(((const u64 *)buffer)[i],
> +			     ((u64 __iomem *)addr) + i);
> +#else
> +	for (; i != 16; i++)
> +		__raw_writel(((const u32 *)buffer)[i],
> +			     ((u32 __iomem *)addr) + i);
> +#endif

What's the reasoning behind not using the existing memcpy_toio() here?
For s390 the above generic variant would do 8 of our special PCI store
instructions while memcpy_toio() is defined to zpci_memcpy_toio() which
can do the same as a single PCI store block instruction. Now of course
we could provide our own memcpy_toio_64() but that would end up the
same as just doing memcpy_toio(addr, buffer, 64) here.

> +}
> +#endif
> +
>  extern int devmem_is_allowed(unsigned long pfn);
>  
>  #endif /* __KERNEL__ */
Jason Gunthorpe Nov. 24, 2023, 2:20 p.m. UTC | #6
On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote:
 
> What's the reasoning behind not using the existing memcpy_toio()
> here?

Going forward CPUs are implementing an instruction to do a 64 byte
aligned store, this is a wrapper for exactly that operation.

memcpy_toio() is much more general, it allows unaligned buffers and
non-multiples of 64. Adapting the general version to generate the
optimized version in the cases it can is complex and has a codegen
penalty..

> For s390 the above generic variant would do 8 of our special PCI store
> instructions while memcpy_toio() is defined to zpci_memcpy_toio() which
> can do the same as a single PCI store block instruction. Now of course
> we could provide our own memcpy_toio_64() but that would end up the
> same as just doing memcpy_toio(addr, buffer, 64) here.

This is probably better?

Jason
Niklas Schnelle Nov. 24, 2023, 2:48 p.m. UTC | #7
On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote:
> On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote:
>  
> > What's the reasoning behind not using the existing memcpy_toio()
> > here?
> 
> Going forward CPUs are implementing an instruction to do a 64 byte
> aligned store, this is a wrapper for exactly that operation.
> 
> memcpy_toio() is much more general, it allows unaligned buffers and
> non-multiples of 64. Adapting the general version to generate the
> optimized version in the cases it can is complex and has a codegen
> penalty..

I think you misunderstood me. I understand why you want a separate
memcpy_toio_64(). I just wonder if its generic implementation shouldn't
just be a define or inline wrapper for memcpy_toio(addr, buffer, 64).

For s390 that would already result in a single PCI store block which
for us is much much better than 8 consecutive __raw_writeq(). Our
zpci_memcpy_toio() still has some extra code to ensure alignment and
break it up in supported sizes that we could get rid of with our own
memcpy_toio_64() of course. I suspect though that since it's all inline
functions the compiler seeing the constant 64 might already eliminate
some of the extra code.

Also seeing the second patch of course that would no longer really test
for write combining for us which we can also do but I think that's okay
and you're probably going to use memcpy_toio_64() in more places and
there we really want the PCI store block.

Thanks,
Niklas
Niklas Schnelle Nov. 24, 2023, 2:53 p.m. UTC | #8
On Fri, 2023-11-24 at 15:48 +0100, Niklas Schnelle wrote:
> On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote:
> > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote:
> >  
---8<---
> 
> Also seeing the second patch of course that would no longer really test
> for write combining for us which we can also do but I think that's okay
> and you're probably going to use memcpy_toio_64() in more places and
> there we really want the PCI store block.

Disregard this part I didn't properly align commit message and code. I
thought you replaced the check with memcpy_toio_64().

Thanks,
Niklas
Jason Gunthorpe Nov. 24, 2023, 2:55 p.m. UTC | #9
On Fri, Nov 24, 2023 at 03:48:22PM +0100, Niklas Schnelle wrote:
> On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote:
> > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote:
> >  
> > > What's the reasoning behind not using the existing memcpy_toio()
> > > here?
> > 
> > Going forward CPUs are implementing an instruction to do a 64 byte
> > aligned store, this is a wrapper for exactly that operation.
> > 
> > memcpy_toio() is much more general, it allows unaligned buffers and
> > non-multiples of 64. Adapting the general version to generate the
> > optimized version in the cases it can is complex and has a codegen
> > penalty..
> 
> I think you misunderstood me. I understand why you want a separate
> memcpy_toio_64(). I just wonder if its generic implementation shouldn't
> just be a define or inline wrapper for memcpy_toio(addr, buffer, 64).

Oh, yes, I totally did.

I'm worried that x86 will less reliably generate write combining with
it's memcpy_toio implemention. It codegens byte copies for that
function :(

> Also seeing the second patch of course that would no longer really test
> for write combining for us which we can also do but I think that's okay
> and you're probably going to use memcpy_toio_64() in more places and
> there we really want the PCI store block.

Right now we don't have in-kernel performance use cases for write
combining for mlx5.

Userspace uses the WC and we already have the special 390 instructions
for batching in rdma-core already, IIRC.

So it would be appropriate for s390 to use a consistent path.

Jason
Robin Murphy Nov. 24, 2023, 3:32 p.m. UTC | #10
On 24/11/2023 1:45 pm, Jason Gunthorpe wrote:
> On Fri, Nov 24, 2023 at 12:58:11PM +0000, Robin Murphy wrote:
>>> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
>>> index 3b694511b98f..73ab91913790 100644
>>> --- a/arch/arm64/include/asm/io.h
>>> +++ b/arch/arm64/include/asm/io.h
>>> @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
>>>    #define memcpy_fromio(a,c,l)	__memcpy_fromio((a),(c),(l))
>>>    #define memcpy_toio(c,a,l)	__memcpy_toio((c),(a),(l))
>>> +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from)
>>> +{
>>> +	const u64 *from64 = from;
>>> +
>>> +	/*
>>> +	 * Newer ARM core have sensitive write combining buffers, it is
>>> +	 * important that the stores be contiguous blocks of store instructions.
>>> +	 * Normal memcpy does not work reliably.
>>> +	 */
>>> +	asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
>>> +		     "stp %x2, %x3, [%8, #16 * 1]\n"
>>> +		     "stp %x4, %x5, [%8, #16 * 2]\n"
>>> +		     "stp %x6, %x7, [%8, #16 * 3]\n"
>>> +		     :
>>> +		     : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]),
>>> +		       "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]),
>>> +		       "rZ"(from64[6]), "rZ"(from64[7]), "r"(to));
>>
>> Is this correct for big-endian? LDP/STP are kinda tricksy in that regard.
> 
> Uh.. I didn't think about it at all..
> 
> By no means do I have any skill reading the ARM documents, but I think
> it is OK, it says:
> 
> Mem[address, dbytes, AccType_NORMAL] = data1;
> Mem[address+dbytes, dbytes, AccType_NORMAL] = data2;
> 
> So I understand that as
> 
> Mem[%8, #16 * 0, 8, AccType_NORMAL] = from64[0]
> Mem[%8, #16 * 0 + 1 , 8, AccType_NORMAL] = from64[1]
> Mem[%8, #16 * 1, 8, AccType_NORMAL] = from64[2]
> Mem[%8, #16 * 1 + 1, 8, AccType_NORMAL] = from64[3]
> ..
> 
> Which is the same on BE/LE?
> 
> But I don't know the pitfall to watch for here. This is memcpy so we
> don't have to swap, the order of the bits in the register doesn't
> matter.

Indeed you're right - all the way back to Armv7 LDRD/STRD, I always get 
caught out by remembering the path which does an endian-dependent swap 
of the target registers, but forgetting that that's there to 
*counteract* the byteswap in Mem[] itself.

Cheers,
Robin.
Niklas Schnelle Nov. 24, 2023, 3:59 p.m. UTC | #11
On Fri, 2023-11-24 at 10:55 -0400, Jason Gunthorpe wrote:
> On Fri, Nov 24, 2023 at 03:48:22PM +0100, Niklas Schnelle wrote:
> > On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote:
> > > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote:
> > >  
> > > > What's the reasoning behind not using the existing memcpy_toio()
> > > > here?
> > > 
> > > Going forward CPUs are implementing an instruction to do a 64 byte
> > > aligned store, this is a wrapper for exactly that operation.
> > > 
> > > memcpy_toio() is much more general, it allows unaligned buffers and
> > > non-multiples of 64. Adapting the general version to generate the
> > > optimized version in the cases it can is complex and has a codegen
> > > penalty..
> > 
> > I think you misunderstood me. I understand why you want a separate
> > memcpy_toio_64(). I just wonder if its generic implementation shouldn't
> > just be a define or inline wrapper for memcpy_toio(addr, buffer, 64).
> 
> Oh, yes, I totally did.
> 
> I'm worried that x86 will less reliably generate write combining with
> it's memcpy_toio implemention. It codegens byte copies for that
> function :(

Oh ok I see what you mean.

> 
> > Also seeing the second patch of course that would no longer really test
> > for write combining for us which we can also do but I think that's okay
> > and you're probably going to use memcpy_toio_64() in more places and
> > there we really want the PCI store block.
> 
> Right now we don't have in-kernel performance use cases for write
> combining for mlx5.

Is the code in patch 2 performance critical?

> 
> Userspace uses the WC and we already have the special 390 instructions
> for batching in rdma-core already, IIRC.

Yes, I added that support to rdma-core :-)

> 
> So it would be appropriate for s390 to use a consistent path.
> 
> Jason

This should be as easy as adding

#define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64)

to arch/s390/include/asm/io.h. I'm wondering if we should do that as
part of this series. It's not as good as a special case but probably
better than the existing loop.

I don't think we have any existing in-kernel users of memcpy_toio() on
s390 so far though so I'd like to give this some extra testing. Could
you share instructions on how to exercise the code path of patch 2 on a
ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA?

Thanks,
Niklas
Jason Gunthorpe Nov. 24, 2023, 4:06 p.m. UTC | #12
On Fri, Nov 24, 2023 at 04:59:38PM +0100, Niklas Schnelle wrote:
 
> This should be as easy as adding
> 
> #define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64)
> 
> to arch/s390/include/asm/io.h. I'm wondering if we should do that as
> part of this series. It's not as good as a special case but probably
> better than the existing loop.

Makes sense

> I don't think we have any existing in-kernel users of memcpy_toio() on
> s390 so far though so I'd like to give this some extra testing. Could
> you share instructions on how to exercise the code path of patch 2 on a
> ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA?

Simply boot and look at pr_debug from mlx5 to see if writecombining is
on or off - you want to see on.

Thanks,
Jason
Catalin Marinas Nov. 27, 2023, 12:42 p.m. UTC | #13
On Fri, Nov 24, 2023 at 08:23:52AM -0400, Jason Gunthorpe wrote:
> On Fri, Nov 24, 2023 at 10:16:15AM +0000, Mark Rutland wrote:
> > On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
[...]
> > > Provide a new generic function memcpy_toio_64() which should reliably
> > > generate the needed instructions for the architecture, assuming address
> > > alignment. As the usual need for this operation is performance sensitive a
> > > fast inline implementation is preferred.
> > 
> > There is *no* architectural sequence that is guaranteed to reliably generate a
> > 64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU
> > *always* merged adjacent stores, we can take an interrupt mid-sequence that
> > would prevent that).
> 
> WC is not guaranteed on any arch, that is well known.
> 
> The HW has means to handle fragmented TLPs, it just hurts performance
> when it happens. "reliable" here means we'd like to see something like
> a > 90% chance of the large TLP instead of the < 1% chance with the C
> loop.
> 
> Future ARM CPUs have the ST64B instruction which does provide the
> architectural guarantee, and x86 has a similar guaranteed instruction
> now too. 
> 
> > What's the actual requirement here? Is this just for performance?
> 
> Yes, just performance.

Do you have any rough numbers (percentage)? It's highly
microarchitecture-dependent until we get the ST64B instruction.

More of a bike-shedding, I wonder whether the __iowrite*_copy()
semantics are better suited for what you need in terms of ordering (not
that mempcy_toio() to Normal NC memory gives us any ordering).
Jason Gunthorpe Nov. 27, 2023, 1:45 p.m. UTC | #14
On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote:

> > > What's the actual requirement here? Is this just for performance?
> > 
> > Yes, just performance.
> 
> Do you have any rough numbers (percentage)? It's highly
> microarchitecture-dependent until we get the ST64B instruction.

The current C code is an open coded store loop. The kernel does 250
tries and measures if any one of them succeeds to combine.

On x86, and older ARM cores we see that 100% of the time at least 1 in
250 tries succeeds.

With the new CPU cores we see more like 9 out of 10 time there are 0
in 250 tries that succeed. Ie we can go thousands of times without
seeing any successful WC combine.

The STP block brings it back to 100% of the time 1 in 250 succeed.

This is a statistical lower bound, based on what we see performance
wise it almost always works.

However, in userspace we have long been using ST4 to create a
single-instruction 64 byte store on ARM64. As far as I know this is
highly reliable. I don't have direct data on the STP configuration.

> More of a bike-shedding, I wonder whether the __iowrite*_copy()
> semantics are better suited for what you need in terms of ordering (not
> that mempcy_toio() to Normal NC memory gives us any ordering).

I have the same remark I gave to Niklas, this does not require
alignment or an exact 64 byte size. It was clearly made to support WC
stores since Pathscale did it, but I don't see this mapping nicely to
the future 64 byte store instructions are we getting.

We could name it __iowrite512_copy() if that makes more sense?

Thanks,
Jason
Niklas Schnelle Nov. 27, 2023, 5:43 p.m. UTC | #15
On Fri, 2023-11-24 at 12:06 -0400, Jason Gunthorpe wrote:
> On Fri, Nov 24, 2023 at 04:59:38PM +0100, Niklas Schnelle wrote:
>  
> > This should be as easy as adding
> > 
> > #define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64)
> > 
> > to arch/s390/include/asm/io.h. I'm wondering if we should do that as
> > part of this series. It's not as good as a special case but probably
> > better than the existing loop.
> 
> Makes sense

Ok, I overlooked the obvious. Let's make that:

#define memcpy_toio_64(dst, src)       zpci_write_block(dst, src, 64)

> 
> > I don't think we have any existing in-kernel users of memcpy_toio() on
> > s390 so far though so I'd like to give this some extra testing. Could
> > you share instructions on how to exercise the code path of patch 2 on a
> > ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA?
> 
> Simply boot and look at pr_debug from mlx5 to see if writecombining is
> on or off - you want to see on.
> 
> Thanks,
> Jason

With the above zpci_write_block(dst, src, 64) we get a PCI store block
without any extra alignment treatment i.e. exactly what we want for
memcpy_toio_64(). If the alignment is wrong the PCI store block
instruction will fail the PCI function will be isolated and we log an
error so I don't see a need for checks there either. On an aside it
looks like our zpci_memcpy_toio() is wrongly looking for tighter than 8
byte alignment on the source address and would issue a series of 8
stores. Still looking into that. I also tested this with our only
privileged (kernel only) PCI stores and that works too.

Also it turns out the writeq() loop we had so far does not produce the
needed 64 byte TLP on s390 either so this actually makes us newly pass
this test.

Thanks,
Niklas
Jason Gunthorpe Nov. 27, 2023, 5:51 p.m. UTC | #16
On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote:

> Also it turns out the writeq() loop we had so far does not produce the
> needed 64 byte TLP on s390 either so this actually makes us newly pass
> this test.

Ooh, that is a significant problem - the userspace code won't be used
unless this test passes. So we need this on S390 to fix a bug as well
:\

Thanks,
Jason
Niklas Schnelle Nov. 28, 2023, 4:28 p.m. UTC | #17
On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote:
> On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote:
> 
> > Also it turns out the writeq() loop we had so far does not produce the
> > needed 64 byte TLP on s390 either so this actually makes us newly pass
> > this test.
> 
> Ooh, that is a significant problem - the userspace code won't be used
> unless this test passes. So we need this on S390 to fix a bug as well
> :\
> 
> Thanks,
> Jason
> 

Yes ;-(

In the meantime I also found out that zpci_write_block(dst, src, 64) is
not correct for all cases because the PCI store block requires the
(pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to
be double word aligned. In rdma-core this is neatly handled by the
get_max_write_size() but the kernel variant of that
zpci_get_max_write_size() isn't just a lot harder to read and likely
less efficient but also too strict thus breaking the 64 byte write up
needlessly.

In total we have 5 conditions for the PCI block stores:

1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space
2. The length must not exceed the maximum write size
3. The length must be a multiple of 8
4. The src needs to be double word aligned
5. The dst needs to be double word aligned

So I think a good solution would be to improve zpci_memcpy_toio() with
an enhanced zpci_get_max_write_size() based on the code in rdma-core
extended to also handle the alignment and length restrictions which in
rdma-core are already assumed (see comment there). Then we can use
zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the
compiler to optimize out the unnecessary checks (2, 3 and possibly 4,
5).

So yeah this is getting a bit more  complicated than originally
thought. Let me cook up a patch.

Thanks,
Niklas
Catalin Marinas Dec. 4, 2023, 5:31 p.m. UTC | #18
On Mon, Nov 27, 2023 at 09:45:05AM -0400, Jason Gunthorpe wrote:
> On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote:
> > > > What's the actual requirement here? Is this just for performance?
> > > 
> > > Yes, just performance.
> > 
> > Do you have any rough numbers (percentage)? It's highly
> > microarchitecture-dependent until we get the ST64B instruction.
> 
> The current C code is an open coded store loop. The kernel does 250
> tries and measures if any one of them succeeds to combine.
> 
> On x86, and older ARM cores we see that 100% of the time at least 1 in
> 250 tries succeeds.
> 
> With the new CPU cores we see more like 9 out of 10 time there are 0
> in 250 tries that succeed. Ie we can go thousands of times without
> seeing any successful WC combine.
> 
> The STP block brings it back to 100% of the time 1 in 250 succeed.

That's a bit confusing to me: 1 in 250 succeeding is still pretty rare.
But I guess what your benchmark says is that at least 1 succeeded to
write-combine and it might as well be all 250 tries. It's more
interesting to see if there's actual performance gain in real-world
traffic, not just some artificial benchmark (I may have misunderstood
your numbers above).

> However, in userspace we have long been using ST4 to create a
> single-instruction 64 byte store on ARM64. As far as I know this is
> highly reliable. I don't have direct data on the STP configuration.

Personally I'd optimise the mempcy_toio() arm64 implementation to do
STPs if the alignment is right (like we do for classic memcpy()).
There's a slight overhead for alignment checking but I suspect it would
be lost as long as you can get the write-combining. Not sure whether the
interspersed reads in memcpy_toio() would somehow prevent the
write-combining.

A memcpy_toio_64() can use the new ST64B instruction if available or
fall back to memcpy_toio() on arm64. It should also have the DGH
instruction (io_stop_wc()) but only if falling back to classic
memcpy_toio(). We don't need DGH with ST64B.

> > More of a bike-shedding, I wonder whether the __iowrite*_copy()
> > semantics are better suited for what you need in terms of ordering (not
> > that mempcy_toio() to Normal NC memory gives us any ordering).
> 
> I have the same remark I gave to Niklas, this does not require
> alignment or an exact 64 byte size. It was clearly made to support WC
> stores since Pathscale did it, but I don't see this mapping nicely to
> the future 64 byte store instructions are we getting.

As above, I'd suggest just using memcpy_toio() as a fallback if ST64B is
not available.

> We could name it __iowrite512_copy() if that makes more sense?

I've been thinking at the __iowrite*_copy() and these also take a
'count' argument. I assume in this instance we don't really need one, so
it's just additional overhead (more like API clutter, I doubt it makes
much difference for performance). I'd say just stick to the
mempcy_toio_64() but have the io_stop_wc() inside this function as we
won't need it with ST64B.

Well, unless someone has a better name for this function.
Jason Gunthorpe Dec. 4, 2023, 6:23 p.m. UTC | #19
On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote:
> On Mon, Nov 27, 2023 at 09:45:05AM -0400, Jason Gunthorpe wrote:
> > On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote:
> > > > > What's the actual requirement here? Is this just for performance?
> > > > 
> > > > Yes, just performance.
> > > 
> > > Do you have any rough numbers (percentage)? It's highly
> > > microarchitecture-dependent until we get the ST64B instruction.
> > 
> > The current C code is an open coded store loop. The kernel does 250
> > tries and measures if any one of them succeeds to combine.
> > 
> > On x86, and older ARM cores we see that 100% of the time at least 1 in
> > 250 tries succeeds.
> > 
> > With the new CPU cores we see more like 9 out of 10 time there are 0
> > in 250 tries that succeed. Ie we can go thousands of times without
> > seeing any successful WC combine.
> > 
> > The STP block brings it back to 100% of the time 1 in 250 succeed.
> 
> That's a bit confusing to me: 1 in 250 succeeding is still pretty rare.
> But I guess what your benchmark says is that at least 1 succeeded to
> write-combine and it might as well be all 250 tries. It's more
> interesting to see if there's actual performance gain in real-world
> traffic, not just some artificial benchmark (I may have misunderstood
> your numbers above).

Yes, I just don't have better data available to say that 250/250
succeeded, but we expect that is the case.

We have now something like 20 years experiance with write combining
performance on x86 systems. It brings real world gains in real word
HPC applications.

Indeed, the reason this even came up was because one of our existing
applications was performing unexpectedly badly on these ARM64 servers.

We even have data showing that having the CPU do all the write
combining steps and then fail to get writecombining is notably slower
than just assuming no write combining. It is why we go through the
trouble to test the physical HW.

> > However, in userspace we have long been using ST4 to create a
> > single-instruction 64 byte store on ARM64. As far as I know this is
> > highly reliable. I don't have direct data on the STP configuration.
> 
> Personally I'd optimise the mempcy_toio() arm64 implementation to do
> STPs if the alignment is right (like we do for classic memcpy()).
> There's a slight overhead for alignment checking but I suspect it would
> be lost as long as you can get the write-combining. Not sure whether the
> interspersed reads in memcpy_toio() would somehow prevent the
> write-combining.

I understand on these new CPUs anything other than a block of
contiguous STPs is risky to break the WC. I was told we should not
have any loads between them.

So we can't just update memcpy_toio to optimize a 128 bit store
variant like memcpy might. We actually need a special case just for 64
byte.

IMHO it does not look good as the chance any existing callers can use
this optmized 64B path is probably small, but everyone has to pay the
costs to check for it.

I also would not do this on x86 - Pathscale apparently decided the
needed special __iowrite*_copy() things to actually make this work on
xome x86 systems - I'm very leary to change x86 stuff away from the 64
bit copy loopw we know works already on x86.

IMHO encoding the alignment expectation in the API is best, especially
since this is typically a performance path.

> A memcpy_toio_64() can use the new ST64B instruction if available or
> fall back to memcpy_toio() on arm64. It should also have the DGH
> instruction (io_stop_wc()) but only if falling back to classic
> memcpy_toio(). We don't need DGH with ST64B.

I'm told it is problematic, something about ST64B not working with
NORMAL_NC.

We could fold the DGH into the helper though. IHMO I'd like to see how
ST64B actually gets implemented before doing that. If the note about
the NORMAL_NC is true then we need a lot more infrastructure to
actually use it.

Also in a future ST64B world we are going to see HW start relying on
large TLPs, not just being an optional performance win. To my mind it
makes more sense that there is an API that guarantees a large TLP or
oops. We really don't want an automatic fallback to memcpy.

Jason
Catalin Marinas Dec. 5, 2023, 5:21 p.m. UTC | #20
On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote:
> On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote:
> > Personally I'd optimise the mempcy_toio() arm64 implementation to do
> > STPs if the alignment is right (like we do for classic memcpy()).
> > There's a slight overhead for alignment checking but I suspect it would
> > be lost as long as you can get the write-combining. Not sure whether the
> > interspersed reads in memcpy_toio() would somehow prevent the
> > write-combining.
> 
> I understand on these new CPUs anything other than a block of
> contiguous STPs is risky to break the WC. I was told we should not
> have any loads between them.

Classic memcpy does similar tricks with four LDPs in a row before
starting to issue the STPs (though there are new LDPs for the next
data in-between). But that was tuned for cacheable memory, not sure
if something similar would behave well on Normal-NC memory.

> So we can't just update memcpy_toio to optimize a 128 bit store
> variant like memcpy might. We actually need a special case just for 64
> byte.
> 
> IMHO it does not look good as the chance any existing callers can use
> this optmized 64B path is probably small, but everyone has to pay the
> costs to check for it.

I don't think the cost of the check is noticeable and there are several
places where the copy goes beyond 64 bytes. It may be worth a try.

> I also would not do this on x86 - Pathscale apparently decided the
> needed special __iowrite*_copy() things to actually make this work on
> xome x86 systems - I'm very leary to change x86 stuff away from the 64
> bit copy loopw we know works already on x86.
> 
> IMHO encoding the alignment expectation in the API is best, especially
> since this is typically a performance path.

The slight downside of a __iowrite512_copy() API is that, if we follow
the 32/64 semantics, it would need the source buffer aligned. Maybe we
can document it to 64-bit alignment only rather than 512.

> > A memcpy_toio_64() can use the new ST64B instruction if available or
> > fall back to memcpy_toio() on arm64. It should also have the DGH
> > instruction (io_stop_wc()) but only if falling back to classic
> > memcpy_toio(). We don't need DGH with ST64B.
> 
> I'm told it is problematic, something about ST64B not working with
> NORMAL_NC.

Last time I checked it was meant to work on Normal-NC (not cacheable
though). That's on page 285 of the Arm ARM J.a.

> Also in a future ST64B world we are going to see HW start relying on
> large TLPs, not just being an optional performance win. To my mind it
> makes more sense that there is an API that guarantees a large TLP or
> oops. We really don't want an automatic fallback to memcpy.

We can't guarantee those large TLPs without the ST64B instructions, so
it needs to be more of a QoS aspect than correctness.
Jason Gunthorpe Dec. 5, 2023, 5:51 p.m. UTC | #21
On Tue, Dec 05, 2023 at 05:21:27PM +0000, Catalin Marinas wrote:
> On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote:
> > On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote:
> > > Personally I'd optimise the mempcy_toio() arm64 implementation to do
> > > STPs if the alignment is right (like we do for classic memcpy()).
> > > There's a slight overhead for alignment checking but I suspect it would
> > > be lost as long as you can get the write-combining. Not sure whether the
> > > interspersed reads in memcpy_toio() would somehow prevent the
> > > write-combining.
> > 
> > I understand on these new CPUs anything other than a block of
> > contiguous STPs is risky to break the WC. I was told we should not
> > have any loads between them.
> 
> Classic memcpy does similar tricks with four LDPs in a row before
> starting to issue the STPs (though there are new LDPs for the next
> data in-between). But that was tuned for cacheable memory, not sure
> if something similar would behave well on Normal-NC memory.

Can we conclude a direction here?

1) I don't want to mess with x86 so we keep a dedicated API
   Are we agreed to call it __iowrite512_copy() and note its special
   alignment limitation?

2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and
   implement some quad STP optimization for this case?

3) A future ST64B and the x86 version would be put under
   __iowrite512_copy()?

4) A future ST64B would come with some kind of 'must do 64b copy or
   oops' to support the future HW that must have this instruction? eg
   we already see on Intel that HW must use ENQCMD and nothing else.

Agreed?

Niklas, is this OK for S390 too?

> > I'm told it is problematic, something about ST64B not working with
> > NORMAL_NC.
> 
> Last time I checked it was meant to work on Normal-NC (not cacheable
> though). That's on page 285 of the Arm ARM J.a.

This is a relief to hear!

Thanks,
Jason
Catalin Marinas Dec. 5, 2023, 7:34 p.m. UTC | #22
On Tue, Dec 05, 2023 at 01:51:27PM -0400, Jason Gunthorpe wrote:
> On Tue, Dec 05, 2023 at 05:21:27PM +0000, Catalin Marinas wrote:
> > On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote:
> > > On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote:
> > > > Personally I'd optimise the mempcy_toio() arm64 implementation to do
> > > > STPs if the alignment is right (like we do for classic memcpy()).
> > > > There's a slight overhead for alignment checking but I suspect it would
> > > > be lost as long as you can get the write-combining. Not sure whether the
> > > > interspersed reads in memcpy_toio() would somehow prevent the
> > > > write-combining.
> > > 
> > > I understand on these new CPUs anything other than a block of
> > > contiguous STPs is risky to break the WC. I was told we should not
> > > have any loads between them.
> > 
> > Classic memcpy does similar tricks with four LDPs in a row before
> > starting to issue the STPs (though there are new LDPs for the next
> > data in-between). But that was tuned for cacheable memory, not sure
> > if something similar would behave well on Normal-NC memory.
> 
> Can we conclude a direction here?
> 
> 1) I don't want to mess with x86 so we keep a dedicated API
>    Are we agreed to call it __iowrite512_copy() and note its special
>    alignment limitation?

Sounds fine to me.

> 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and
>    implement some quad STP optimization for this case?

We can have the generic __iowrite512_copy() do memcpy_toio() and have
the arm64 implement an optimised version.

What I'm not entirely sure of is the DGH (whatever the io_* barrier name
is). I'd put it in the same __iowrite512_copy() function and remove it
from the driver code. Otherwise when ST64B is added, we have an
unnecessary DGH in the driver. If this does not match the other
__iowrite*_copy() semantics, we can come up with another name. But start
with this for now and document the function.

> 3) A future ST64B and the x86 version would be put under
>    __iowrite512_copy()?

Yes, arch-specific override.

> 4) A future ST64B would come with some kind of 'must do 64b copy or
>    oops' to support the future HW that must have this instruction? eg
>    we already see on Intel that HW must use ENQCMD and nothing else.

I don't agree with the oops part. We can't guarantee it on arm64, ST64B
I think is optional in the architecture. If you do need such guarantees,
we'd need the driver to probe for the feature (e.g. arch_has_...()) and
invoke a new macro. You can't have the __iowrite512_copy() that worked
fine suddenly giving an error just because some driver wants a
guaranteed atomic 64 byte write.
Jason Gunthorpe Dec. 5, 2023, 7:51 p.m. UTC | #23
On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote:

> > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and
> >    implement some quad STP optimization for this case?
> 
> We can have the generic __iowrite512_copy() do memcpy_toio() and have
> the arm64 implement an optimised version.
> 
> What I'm not entirely sure of is the DGH (whatever the io_* barrier name
> is). I'd put it in the same __iowrite512_copy() function and remove it
> from the driver code. Otherwise when ST64B is added, we have an
> unnecessary DGH in the driver. If this does not match the other
> __iowrite*_copy() semantics, we can come up with another name. But start
> with this for now and document the function.

I think the iowrite is only used for WC and the DGH is functionally
harmless for non-WC, so it makes sense.

In this case we should just remove the DGH macro from the generic
architecture code and tell people to use iowrite - since we now
understand that callers basically have to in order to use DGH on new
ARM CPUs.

> > 3) A future ST64B and the x86 version would be put under
> >    __iowrite512_copy()?
> 
> Yes, arch-specific override.
> 
> > 4) A future ST64B would come with some kind of 'must do 64b copy or
> >    oops' to support the future HW that must have this instruction? eg
> >    we already see on Intel that HW must use ENQCMD and nothing else.
> 
> I don't agree with the oops part. We can't guarantee it on arm64, ST64B
> I think is optional in the architecture. If you do need such guarantees,
> we'd need the driver to probe for the feature (e.g. arch_has_...()) and
> invoke a new macro. 

Yes, exactly. The driver must check. The new macro should oops if it
is invoked wrong, the same way enqcmd will oops if invoked wrong on
Intel.

Jason
Catalin Marinas Dec. 6, 2023, 11:09 a.m. UTC | #24
On Tue, Dec 05, 2023 at 03:51:30PM -0400, Jason Gunthorpe wrote:
> On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote:
> > > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and
> > >    implement some quad STP optimization for this case?
> > 
> > We can have the generic __iowrite512_copy() do memcpy_toio() and have
> > the arm64 implement an optimised version.
> > 
> > What I'm not entirely sure of is the DGH (whatever the io_* barrier name
> > is). I'd put it in the same __iowrite512_copy() function and remove it
> > from the driver code. Otherwise when ST64B is added, we have an
> > unnecessary DGH in the driver. If this does not match the other
> > __iowrite*_copy() semantics, we can come up with another name. But start
> > with this for now and document the function.
> 
> I think the iowrite is only used for WC and the DGH is functionally
> harmless for non-WC, so it makes sense.
> 
> In this case we should just remove the DGH macro from the generic
> architecture code and tell people to use iowrite - since we now
> understand that callers basically have to in order to use DGH on new
> ARM CPUs.

That works for me but what would the semantics be for __iowrite64_copy()
for example? Is there a DGH at the end of the whole write or after each
iteration? I'd go with the former since e.g. hns3_tx_push_bd() does
that (and doesn't seem to be a 64 byte copy). Similarly for
__iowrite512_copy(), if you want the DGH after each iteration you should
only pass a count of 1.
Jason Gunthorpe Dec. 6, 2023, 12:59 p.m. UTC | #25
On Wed, Dec 06, 2023 at 11:09:18AM +0000, Catalin Marinas wrote:
> On Tue, Dec 05, 2023 at 03:51:30PM -0400, Jason Gunthorpe wrote:
> > On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote:
> > > > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and
> > > >    implement some quad STP optimization for this case?
> > > 
> > > We can have the generic __iowrite512_copy() do memcpy_toio() and have
> > > the arm64 implement an optimised version.
> > > 
> > > What I'm not entirely sure of is the DGH (whatever the io_* barrier name
> > > is). I'd put it in the same __iowrite512_copy() function and remove it
> > > from the driver code. Otherwise when ST64B is added, we have an
> > > unnecessary DGH in the driver. If this does not match the other
> > > __iowrite*_copy() semantics, we can come up with another name. But start
> > > with this for now and document the function.
> > 
> > I think the iowrite is only used for WC and the DGH is functionally
> > harmless for non-WC, so it makes sense.
> > 
> > In this case we should just remove the DGH macro from the generic
> > architecture code and tell people to use iowrite - since we now
> > understand that callers basically have to in order to use DGH on new
> > ARM CPUs.
> 
> That works for me but what would the semantics be for __iowrite64_copy()
> for example? Is there a DGH at the end of the whole write or after each
> iteration?

End of the iowrite_copy function call. The purpose of DGH is to reduce
latency through write combining buffers by providing a hint to the HW
to close them. __iowrite64_copy can be reasonably thought of as trying
to push the argument into a single TLP.

> I'd go with the former since e.g. hns3_tx_push_bd() does
> that (and doesn't seem to be a 64 byte copy).

sizeof(struct hns3_desc) == 32, HNS3_MAX_PUSH_BD_NUM == 2, so it is 64
bytes.

Indeed, I already know this HW and it functions similar to mlx5. In
userspace it uses the ST4 instruction, in fact HNS was the team that
did that change citing measured improvements on their SOC. Changing
this to be the STP block will likely be an improvement.

Jason
Jason Gunthorpe Jan. 16, 2024, 5:33 p.m. UTC | #26
On Tue, Nov 28, 2023 at 05:28:36PM +0100, Niklas Schnelle wrote:
> On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote:
> > On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote:
> > 
> > > Also it turns out the writeq() loop we had so far does not produce the
> > > needed 64 byte TLP on s390 either so this actually makes us newly pass
> > > this test.
> > 
> > Ooh, that is a significant problem - the userspace code won't be used
> > unless this test passes. So we need this on S390 to fix a bug as well
> > :\
> 
> Yes ;-(
> 
> In the meantime I also found out that zpci_write_block(dst, src, 64) is
> not correct for all cases because the PCI store block requires the
> (pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to
> be double word aligned. In rdma-core this is neatly handled by the
> get_max_write_size() but the kernel variant of that
> zpci_get_max_write_size() isn't just a lot harder to read and likely
> less efficient but also too strict thus breaking the 64 byte write up
> needlessly.
> 
> In total we have 5 conditions for the PCI block stores:
> 
> 1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space
> 2. The length must not exceed the maximum write size
> 3. The length must be a multiple of 8
> 4. The src needs to be double word aligned
> 5. The dst needs to be double word aligned
> 
> So I think a good solution would be to improve zpci_memcpy_toio() with
> an enhanced zpci_get_max_write_size() based on the code in rdma-core
> extended to also handle the alignment and length restrictions which in
> rdma-core are already assumed (see comment there). Then we can use
> zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the
> compiler to optimize out the unnecessary checks (2, 3 and possibly 4,
> 5).
> 
> So yeah this is getting a bit more  complicated than originally
> thought. Let me cook up a patch.

Did you come up with something?

Jason
Jason Gunthorpe Jan. 16, 2024, 6:51 p.m. UTC | #27
Hey Catalin,

I'm just revising this and I'm wondering if you know why ARM64 has this:

#define __raw_writeq __raw_writeq
static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
}

Instead of

#define __raw_writeq __raw_writeq
static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
}

?? Like x86 has.

The codegen for a 64 byte unrolled copy loop is way better with "m" on gcc:

"r" constraint (gcc 13.2.0):

.L3:
        ldr     x3, [x1]
        str x3, [x0]
        ldr     x3, [x1, 8]
        add     x4, x0, 8
        str x3, [x4]
        ldr     x3, [x1, 16]
        add     x4, x0, 16
        str x3, [x4]
        ldr     x3, [x1, 24]
        add     x4, x0, 24
        str x3, [x4]
        ldr     x3, [x1, 32]
        add     x4, x0, 32
        str x3, [x4]
        ldr     x3, [x1, 40]
        add     x4, x0, 40
        str x3, [x4]
        ldr     x3, [x1, 48]
        add     x4, x0, 48
        str x3, [x4]
        ldr     x3, [x1, 56]
        add     x4, x0, 56
        str x3, [x4]
        add     x1, x1, 64
        add     x0, x0, 64
        cmp     x2, x1
        bhi     .L3

"m" constraint:

.L3:
        ldp     x10, x9, [x1]
        ldp     x8, x7, [x1, 16]
        ldp     x6, x5, [x1, 32]
        ldp     x4, x3, [x1, 48]
        str x10, [x0]
        str x9, [x0, 8]
        str x8, [x0, 16]
        str x7, [x0, 24]
        str x6, [x0, 32]
        str x5, [x0, 40]
        str x4, [x0, 48]
        str x3, [x0, 56]
        add     x1, x1, 64
        add     x0, x0, 64
        cmp     x2, x1
        bhi     .L3

clang 17 doesn't do any better either way, it doesn't seem to do
anything with 'm', but I guess it could..

clang 17 (either):

.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        ldp     x9, x10, [x1]
        add     x14, x0, #8
        add     x18, x0, #40
        ldp     x11, x12, [x1, #16]
        add     x2, x0, #48
        add     x3, x0, #56
        ldp     x13, x15, [x1, #32]
        ldp     x16, x17, [x1, #48]
        str     x9, [x0]
        str     x10, [x14]
        add     x9, x0, #16
        add     x10, x0, #24
        add     x14, x0, #32
        str     x11, [x9]
        str     x12, [x10]
        str     x13, [x14]
        str     x15, [x18]
        str     x16, [x2]
        str     x17, [x3]
        add     x1, x1, #64
        add     x0, x0, #64
        cmp     x1, x8
        b.lo    .LBB0_2

It doesn't matter for this series, but it seems like something ARM64
might want to look at to improve..

Jason
Mark Rutland Jan. 17, 2024, 12:30 p.m. UTC | #28
On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> Hey Catalin,
> 
> I'm just revising this and I'm wondering if you know why ARM64 has this:
> 
> #define __raw_writeq __raw_writeq
> static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> {
> 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> }
> 
> Instead of
> 
> #define __raw_writeq __raw_writeq
> static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> {
> 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> }
> 
> ?? Like x86 has.

I believe this is for the same reason as doing so in all of our other IO
accessors.

We've deliberately ensured that our IO accessors use a single base register
with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
when reporting a stage-2 abort, which a hypervisor may use for emulating IO.

Mark.

> 
> The codegen for a 64 byte unrolled copy loop is way better with "m" on gcc:
> 
> "r" constraint (gcc 13.2.0):
> 
> .L3:
>         ldr     x3, [x1]
>         str x3, [x0]
>         ldr     x3, [x1, 8]
>         add     x4, x0, 8
>         str x3, [x4]
>         ldr     x3, [x1, 16]
>         add     x4, x0, 16
>         str x3, [x4]
>         ldr     x3, [x1, 24]
>         add     x4, x0, 24
>         str x3, [x4]
>         ldr     x3, [x1, 32]
>         add     x4, x0, 32
>         str x3, [x4]
>         ldr     x3, [x1, 40]
>         add     x4, x0, 40
>         str x3, [x4]
>         ldr     x3, [x1, 48]
>         add     x4, x0, 48
>         str x3, [x4]
>         ldr     x3, [x1, 56]
>         add     x4, x0, 56
>         str x3, [x4]
>         add     x1, x1, 64
>         add     x0, x0, 64
>         cmp     x2, x1
>         bhi     .L3
> 
> "m" constraint:
> 
> .L3:
>         ldp     x10, x9, [x1]
>         ldp     x8, x7, [x1, 16]
>         ldp     x6, x5, [x1, 32]
>         ldp     x4, x3, [x1, 48]
>         str x10, [x0]
>         str x9, [x0, 8]
>         str x8, [x0, 16]
>         str x7, [x0, 24]
>         str x6, [x0, 32]
>         str x5, [x0, 40]
>         str x4, [x0, 48]
>         str x3, [x0, 56]
>         add     x1, x1, 64
>         add     x0, x0, 64
>         cmp     x2, x1
>         bhi     .L3
> 
> clang 17 doesn't do any better either way, it doesn't seem to do
> anything with 'm', but I guess it could..
> 
> clang 17 (either):
> 
> .LBB0_2:                                // =>This Inner Loop Header: Depth=1
>         ldp     x9, x10, [x1]
>         add     x14, x0, #8
>         add     x18, x0, #40
>         ldp     x11, x12, [x1, #16]
>         add     x2, x0, #48
>         add     x3, x0, #56
>         ldp     x13, x15, [x1, #32]
>         ldp     x16, x17, [x1, #48]
>         str     x9, [x0]
>         str     x10, [x14]
>         add     x9, x0, #16
>         add     x10, x0, #24
>         add     x14, x0, #32
>         str     x11, [x9]
>         str     x12, [x10]
>         str     x13, [x14]
>         str     x15, [x18]
>         str     x16, [x2]
>         str     x17, [x3]
>         add     x1, x1, #64
>         add     x0, x0, #64
>         cmp     x1, x8
>         b.lo    .LBB0_2
> 
> It doesn't matter for this series, but it seems like something ARM64
> might want to look at to improve..
> 
> Jason
>
Jason Gunthorpe Jan. 17, 2024, 12:36 p.m. UTC | #29
On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > Hey Catalin,
> > 
> > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > 
> > #define __raw_writeq __raw_writeq
> > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > {
> > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > }
> > 
> > Instead of
> > 
> > #define __raw_writeq __raw_writeq
> > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > {
> > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > }
> > 
> > ?? Like x86 has.
> 
> I believe this is for the same reason as doing so in all of our other IO
> accessors.
> 
> We've deliberately ensured that our IO accessors use a single base register
> with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> when reporting a stage-2 abort, which a hypervisor may use for
> emulating IO.

Wow, harming bare metal performace to accommodate imperfect emulation
sounds like a horrible reason :(

So what happens with this patch where IO is done with STP? Are you
going to tell me I can't do it because of this?

Jason
Jason Gunthorpe Jan. 17, 2024, 12:41 p.m. UTC | #30
On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > Hey Catalin,
> > > 
> > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > 
> > > #define __raw_writeq __raw_writeq
> > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > {
> > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > }
> > > 
> > > Instead of
> > > 
> > > #define __raw_writeq __raw_writeq
> > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > {
> > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > }
> > > 
> > > ?? Like x86 has.
> > 
> > I believe this is for the same reason as doing so in all of our other IO
> > accessors.
> > 
> > We've deliberately ensured that our IO accessors use a single base register
> > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > when reporting a stage-2 abort, which a hypervisor may use for
> > emulating IO.
> 
> Wow, harming bare metal performace to accommodate imperfect emulation
> sounds like a horrible reason :(
> 
> So what happens with this patch where IO is done with STP? Are you
> going to tell me I can't do it because of this?

I should also point out that userspace IO doesn't follow such a
restriction since nobody ever knew, and things like RDMA stack and
DPDK use ST4 and probably other non-trivial instructions for IO from
userspace.

I'm fearful you saying today's hypervisors don't work right on ARM if
IO is trapped, which does happen for some legimiate reasons and a few
illegitimate ones??

Jason
Niklas Schnelle Jan. 17, 2024, 1:20 p.m. UTC | #31
On Tue, 2024-01-16 at 13:33 -0400, Jason Gunthorpe wrote:
> On Tue, Nov 28, 2023 at 05:28:36PM +0100, Niklas Schnelle wrote:
> > On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote:
> > > On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote:
> > > 
> > > > Also it turns out the writeq() loop we had so far does not produce the
> > > > needed 64 byte TLP on s390 either so this actually makes us newly pass
> > > > this test.
> > > 
> > > Ooh, that is a significant problem - the userspace code won't be used
> > > unless this test passes. So we need this on S390 to fix a bug as well
> > > :\
> > 
> > Yes ;-(
> > 
> > In the meantime I also found out that zpci_write_block(dst, src, 64) is
> > not correct for all cases because the PCI store block requires the
> > (pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to
> > be double word aligned. In rdma-core this is neatly handled by the
> > get_max_write_size() but the kernel variant of that
> > zpci_get_max_write_size() isn't just a lot harder to read and likely
> > less efficient but also too strict thus breaking the 64 byte write up
> > needlessly.
> > 
> > In total we have 5 conditions for the PCI block stores:
> > 
> > 1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space
> > 2. The length must not exceed the maximum write size
> > 3. The length must be a multiple of 8
> > 4. The src needs to be double word aligned
> > 5. The dst needs to be double word aligned
> > 
> > So I think a good solution would be to improve zpci_memcpy_toio() with
> > an enhanced zpci_get_max_write_size() based on the code in rdma-core
> > extended to also handle the alignment and length restrictions which in
> > rdma-core are already assumed (see comment there). Then we can use
> > zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the
> > compiler to optimize out the unnecessary checks (2, 3 and possibly 4,
> > 5).
> > 
> > So yeah this is getting a bit more  complicated than originally
> > thought. Let me cook up a patch.
> 
> Did you come up with something?
> 
> Jason

Hi Jason,

Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio()
titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that
I tested with this series plus the following define added to
arch/s390/include/asm/io.h:

#define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64)

It's already in the s390 tree's feature branch and linux-next.

Thanks,
Niklas
Jason Gunthorpe Jan. 17, 2024, 1:26 p.m. UTC | #32
On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote:
> Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio()
> titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that
> I tested with this series plus the following define added to
> arch/s390/include/asm/io.h:
> 
> #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64)
> 
> It's already in the s390 tree's feature branch and linux-next.

Great!

Jason
Mark Rutland Jan. 17, 2024, 1:29 p.m. UTC | #33
On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > Hey Catalin,
> > > 
> > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > 
> > > #define __raw_writeq __raw_writeq
> > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > {
> > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > }
> > > 
> > > Instead of
> > > 
> > > #define __raw_writeq __raw_writeq
> > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > {
> > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > }
> > > 
> > > ?? Like x86 has.
> > 
> > I believe this is for the same reason as doing so in all of our other IO
> > accessors.
> > 
> > We've deliberately ensured that our IO accessors use a single base register
> > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > when reporting a stage-2 abort, which a hypervisor may use for
> > emulating IO.
> 
> Wow, harming bare metal performace to accommodate imperfect emulation
> sounds like a horrible reason :(

Having working functionality everywhere is a very good reason. :)

> So what happens with this patch where IO is done with STP? Are you
> going to tell me I can't do it because of this?

I'm not personally going to make that judgement, but it's certainly something
for Catalin and Will to consider (and I've added Marc in case he has any
opinion).

Mark.
Mark Rutland Jan. 17, 2024, 2:07 p.m. UTC | #34
On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > Hey Catalin,
> > 
> > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > 
> > #define __raw_writeq __raw_writeq
> > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > {
> > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > }
> > 
> > Instead of
> > 
> > #define __raw_writeq __raw_writeq
> > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > {
> > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > }
> > 
> > ?? Like x86 has.
> 
> I believe this is for the same reason as doing so in all of our other IO
> accessors.
> 
> We've deliberately ensured that our IO accessors use a single base register
> with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> when reporting a stage-2 abort, which a hypervisor may use for emulating IO.

FWIW, IIUC the immediate-offset forms *without* writeback can still be reported
usefully in ESR_ELx, so I believe that we could use the "o" constraint for the
__raw_write*() functions, e.g.

static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
	asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr));
}

However, the __raw_read*() functions would still need to use "r" due to
ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE.

Mark.
Jason Gunthorpe Jan. 17, 2024, 3:28 p.m. UTC | #35
On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote:

> > I believe this is for the same reason as doing so in all of our other IO
> > accessors.
> > 
> > We've deliberately ensured that our IO accessors use a single base register
> > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > when reporting a stage-2 abort, which a hypervisor may use for emulating IO.
> 
> FWIW, IIUC the immediate-offset forms *without* writeback can still be reported
> usefully in ESR_ELx, so I believe that we could use the "o" constraint for the
> __raw_write*() functions, e.g.
> 
> static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> {
> 	asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr));
> }

"o" works well in the same simple memcpy loop:

        add     x2, x1, w2, uxtw 3
        cmp     x1, x2
        bcs     .L1
.L3:
        ldp     x10, x9, [x1]
        ldp     x8, x7, [x1, 16]
        ldp     x6, x5, [x1, 32]
        ldp     x4, x3, [x1, 48]
        str x10, [x0]
        str x9, [x0, 8]
        str x8, [x0, 16]
        str x7, [x0, 24]
        str x6, [x0, 32]
        str x5, [x0, 40]
        str x4, [x0, 48]
        str x3, [x0, 56]
        add     x1, x1, 64
        add     x0, x0, 64
        cmp     x2, x1
        bhi     .L3
.L1:
        ret

Seems intersting to pursue?

Jason
Will Deacon Jan. 17, 2024, 4:05 p.m. UTC | #36
On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote:
> 
> > > I believe this is for the same reason as doing so in all of our other IO
> > > accessors.
> > > 
> > > We've deliberately ensured that our IO accessors use a single base register
> > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO.
> > 
> > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported
> > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the
> > __raw_write*() functions, e.g.
> > 
> > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > {
> > 	asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr));
> > }
> 
> "o" works well in the same simple memcpy loop:
> 
>         add     x2, x1, w2, uxtw 3
>         cmp     x1, x2
>         bcs     .L1
> .L3:
>         ldp     x10, x9, [x1]
>         ldp     x8, x7, [x1, 16]
>         ldp     x6, x5, [x1, 32]
>         ldp     x4, x3, [x1, 48]
>         str x10, [x0]
>         str x9, [x0, 8]
>         str x8, [x0, 16]
>         str x7, [x0, 24]
>         str x6, [x0, 32]
>         str x5, [x0, 40]
>         str x4, [x0, 48]
>         str x3, [x0, 56]
>         add     x1, x1, 64
>         add     x0, x0, 64
>         cmp     x2, x1
>         bhi     .L3
> .L1:
>         ret
> 
> Seems intersting to pursue?

I've seen the compiler struggle with plain "o" in the past ("Impossible
constraint in asm") so we might want "Qo" if we go down this route.

Will
Jason Gunthorpe Jan. 17, 2024, 5:55 p.m. UTC | #37
On Wed, Jan 17, 2024 at 09:26:13AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote:
> > Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio()
> > titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that
> > I tested with this series plus the following define added to
> > arch/s390/include/asm/io.h:
> > 
> > #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64)
> > 
> > It's already in the s390 tree's feature branch and linux-next.
>
> Great!

Is this wrong too?

/* combine single writes by using store-block insn */
void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
{
       zpci_memcpy_toio(to, from, count);
}

 * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units
 * @to: destination, in MMIO space (must be 64-bit aligned)
 * @from: source (must be 64-bit aligned)
 * @count: number of 64-bit quantities to copy
                ^^^^^^^^^^^^^^^^^^^^^^^^^

Ie it should be

       zpci_memcpy_toio(to, from, count * 8);

Right?

(I'll fix it)

Jason
Niklas Schnelle Jan. 18, 2024, 1:46 p.m. UTC | #38
On Wed, 2024-01-17 at 13:55 -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 09:26:13AM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote:
> > > Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio()
> > > titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that
> > > I tested with this series plus the following define added to
> > > arch/s390/include/asm/io.h:
> > > 
> > > #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64)
> > > 
> > > It's already in the s390 tree's feature branch and linux-next.
> > 
> > Great!
> 
> Is this wrong too?
> 
> /* combine single writes by using store-block insn */
> void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
> {
>        zpci_memcpy_toio(to, from, count);
> }
> 
>  * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units
>  * @to: destination, in MMIO space (must be 64-bit aligned)
>  * @from: source (must be 64-bit aligned)
>  * @count: number of 64-bit quantities to copy
>                 ^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Ie it should be
> 
>        zpci_memcpy_toio(to, from, count * 8);
> 
> Right?
> 
> (I'll fix it)
> 
> Jason

Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
in bytes like for memcpy_toio().

Thanks,
Niklas
Jason Gunthorpe Jan. 18, 2024, 2 p.m. UTC | #39
On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:

> Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> in bytes like for memcpy_toio().

https://github.com/jgunthorpe/linux/commits/mlx5_wc/

Jason
Niklas Schnelle Jan. 18, 2024, 3:59 p.m. UTC | #40
On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> 
> > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> > in bytes like for memcpy_toio().
> 
> https://github.com/jgunthorpe/linux/commits/mlx5_wc/
> 
> Jason
> 

Thanks, the s390 patches:


s390: Implement __iowrite32_copy() 
s390: Use the correct count for __iowrite64_copy() 
s390: Stop using weak symbols for __iowrite64_copy() 

Look good to me. I.e. you may add my.

Acked-by Niklas Schnelle <schnelle@linux.ibm.com>

I did test your patches too and by accident confirmed again that these
do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
driver to detect Write-Combining as supported. Note, as far as I know
Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
a few changes for v6.8-rc1.

Thanks,
Niklas
Jason Gunthorpe Jan. 18, 2024, 4:18 p.m. UTC | #41
On Wed, Jan 17, 2024 at 04:05:29PM +0000, Will Deacon wrote:
> On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote:
> > 
> > > > I believe this is for the same reason as doing so in all of our other IO
> > > > accessors.
> > > > 
> > > > We've deliberately ensured that our IO accessors use a single base register
> > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO.
> > > 
> > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported
> > > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the
> > > __raw_write*() functions, e.g.
> > > 
> > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > {
> > > 	asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr));
> > > }
> > 
> > "o" works well in the same simple memcpy loop:
> > 
> >         add     x2, x1, w2, uxtw 3
> >         cmp     x1, x2
> >         bcs     .L1
> > .L3:
> >         ldp     x10, x9, [x1]
> >         ldp     x8, x7, [x1, 16]
> >         ldp     x6, x5, [x1, 32]
> >         ldp     x4, x3, [x1, 48]
> >         str x10, [x0]
> >         str x9, [x0, 8]
> >         str x8, [x0, 16]
> >         str x7, [x0, 24]
> >         str x6, [x0, 32]
> >         str x5, [x0, 40]
> >         str x4, [x0, 48]
> >         str x3, [x0, 56]
> >         add     x1, x1, 64
> >         add     x0, x0, 64
> >         cmp     x2, x1
> >         bhi     .L3
> > .L1:
> >         ret
> > 
> > Seems intersting to pursue?
> 
> I've seen the compiler struggle with plain "o" in the past ("Impossible
> constraint in asm") so we might want "Qo" if we go down this route.

I'll stick a patch in 0-day and lets see if there are explosions. "Qo"
generates the same assembly.

So to summarize:
 - We don't like "m" because something about virtualization
   traps breaks with post/pre indexed forms like:
     str x1, [x0, 8]!
   And "m" will allow the compiler to emit that. 
 - o selects only base register plus offset so it is OK
 - Q allows base register only (no offset) on some compilers that
   won't allow o for 0 offset
 - read side stays at 'r' due to an alternates errata workaround
   requiring ldar which doesn't accept the same effective address
   as ldr.

Jason
Jason Gunthorpe Jan. 18, 2024, 4:21 p.m. UTC | #42
On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote:
> On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> > 
> > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> > > in bytes like for memcpy_toio().
> > 
> > https://github.com/jgunthorpe/linux/commits/mlx5_wc/
> > 
> > Jason
> > 
> 
> Thanks, the s390 patches:
> 
> 
> s390: Implement __iowrite32_copy() 
> s390: Use the correct count for __iowrite64_copy() 
> s390: Stop using weak symbols for __iowrite64_copy() 
> 
> Look good to me. I.e. you may add my.
> 
> Acked-by Niklas Schnelle <schnelle@linux.ibm.com>

Great, thanks. I'll post this once rc1 comes out

> I did test your patches too and by accident confirmed again that these
> do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
> zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
> driver to detect Write-Combining as supported. Note, as far as I know
> Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
> a few changes for v6.8-rc1.

OK, but we can still run these two things in parallel?

Jason
Niklas Schnelle Jan. 18, 2024, 4:25 p.m. UTC | #43
On Thu, 2024-01-18 at 12:21 -0400, Jason Gunthorpe wrote:
> On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote:
> > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> > > 
> > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> > > > in bytes like for memcpy_toio().
> > > 
> > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/
> > > 
> > > Jason
> > > 
> > 
> > Thanks, the s390 patches:
> > 
> > 
> > s390: Implement __iowrite32_copy() 
> > s390: Use the correct count for __iowrite64_copy() 
> > s390: Stop using weak symbols for __iowrite64_copy() 
> > 
> > Look good to me. I.e. you may add my.
> > 
> > Acked-by Niklas Schnelle <schnelle@linux.ibm.com>
> 
> Great, thanks. I'll post this once rc1 comes out
> 
> > I did test your patches too and by accident confirmed again that these
> > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
> > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
> > driver to detect Write-Combining as supported. Note, as far as I know
> > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
> > a few changes for v6.8-rc1.
> 
> OK, but we can still run these two things in parallel?
> 
> Jason

Sure, it's not worse without my patch than what we had before and
clearly __iowrite64_copy() has been completely broken for ages without
anyone noticing and is fixed by your patches even without my fix for
the too strict issue in that at least it then copies what it is
supposed to copy even if it does so with 8*8 byte stores.

Thanks,
Niklas
Niklas Schnelle Jan. 19, 2024, 11:52 a.m. UTC | #44
On Thu, 2024-01-18 at 17:25 +0100, Niklas Schnelle wrote:
> On Thu, 2024-01-18 at 12:21 -0400, Jason Gunthorpe wrote:
> > On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote:
> > > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> > > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> > > 
---8<---
> > 
> > 
> > > I did test your patches too and by accident confirmed again that these
> > > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
> > > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
> > > driver to detect Write-Combining as supported. Note, as far as I know
> > > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
> > > a few changes for v6.8-rc1.
> > 
> > OK, but we can still run these two things in parallel?
> > 
> > Jason
> 
> Sure, it's not worse without my patch than what we had before and
> clearly __iowrite64_copy() has been completely broken for ages without
> anyone noticing and is fixed by your patches even without my fix for
> the too strict issue in that at least it then copies what it is
> supposed to copy even if it does so with 8*8 byte stores.
> 
> Thanks,
> Niklas
> 

FYI Alexander ended up doing a second v6.8-rc1 pull request and my
change has now landed in Linus' tree as commit 80df7d6af7f6 ("s390/pci:
fix max size calculation in zpci_memcpy_toio()").

Thanks,
Niklas
Catalin Marinas Jan. 23, 2024, 8:38 p.m. UTC | #45
(fixed Marc's email address)

On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote:
> On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > > 
> > > > #define __raw_writeq __raw_writeq
> > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > {
> > > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > > }
> > > > 
> > > > Instead of
> > > > 
> > > > #define __raw_writeq __raw_writeq
> > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > {
> > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > > }
> > > > 
> > > > ?? Like x86 has.
> > > 
> > > I believe this is for the same reason as doing so in all of our other IO
> > > accessors.
> > > 
> > > We've deliberately ensured that our IO accessors use a single base register
> > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > when reporting a stage-2 abort, which a hypervisor may use for
> > > emulating IO.
> > 
> > Wow, harming bare metal performace to accommodate imperfect emulation
> > sounds like a horrible reason :(
> 
> Having working functionality everywhere is a very good reason. :)
> 
> > So what happens with this patch where IO is done with STP? Are you
> > going to tell me I can't do it because of this?
> 
> I'm not personally going to make that judgement, but it's certainly something
> for Catalin and Will to consider (and I've added Marc in case he has any
> opinion).

Good point, I missed this part. We definitely can't use STP in the I/O
accessors, we'd have a big surprise when running the same code in a
guest with emulated I/O.

If eight STRs without other operations interleaved give us the
write-combining on most CPUs (with Normal NC), we should go with this
instead of STP.
Jason Gunthorpe Jan. 24, 2024, 1:27 a.m. UTC | #46
On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote:
> (fixed Marc's email address)
> 
> On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote:
> > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > > > 
> > > > > #define __raw_writeq __raw_writeq
> > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > {
> > > > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > > > }
> > > > > 
> > > > > Instead of
> > > > > 
> > > > > #define __raw_writeq __raw_writeq
> > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > {
> > > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > > > }
> > > > > 
> > > > > ?? Like x86 has.
> > > > 
> > > > I believe this is for the same reason as doing so in all of our other IO
> > > > accessors.
> > > > 
> > > > We've deliberately ensured that our IO accessors use a single base register
> > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > when reporting a stage-2 abort, which a hypervisor may use for
> > > > emulating IO.
> > > 
> > > Wow, harming bare metal performace to accommodate imperfect emulation
> > > sounds like a horrible reason :(
> > 
> > Having working functionality everywhere is a very good reason. :)
> > 
> > > So what happens with this patch where IO is done with STP? Are you
> > > going to tell me I can't do it because of this?
> > 
> > I'm not personally going to make that judgement, but it's certainly something
> > for Catalin and Will to consider (and I've added Marc in case he has any
> > opinion).
> 
> Good point, I missed this part. We definitely can't use STP in the I/O
> accessors, we'd have a big surprise when running the same code in a
> guest with emulated I/O.

Unfortunately there is no hard distinction in KVM/qemu for "emulated
IO" and "VFIO MMIO". Even devices using VFIO can get funneled down the
emulated path for legitimate reasons.

Again, userspace is already widely deployed using complex IO
accessors. ST4 has been out there for years and at this moment this
patch with STP is already being deployed in production environments.

Even if you refuse to take STP to mainline it *will* be running in VMs
under ARM hypervisors.

What exactly do you think should be done about that?

I thought the guiding mantra here was that any time KVM does not
perfectly emulate bare metal it is a bug. "We can't assume all VMs are
Linux!". Indeed we recently had some long and *very* theoretical
discussions about possible incompatibilties due to kvm changes in the
memory attributes thread.

But here it seems to be just shrugging off something so catastrophic
as performance IO accessors *that are widely deployed already* don't
work reliably in VMs!?!?

"Oh well, don't use them"!?

Damn I hope it crashes the VM and doesn't corrupt the MMIO. I just
debugged a x86 KVM issue with it corrupting VFIO MMIO and that was a
total nightmare to find.

> If eight STRs without other operations interleaved give us the
> write-combining on most CPUs (with Normal NC), we should go with this
> instead of STP.

__iowrite64_copy() is a performance IO accessor, we should not degrade
it because buggy hypervisors might exist that have a problem with STP
or other instructions. :( :(

Anyhow, I know nothing about whatever this issue is - Mark said:

 > FWIW, IIUC the immediate-offset forms *without* writeback can still
 > be reported usefully in ESR_ELx,

Which excludes the post/pre increment forms - but does STP and ST4
also have some kind of problem because the emulation path can't know
about wider than a 64 bit access?

What is the plan for ST64B? Don't get to use that either?

Jason
Marc Zyngier Jan. 24, 2024, 8:26 a.m. UTC | #47
On Wed, 24 Jan 2024 01:27:23 +0000,
Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote:
> > (fixed Marc's email address)
> > 
> > On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote:
> > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > > > > 
> > > > > > #define __raw_writeq __raw_writeq
> > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > > {
> > > > > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > > > > }
> > > > > > 
> > > > > > Instead of
> > > > > > 
> > > > > > #define __raw_writeq __raw_writeq
> > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > > {
> > > > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > > > > }
> > > > > > 
> > > > > > ?? Like x86 has.
> > > > > 
> > > > > I believe this is for the same reason as doing so in all of our other IO
> > > > > accessors.
> > > > > 
> > > > > We've deliberately ensured that our IO accessors use a single base register
> > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > > when reporting a stage-2 abort, which a hypervisor may use for
> > > > > emulating IO.
> > > > 
> > > > Wow, harming bare metal performace to accommodate imperfect emulation
> > > > sounds like a horrible reason :(
> > > 
> > > Having working functionality everywhere is a very good reason. :)
> > > 
> > > > So what happens with this patch where IO is done with STP? Are you
> > > > going to tell me I can't do it because of this?
> > > 
> > > I'm not personally going to make that judgement, but it's certainly something
> > > for Catalin and Will to consider (and I've added Marc in case he has any
> > > opinion).
> > 
> > Good point, I missed this part. We definitely can't use STP in the I/O
> > accessors, we'd have a big surprise when running the same code in a
> > guest with emulated I/O.
> 
> Unfortunately there is no hard distinction in KVM/qemu for "emulated
> IO" and "VFIO MMIO". Even devices using VFIO can get funneled down the
> emulated path for legitimate reasons.
> 
> Again, userspace is already widely deployed using complex IO
> accessors. ST4 has been out there for years and at this moment this
> patch with STP is already being deployed in production environments.

Then you will get to keep the pieces. Good luck.

> Even if you refuse to take STP to mainline it *will* be running in VMs
> under ARM hypervisors.

A hypervisor can't do anything with it. If you cared to read the
architecture, you'd know by now. So your VM will be either dead, or
dog slow, depending on your hypervisor. In any case, I'm sure it will
reflect positively on your favourite software.

> What exactly do you think should be done about that?

Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see
everything slow down. Your call.

> I thought the guiding mantra here was that any time KVM does not
> perfectly emulate bare metal it is a bug. "We can't assume all VMs are
> Linux!". Indeed we recently had some long and *very* theoretical
> discussions about possible incompatibilties due to kvm changes in the
> memory attributes thread.
>
> But here it seems to be just shrugging off something so catastrophic
> as performance IO accessors *that are widely deployed already* don't
> work reliably in VMs!?!?
> 
> "Oh well, don't use them"!?

Exactly.

You can also take this to the ARM architects and get them to update
the architecture to mandate full syndrome information for all
load/store instructions, and you'll get something useful in 2034.
Maybe.

Or you can stop whining and try to get better performance out of what
we have today.

> Damn I hope it crashes the VM and doesn't corrupt the MMIO. I just
> debugged a x86 KVM issue with it corrupting VFIO MMIO and that was a
> total nightmare to find.
> 
> > If eight STRs without other operations interleaved give us the
> > write-combining on most CPUs (with Normal NC), we should go with this
> > instead of STP.
> 
> __iowrite64_copy() is a performance IO accessor, we should not degrade
> it because buggy hypervisors might exist that have a problem with STP
> or other instructions. :( :(
> 
> Anyhow, I know nothing about whatever this issue is - Mark said:
> 
>  > FWIW, IIUC the immediate-offset forms *without* writeback can still
>  > be reported usefully in ESR_ELx,
> 
> Which excludes the post/pre increment forms - but does STP and ST4
> also have some kind of problem because the emulation path can't know
> about wider than a 64 bit access?
> 
> What is the plan for ST64B? Don't get to use that either?

ST64 has full syndrome information, making it possible to emulate.

In any case, there is no magic there. Everything is documented, and
has been for the past... 15 years?

	M.
Mark Rutland Jan. 24, 2024, 11:31 a.m. UTC | #48
On Thu, Jan 18, 2024 at 12:18:43PM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 17, 2024 at 04:05:29PM +0000, Will Deacon wrote:
> > On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote:
> > > On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote:
> > > 
> > > > > I believe this is for the same reason as doing so in all of our other IO
> > > > > accessors.
> > > > > 
> > > > > We've deliberately ensured that our IO accessors use a single base register
> > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO.
> > > > 
> > > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported
> > > > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the
> > > > __raw_write*() functions, e.g.
> > > > 
> > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > {
> > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr));
> > > > }
> > > 
> > > "o" works well in the same simple memcpy loop:
> > > 
> > >         add     x2, x1, w2, uxtw 3
> > >         cmp     x1, x2
> > >         bcs     .L1
> > > .L3:
> > >         ldp     x10, x9, [x1]
> > >         ldp     x8, x7, [x1, 16]
> > >         ldp     x6, x5, [x1, 32]
> > >         ldp     x4, x3, [x1, 48]
> > >         str x10, [x0]
> > >         str x9, [x0, 8]
> > >         str x8, [x0, 16]
> > >         str x7, [x0, 24]
> > >         str x6, [x0, 32]
> > >         str x5, [x0, 40]
> > >         str x4, [x0, 48]
> > >         str x3, [x0, 56]
> > >         add     x1, x1, 64
> > >         add     x0, x0, 64
> > >         cmp     x2, x1
> > >         bhi     .L3
> > > .L1:
> > >         ret
> > > 
> > > Seems intersting to pursue?
> > 
> > I've seen the compiler struggle with plain "o" in the past ("Impossible
> > constraint in asm") so we might want "Qo" if we go down this route.
> 
> I'll stick a patch in 0-day and lets see if there are explosions. "Qo"
> generates the same assembly.
> 
> So to summarize:
>  - We don't like "m" because something about virtualization
>    traps breaks with post/pre indexed forms like:
>      str x1, [x0, 8]!
>    And "m" will allow the compiler to emit that. 
>  - o selects only base register plus offset so it is OK
>  - Q allows base register only (no offset) on some compilers that
>    won't allow o for 0 offset
>  - read side stays at 'r' due to an alternates errata workaround
>    requiring ldar which doesn't accept the same effective address
>    as ldr.

FWIW I've sent a patch out with a commit message describing all of the above
(with you, Catalin, Marc, and Will Cc'd). It hasn't appeared on lore yet, but
it should eventually show up at:

  https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/

Mark.
Mark Rutland Jan. 24, 2024, 11:38 a.m. UTC | #49
On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote:
> (fixed Marc's email address)
> 
> On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote:
> > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > > > 
> > > > > #define __raw_writeq __raw_writeq
> > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > {
> > > > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > > > }
> > > > > 
> > > > > Instead of
> > > > > 
> > > > > #define __raw_writeq __raw_writeq
> > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > {
> > > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > > > }
> > > > > 
> > > > > ?? Like x86 has.
> > > > 
> > > > I believe this is for the same reason as doing so in all of our other IO
> > > > accessors.
> > > > 
> > > > We've deliberately ensured that our IO accessors use a single base register
> > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > when reporting a stage-2 abort, which a hypervisor may use for
> > > > emulating IO.
> > > 
> > > Wow, harming bare metal performace to accommodate imperfect emulation
> > > sounds like a horrible reason :(
> > 
> > Having working functionality everywhere is a very good reason. :)
> > 
> > > So what happens with this patch where IO is done with STP? Are you
> > > going to tell me I can't do it because of this?
> > 
> > I'm not personally going to make that judgement, but it's certainly something
> > for Catalin and Will to consider (and I've added Marc in case he has any
> > opinion).
> 
> Good point, I missed this part. We definitely can't use STP in the I/O
> accessors, we'd have a big surprise when running the same code in a
> guest with emulated I/O.

Just to be clear, that means we should drop this patch ("arm64/io: add
memcpy_toio_64") for now, right?

It would be helpful if we could explciitly say so in direct reply to that:

  https://lore.kernel.org/linux-arm-kernel/c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org/

... to avoid any confusion there.
  
> If eight STRs without other operations interleaved give us the
> write-combining on most CPUs (with Normal NC), we should go with this
> instead of STP.

Agreed; I've sent out a patch to allow the offset addressing at:

  https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/

... and it should be possible to build atop that to use eight STRs.

Mark.
Catalin Marinas Jan. 24, 2024, 12:40 p.m. UTC | #50
On Wed, Jan 24, 2024 at 11:38:07AM +0000, Mark Rutland wrote:
> On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote:
> > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote:
> > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote:
> > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote:
> > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this:
> > > > > > 
> > > > > > #define __raw_writeq __raw_writeq
> > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > > {
> > > > > > 	asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
> > > > > > }
> > > > > > 
> > > > > > Instead of
> > > > > > 
> > > > > > #define __raw_writeq __raw_writeq
> > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> > > > > > {
> > > > > > 	asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr));
> > > > > > }
> > > > > > 
> > > > > > ?? Like x86 has.
> > > > > 
> > > > > I believe this is for the same reason as doing so in all of our other IO
> > > > > accessors.
> > > > > 
> > > > > We've deliberately ensured that our IO accessors use a single base register
> > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT
> > > > > when reporting a stage-2 abort, which a hypervisor may use for
> > > > > emulating IO.
> > > > 
> > > > Wow, harming bare metal performace to accommodate imperfect emulation
> > > > sounds like a horrible reason :(
> > > 
> > > Having working functionality everywhere is a very good reason. :)
> > > 
> > > > So what happens with this patch where IO is done with STP? Are you
> > > > going to tell me I can't do it because of this?
> > > 
> > > I'm not personally going to make that judgement, but it's certainly something
> > > for Catalin and Will to consider (and I've added Marc in case he has any
> > > opinion).
> > 
> > Good point, I missed this part. We definitely can't use STP in the I/O
> > accessors, we'd have a big surprise when running the same code in a
> > guest with emulated I/O.
> 
> Just to be clear, that means we should drop this patch ("arm64/io: add
> memcpy_toio_64") for now, right?

In its current form yes, but that doesn't mean that memcpy_toio_64()
cannot be reworked differently.

> It would be helpful if we could explciitly say so in direct reply to that:
> 
>   https://lore.kernel.org/linux-arm-kernel/c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org/
> 
> ... to avoid any confusion there.

I will.

> > If eight STRs without other operations interleaved give us the
> > write-combining on most CPUs (with Normal NC), we should go with this
> > instead of STP.
> 
> Agreed; I've sent out a patch to allow the offset addressing at:
> 
>   https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/
> 
> ... and it should be possible to build atop that to use eight STRs.

That's great, thanks.
Jason Gunthorpe Jan. 24, 2024, 1:06 p.m. UTC | #51
On Wed, Jan 24, 2024 at 08:26:28AM +0000, Marc Zyngier wrote:

> > Even if you refuse to take STP to mainline it *will* be running in VMs
> > under ARM hypervisors.
> 
> A hypervisor can't do anything with it. If you cared to read the
> architecture, you'd know by now. So your VM will be either dead, or
> dog slow, depending on your hypervisor. In any case, I'm sure it will
> reflect positively on your favourite software.

"Dog slow" is fine. Forcing IO emulation on paths that shouldn't have
it is a VMM problem. KVM & qemu have some issues where this can happen
infrequently for VFIO MMIO maps. It is just important that it be
functionally correct if you get unlucky. The performance path is to
not take a fault in the first place.

> > What exactly do you think should be done about that?
> 
> Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see
> everything slow down. Your call.

The issue Mark raised here was that things like STP/etc cannot work in
VMs, not that they are slow.

The places we are talking about using the STP pattern are all high
performance HW drivers, that do not have any existing SW emulation to
worry about. ie the VMM will be using VFIO to back the MMIO the
acessors target.

So, I'm fine if the answer is that VMM's using VFIO need to use
KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in
userspace if they have a design where VFIO MMIO can infrequently
generate faults. That is all VMM design stuff and has nothing to do
with the kernel.

My objection is this notion we should degrade a performance hot path
in drivers to accomodate an ARM VMM issue that should be solved in the
VMM.

> Or you can stop whining and try to get better performance out of what
> we have today.

"better performance"!?!? You are telling me I have to destroy one of
our important fast paths for HPC workloads to accommodate some
theoretical ARM KVM problem?

Jason
Jason Gunthorpe Jan. 24, 2024, 1:27 p.m. UTC | #52
On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote:

> > Just to be clear, that means we should drop this patch ("arm64/io: add
> > memcpy_toio_64") for now, right?
> 
> In its current form yes, but that doesn't mean that memcpy_toio_64()
> cannot be reworked differently.

I gave up on touching memcpy_toio_64(), it doesn't work very well
because of the weak alignment

Instead I followed your suggestion to fix __iowrite64_copy()

There are only a couple of places that use this API:

drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:     __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
drivers/mtd/nand/raw/mxc_nand.c:        __iowrite32_copy(trg, src, size / 4);
drivers/net/ethernet/amazon/ena/ena_eth_com.c:  __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset,
drivers/net/ethernet/broadcom/bnxt/bnxt.c:                      __iowrite64_copy(db, tx_push_buf, 16);
drivers/net/ethernet/broadcom/bnxt/bnxt.c:                      __iowrite32_copy(db + 4, tx_push_buf + 1,
drivers/net/ethernet/broadcom/bnxt/bnxt.c:                      __iowrite64_copy(db, tx_push_buf, push_len);
drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c: __iowrite32_copy(bp->bar0 + bar_offset, data, msg_len / 4);
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c:        __iowrite64_copy(ring->tqp->mem_base, desc,
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c:        __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET,
drivers/net/ethernet/mellanox/mlx4/en_tx.c:     __iowrite64_copy(dst, src, bytecnt / 8);
drivers/net/ethernet/myricom/myri10ge/myri10ge.c:#define myri10ge_pio_copy(to,from,size) __iowrite64_copy(to,from,size/8)
drivers/net/ethernet/sfc/tx.c:  __iowrite64_copy(*piobuf, data, block_len >> 3);
drivers/net/ethernet/sfc/tx.c:          __iowrite64_copy(*piobuf, copy_buf->buf,
drivers/net/ethernet/sfc/tx.c:          __iowrite64_copy(piobuf, copy_buf->buf,
drivers/net/ethernet/sfc/tx.c:          __iowrite64_copy(tx_queue->piobuf, skb->data,
drivers/net/wireless/mediatek/mt76/mmio.c:      __iowrite32_copy(dev->mmio.regs + offset, data, DIV_ROUND_UP(len, 4));
drivers/net/wireless/ralink/rt2x00/rt2x00mmio.h:        __iowrite32_copy(rt2x00dev->csr.base + offset, value, length >> 2);
drivers/remoteproc/mtk_scp_ipi.c:       __iowrite32_copy(dst + i, src + i, (len - i) / 4);
drivers/rpmsg/qcom_glink_rpm.c:         __iowrite32_copy(pipe->fifo + head, data,
drivers/rpmsg/qcom_glink_rpm.c:         __iowrite32_copy(pipe->fifo, data + len,
drivers/rpmsg/qcom_smd.c:               __iowrite32_copy(dst, src, count / sizeof(u32));
drivers/scsi/lpfc/lpfc_compat.h:        __iowrite32_copy(dest, src, bytes / sizeof(uint32_t));
drivers/slimbus/qcom-ctrl.c:    __iowrite32_copy(ctrl->base + tx_reg, buf, count);
drivers/soc/qcom/qcom_aoss.c:   __iowrite32_copy(qmp->msgram + qmp->offset + sizeof(u32),
drivers/soc/qcom/spm.c: __iowrite32_copy(addr, drv->reg_data->seq,
drivers/spi/spi-hisi-sfc-v3xx.c:                __iowrite32_copy(to, from, words);
sound/soc/intel/atom/sst/sst_loader.c:  __iowrite32_copy(dst, src, count / 4);
sound/soc/sof/iomem-utils.c:    __iowrite32_copy(dest, src, m);

At least the networking ones I recognize as performance paths, we
don't want to degrade them.

__iowrite64_copy() has a sufficient API that the compiler can inline
the STP block as this patch did.

I experimented with having memcpy_toio_64() invoke __iowrite64_copy(),
but it did not look very nice. Maybe there is a possible performance
win there, I don't know.

> > > If eight STRs without other operations interleaved give us the
> > > write-combining on most CPUs (with Normal NC), we should go with this
> > > instead of STP.
> > 
> > Agreed; I've sent out a patch to allow the offset addressing at:
> > 
> >   https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/
> > 
> > ... and it should be possible to build atop that to use eight STRs.
> 
> That's great, thanks.

It is a nice patch but it does not really help this problem. The
compiler cannot be trusted to use the new writeq() properly, eg clang
doesn't optimize the new constraint at all.

Regardless this has to be a fixed inline assembly block of either STR
or STP.

Jason
Marc Zyngier Jan. 24, 2024, 1:32 p.m. UTC | #53
On Wed, 24 Jan 2024 13:06:38 +0000,
Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> On Wed, Jan 24, 2024 at 08:26:28AM +0000, Marc Zyngier wrote:
> 
> > > Even if you refuse to take STP to mainline it *will* be running in VMs
> > > under ARM hypervisors.
> > 
> > A hypervisor can't do anything with it. If you cared to read the
> > architecture, you'd know by now. So your VM will be either dead, or
> > dog slow, depending on your hypervisor. In any case, I'm sure it will
> > reflect positively on your favourite software.
> 
> "Dog slow" is fine. Forcing IO emulation on paths that shouldn't have
> it is a VMM problem. KVM & qemu have some issues where this can happen
> infrequently for VFIO MMIO maps. It is just important that it be
> functionally correct if you get unlucky. The performance path is to
> not take a fault in the first place.
> 
> > > What exactly do you think should be done about that?
> > 
> > Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see
> > everything slow down. Your call.
> 
> The issue Mark raised here was that things like STP/etc cannot work in
> VMs, not that they are slow.
>
> The places we are talking about using the STP pattern are all high
> performance HW drivers, that do not have any existing SW emulation to
> worry about. ie the VMM will be using VFIO to back the MMIO the
> acessors target.
> 
> So, I'm fine if the answer is that VMM's using VFIO need to use
> KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in
> userspace if they have a design where VFIO MMIO can infrequently
> generate faults. That is all VMM design stuff and has nothing to do
> with the kernel.

Which will work a treat with things like CCA, I'm sure.

> 
> My objection is this notion we should degrade a performance hot path
> in drivers to accomodate an ARM VMM issue that should be solved in the
> VMM.
> 
> > Or you can stop whining and try to get better performance out of what
> > we have today.
> 
> "better performance"!?!? You are telling me I have to destroy one of
> our important fast paths for HPC workloads to accommodate some
> theoretical ARM KVM problem?

What I'm saying is that there are way to make it better without
breaking your particular toy workload which, as important as it may be
to *you*, doesn't cover everybody's use case.

Mark did post such an example that has the potential of having that
improvement. I'd suggest that you give it a go.

But your attitude of "who cares if it breaks as long as it works for
me" is not something I can adhere to.

	M.
Jason Gunthorpe Jan. 24, 2024, 3:52 p.m. UTC | #54
On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote:
> > So, I'm fine if the answer is that VMM's using VFIO need to use
> > KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in
> > userspace if they have a design where VFIO MMIO can infrequently
> > generate faults. That is all VMM design stuff and has nothing to do
> > with the kernel.
> 
> Which will work a treat with things like CCA, I'm sure.

CCA shouldn't have emulation or trapping on the MMIO mappings.

> > > Or you can stop whining and try to get better performance out of what
> > > we have today.
> > 
> > "better performance"!?!? You are telling me I have to destroy one of
> > our important fast paths for HPC workloads to accommodate some
> > theoretical ARM KVM problem?
> 
> What I'm saying is that there are way to make it better without
> breaking your particular toy workload which, as important as it may be
> to *you*, doesn't cover everybody's use case.

Please, do we need the "toy" stuff? The industry is spending 10's of
billions of dollars right now to run "my workload". Currently not
widely on ARM servers, but we are all hoping ARM can succeed here,
right?

I still don't know what you mean by "better". There are several issues
now

1) This series, where WC doesn't trigger on new cores. Maybe 8x STR
   will fix it, but it is not better performance wise than 4x STP.

2) Userspace does ST4 to MMIO memory, and the VMM can't explode
   because of this. Replacing the ST4 with 8x STR is NOT better,
   that would be a big performance downside, especially for the
   quirky hi-silicon hardware.

3) The other series changing the S2 so that WC can work in the VM

> Mark did post such an example that has the potential of having that
> improvement. I'd suggest that you give it a go.

Mark's patch doesn't help this, I've already written and evaluated his
patch last week. Unfortunately it needs to be done with explicit
inline assembly either STP or STR blocks.

I don't know if the 8x STR is workable or not. I need to get someone
to test it, but even if it is the userspace IO for this HW will
continue to use ST4.

So, regardless of the kernel decision, if someone is going to put this
HW into a VM then their VMM needs to do *something* to ensure that the
VMM does not malfunction when the VM issues STP/ST4 to the VFIO MMIO.

There are good choices for the VMM here - ensuring it never has to
process a S2 VFIO MMIO fault, always resuming and never emulating VFIO
MMIO, or correctly handling an emulated S2 fault from a STP/ST4
instruction via instruction parsing.

Therefore we can assume that working VMM's will exist. Indeed I would
go farther and say that mlx5 HW in a VM must have a working VMM.

So the question is only how pessimistic should the arch code for
__iowrite64_copy() be. My view is that it is only used in a small
number of drivers and if a VMM creates vPCI devices for those drivers
then the VMM should be expected to bring proper vMMIO support too.

I do not like this notion that all drivers using __iowrite64_copy()
should have sub-optimal bare metal performance because a VMM *might*
exist that has a problem.

> But your attitude of "who cares if it breaks as long as it works for
> me" is not something I can adhere to.

In my world failing to reach performance is a "break" as well.

So you have a server that is "broken" because its performance is
degraded vs an unknown VMM that is "broken" because it wants to
emulate IO (without implementing instruction parsing) for a device
with a __iowrite64_copy() using driver.

My server really does exist. I'm not so sure about the other case.

Jason
Catalin Marinas Jan. 24, 2024, 5:22 p.m. UTC | #55
On Wed, Jan 24, 2024 at 09:27:19AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote:
> 
> > > Just to be clear, that means we should drop this patch ("arm64/io: add
> > > memcpy_toio_64") for now, right?
> > 
> > In its current form yes, but that doesn't mean that memcpy_toio_64()
> > cannot be reworked differently.
> 
> I gave up on touching memcpy_toio_64(), it doesn't work very well
> because of the weak alignment
> 
> Instead I followed your suggestion to fix __iowrite64_copy()

I forgot the details. Was it to introduce an __iowrite512_copy()
function or to simply use __iowrite64_copy() with a count of 8?

> There are only a couple of places that use this API:
[...]
> __iowrite64_copy() has a sufficient API that the compiler can inline
> the STP block as this patch did.
> 
> I experimented with having memcpy_toio_64() invoke __iowrite64_copy(),
> but it did not look very nice. Maybe there is a possible performance
> win there, I don't know.

Just invoking __iowrite64_copy() with a count of 8 wouldn't work well
even if we have the writeq generating STR with an offset (well, it also
increments the pointers, so I don't think Mark's optimisation would
help). The copy implies loads and these would be interleaved with stores
and potentially get in the way of write combining. An
__iowrite512_copy() or maybe even an optimised __iowrite64_copy() for
count 8 could do the loads first followed by the stores. You can use a
special path in __iowrite64_copy() with __builtin_contant_p().

You can try with an arm64 specific __iowrite64_copy() and see how it
goes (together with Mark's patch):

void __iowrite64_copy(void __iomem *to, const void *from,
		      size_t count)
{
	u64 __iomem *dst = to;
	const u64 *src = from;
	const u64 *end = src + count;

	/*
	 * Try a 64-byte write, the CPUs tend to write-combine them.
	 */
	if (__builtin_contant_p(count) && count == 8) {
		__raw_writeq(*src, dst);
		__raw_writeq(*(src + 1), dst + 1);
		__raw_writeq(*(src + 2), dst + 2);
		__raw_writeq(*(src + 3), dst + 3);
		__raw_writeq(*(src + 4), dst + 4);
		__raw_writeq(*(src + 5), dst + 5);
		__raw_writeq(*(src + 6), dst + 6);
		__raw_writeq(*(src + 7), dst + 7);
		return;
	}

	while (src < end)
		__raw_writeq(*src++, dst++);
}
EXPORT_SYMBOL_GPL(__iowrite64_copy);

What we don't have is inlining of __iowrite64_copy() but if we need that
we can move away from a weak symbol to a static inline.

Give this a go and see if it you get write-combining in your hardware.
If the loads interleaves with stores get in the way, maybe we can resort
to inline asm.
Catalin Marinas Jan. 24, 2024, 5:54 p.m. UTC | #56
On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote:
> > What I'm saying is that there are way to make it better without
> > breaking your particular toy workload which, as important as it may be
> > to *you*, doesn't cover everybody's use case.
> 
> Please, do we need the "toy" stuff? The industry is spending 10's of
> billions of dollars right now to run "my workload". Currently not
> widely on ARM servers, but we are all hoping ARM can succeed here,
> right?
> 
> I still don't know what you mean by "better". There are several issues
> now
> 
> 1) This series, where WC doesn't trigger on new cores. Maybe 8x STR
>    will fix it, but it is not better performance wise than 4x STP.

It would be good to know. If the performance difference is significant,
we can revisit. I'm not keen on using alternatives here without backing
it up by numbers (do we even have a way to detect whether Linux is
running natively or not? we may have to invent something).

> 2) Userspace does ST4 to MMIO memory, and the VMM can't explode
>    because of this. Replacing the ST4 with 8x STR is NOT better,
>    that would be a big performance downside, especially for the
>    quirky hi-silicon hardware.

I was hoping KVM injects an error into the guest rather than killing it
but at a quick look I couldn't find it. The kvm_handle_guest_abort() ->
io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions()
only understands handled or not (like 1 or 0). Well, maybe I didn't look
deep enough.
Jason Gunthorpe Jan. 24, 2024, 7:26 p.m. UTC | #57
On Wed, Jan 24, 2024 at 05:22:05PM +0000, Catalin Marinas wrote:
> On Wed, Jan 24, 2024 at 09:27:19AM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote:
> > 
> > > > Just to be clear, that means we should drop this patch ("arm64/io: add
> > > > memcpy_toio_64") for now, right?
> > > 
> > > In its current form yes, but that doesn't mean that memcpy_toio_64()
> > > cannot be reworked differently.
> > 
> > I gave up on touching memcpy_toio_64(), it doesn't work very well
> > because of the weak alignment
> > 
> > Instead I followed your suggestion to fix __iowrite64_copy()
> 
> I forgot the details. Was it to introduce an __iowrite512_copy()
> function or to simply use __iowrite64_copy() with a count of 8?

Count of 8

> Just invoking __iowrite64_copy() with a count of 8 wouldn't work well
> even if we have the writeq generating STR with an offset (well, it also
> increments the pointers, so I don't think Mark's optimisation would
> help). The copy implies loads and these would be interleaved with stores
> and potentially get in the way of write combining. An
> __iowrite512_copy() or maybe even an optimised __iowrite64_copy() for
> count 8 could do the loads first followed by the stores. You can use a
> special path in __iowrite64_copy() with __builtin_contant_p().

I did exactly the latter like this:

static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to,
						 const u64 *from, size_t count)
{
	switch (count) {
	case 8:
		asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
			     "stp %x2, %x3, [%8, #16 * 1]\n"
			     "stp %x4, %x5, [%8, #16 * 2]\n"
			     "stp %x6, %x7, [%8, #16 * 3]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
	case 4:
		asm volatile("stp %x0, %x1, [%4, #16 * 0]\n"
			     "stp %x2, %x3, [%4, #16 * 1]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "r"(to));
		break;
	case 2:
		asm volatile("stp %x0, %x1, [%2, #16 * 0]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
	case 1:
		__raw_writeq(*from, to);
		break;
	default:
		BUILD_BUG();
	}
}

void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);

static inline void __const_iowrite64_copy(void __iomem *to, const void *from,
					  size_t count)
{
	if (count == 8 || count == 4 || count == 2 || count == 1) {
		__const_memcpy_toio_aligned64(to, from, count);
		dgh();
	} else {
		__iowrite64_copy_full(to, from, count);
	}
}

#define __iowrite64_copy(to, from, count)                  \
	(__builtin_constant_p(count) ?                     \
		 __const_iowrite64_copy(to, from, count) : \
		 __iowrite64_copy_full(to, from, count))

And the out of line __iowrite64_copy_full() generates good
assembly that loops 8/4/2/1 sized blocks.

I was going to send it out yesterday but am waiting for some
conclusion on the STP.

https://github.com/jgunthorpe/linux/commits/mlx5_wc/

> void __iowrite64_copy(void __iomem *to, const void *from,
> 		      size_t count)
> {
> 	u64 __iomem *dst = to;
> 	const u64 *src = from;
> 	const u64 *end = src + count;
> 
> 	/*
> 	 * Try a 64-byte write, the CPUs tend to write-combine them.
> 	 */
> 	if (__builtin_contant_p(count) && count == 8) {
> 		__raw_writeq(*src, dst);
> 		__raw_writeq(*(src + 1), dst + 1);
> 		__raw_writeq(*(src + 2), dst + 2);
> 		__raw_writeq(*(src + 3), dst + 3);
> 		__raw_writeq(*(src + 4), dst + 4);
> 		__raw_writeq(*(src + 5), dst + 5);
> 		__raw_writeq(*(src + 6), dst + 6);
> 		__raw_writeq(*(src + 7), dst + 7);
> 		return;
> 	}

I already looked at this, clang with the "Qo" constraint does:

ffffffc08086e6ec:       f9400029        ldr     x9, [x1]
ffffffc08086e6f0:       91002008        add     x8, x0, #0x8
ffffffc08086e6f4:       f9000009        str     x9, [x0]
ffffffc08086e6f8:       f9400429        ldr     x9, [x1, #8]
ffffffc08086e6fc:       f9000109        str     x9, [x8]
ffffffc08086e700:       91004008        add     x8, x0, #0x10
ffffffc08086e704:       f9400829        ldr     x9, [x1, #16]
ffffffc08086e708:       f9000109        str     x9, [x8]
ffffffc08086e70c:       91006008        add     x8, x0, #0x18
ffffffc08086e710:       f9400c29        ldr     x9, [x1, #24]
ffffffc08086e714:       f9000109        str     x9, [x8]
ffffffc08086e718:       91008008        add     x8, x0, #0x20
ffffffc08086e71c:       f9401029        ldr     x9, [x1, #32]
ffffffc08086e720:       f9000109        str     x9, [x8]
ffffffc08086e724:       9100a008        add     x8, x0, #0x28
ffffffc08086e728:       f9401429        ldr     x9, [x1, #40]
ffffffc08086e72c:       f9000109        str     x9, [x8]
ffffffc08086e730:       9100c008        add     x8, x0, #0x30
ffffffc08086e734:       f9401829        ldr     x9, [x1, #48]
ffffffc08086e738:       f9000109        str     x9, [x8]
ffffffc08086e73c:       f9401c28        ldr     x8, [x1, #56]
ffffffc08086e740:       9100e009        add     x9, x0, #0x38
ffffffc08086e744:       f9000128        str     x8, [x9]

Which is not good. Gcc is a better, but not perfect.

> What we don't have is inlining of __iowrite64_copy() but if we need that
> we can move away from a weak symbol to a static inline.

Yes I did this as well. It helps s390 and x86 nicely too.

> Give this a go and see if it you get write-combining in your hardware.
> If the loads interleaves with stores get in the way, maybe we can resort
> to inline asm.

For reference the actual assembly (see post_send_nop()) that fails is:

   13534:       d503201f        nop
   13538:       93407ea1        sxtw    x1, w21
   1353c:       f100403f        cmp     x1, #0x10
   13540:       54000488        b.hi    135d0 <post_send_nop.isra.0+0x260>  // b.pmore
   13544:       a9408a63        ldp     x3, x2, [x19, #8]
   13548:       f84086c4        ldr     x4, [x22], #8
   1354c:       f9400042        ldr     x2, [x2]
   13550:       8b030283        add     x3, x20, x3
   13554:       8b030042        add     x2, x2, x3
   13558:       f9000044        str     x4, [x2]
   1355c:       91002294        add     x20, x20, #0x8
   13560:       11000ab5        add     w21, w21, #0x2
   13564:       f101029f        cmp     x20, #0x40
   13568:       54fffe81        b.ne    13538 <post_send_nop.isra.0+0x1c8>  // b.any
   1356c:       d50320df        hint    #0x6

Not very good code the compiler wrote (the main issue is that it
reloads the dest pointer every iteration), but still, all those loads
are coming from memory that was recently touched so should be in-cache
most of the time. So it isn't like we are sitting around waiting for a
lengthy dcache fill and timing out the WC buffer.

However, it is 136 instructions, so it feels like the issue may be the
write combining buffer auto-flushes in less. Maybe it auto-flushes
after 128/64/32/16/8 cycles now. I know there has been a tension to
reduce WC latency vs maximum aggregation.

The suggestion that it should not have any interleaving instructions
and use STP came from our CPU architecture team.

The assembly I have been able to get tested from this series that did
works is this:

ffffffc08086ec84:       d5033e9f        dsb     st
ffffffc08086ec88:       f941de6b        ldr     x11, [x19, #952]
ffffffc08086ec8c:       f941da6c        ldr     x12, [x19, #944]
ffffffc08086ec90:       f940016b        ldr     x11, [x11]
ffffffc08086ec94:       8b0c016b        add     x11, x11, x12
ffffffc08086ec98:       a9002969        stp     x9, x10, [x11]
ffffffc08086ec9c:       a9012168        stp     x8, x8, [x11, #16]
ffffffc08086eca0:       a9022168        stp     x8, x8, [x11, #32]
ffffffc08086eca4:       a9032168        stp     x8, x8, [x11, #48]
ffffffc08086eca8:       d50320df        hint    #0x6

The revised __iowrite64_copy() version also creates this assembly.

The ST4 based thing in userspace also works.

Remember there are two related topics here.. mlx5 would like high
frequency of large TLP generation, but doesn't care about raw
performance. If the 24 instructions clang generates does that then
great.

hns/broadcom/others need the large TLP and care about performance. In
that case the stp block is the best we can do in the kernel as st4 is
off the table.

I would like the architecture code to do a good job for performance
since it is a generic API for all drivers.

Regarding the 8x STR option, I have to get that tested.

Jason
Jason Gunthorpe Jan. 25, 2024, 1:29 a.m. UTC | #58
On Wed, Jan 24, 2024 at 05:54:49PM +0000, Catalin Marinas wrote:
> On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote:
> > > What I'm saying is that there are way to make it better without
> > > breaking your particular toy workload which, as important as it may be
> > > to *you*, doesn't cover everybody's use case.
> > 
> > Please, do we need the "toy" stuff? The industry is spending 10's of
> > billions of dollars right now to run "my workload". Currently not
> > widely on ARM servers, but we are all hoping ARM can succeed here,
> > right?
> > 
> > I still don't know what you mean by "better". There are several issues
> > now
> > 
> > 1) This series, where WC doesn't trigger on new cores. Maybe 8x STR
> >    will fix it, but it is not better performance wise than 4x STP.
> 
> It would be good to know. If the performance difference is significant,
> we can revisit. I'm not keen on using alternatives here without backing
> it up by numbers (do we even have a way to detect whether Linux is
> running natively or not? we may have to invent something).

I don't have a setup to measure performance, mlx5 is not using it in a
performance path. The other drivers in the tree are. I feel bad about
hobbling them.

> > 2) Userspace does ST4 to MMIO memory, and the VMM can't explode
> >    because of this. Replacing the ST4 with 8x STR is NOT better,
> >    that would be a big performance downside, especially for the
> >    quirky hi-silicon hardware.
> 
> I was hoping KVM injects an error into the guest rather than killing it
> but at a quick look I couldn't find it. The kvm_handle_guest_abort() ->
> io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions()
> only understands handled or not (like 1 or 0). Well, maybe I didn't look
> deep enough.

It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and
then when it gets a NISV it always converts it to a data abort to the
guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a
correctness issue, not a 'VM userspace can crash the VMM' security
problem.

The reason we've never seen this fault in any of our testing is
because the whole system is designed to have qemu back vMMIO space
that is under hot path use by only a VFIO memslot. ie it never drops
the memslot and forces emulation. (KVM has no issue to handle a S2
abort if a memslot is present, obviously)

VFIO IO emulation is used to cover corner cases and establish a slow
technical correctness. It is not fast path. Avoid this if you want any
sort of performance.

Thus, IMHO, doing IO emulation for VFIO that doesn't support all the
instructions actual existing SW uses to do IO is hard to justify. We
are already on a slow path that only exists for technical correctness,
it should be perfect. It is perfect on x86 because x86 KVM does SW
instruction decode and emulation. ARM could too, but doesn't.

To put it in a practical example, I predict that if someone steps
outside our "engineered" box and runs a 64k page size hypervisor
kernel with a mlx5 device that is not engineered for 64K page size
they will get a MMIO BAR layout where the 64k page that covers the MSI
items will overlap with hot path addresses. The existing user space
stack could issue ST4's to hot path addresses within that emulated 64k
of vMMIO and explode. 4k page size hypervisors avoid this because the
typical mlx5 device has a BAR layout with a 4k granule in mind.

Jason
Jason Gunthorpe Jan. 25, 2024, 5:43 p.m. UTC | #59
On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote:

> The suggestion that it should not have any interleaving instructions
> and use STP came from our CPU architecture team.

I got some more details here.

They point to the ARM publication about write combining

https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf

specifically to the example code using 4x 128 bit NEON stores.

They point at the actual CPU design and say it is optimized for 128
bit stores (STP and ST4 included, it seems).

64 bit stores trigger some different behavior.

I have no way to know if it will be OK for other drivers that expect
this to be a performance path in the kernel.

Are you *sure* you want to do this str version? If it works for mlx5 I
will send the patch and the other companies can come later with
performance data.

Jason
Catalin Marinas Jan. 26, 2024, 2:56 p.m. UTC | #60
On Thu, Jan 25, 2024 at 01:43:33PM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote:
> > The suggestion that it should not have any interleaving instructions
> > and use STP came from our CPU architecture team.
> 
> I got some more details here.
> 
> They point to the ARM publication about write combining
> 
> https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf
> 
> specifically to the example code using 4x 128 bit NEON stores.

That's an example but this document doesn't make any statements about
64-bit writes.

> They point at the actual CPU design and say it is optimized for 128
> bit stores (STP and ST4 included, it seems).
> 
> 64 bit stores trigger some different behavior.

This is highly microarchitecture specific. The best bet in the future is
the ST64B instruction but in the meantime it's pretty much guessing.

> I have no way to know if it will be OK for other drivers that expect
> this to be a performance path in the kernel.
> 
> Are you *sure* you want to do this str version? If it works for mlx5 I
> will send the patch and the other companies can come later with
> performance data.

Yeah, I'd stick to the STR for now, it makes things simpler as we don't
have to care about what emulation does.
Jason Gunthorpe Jan. 26, 2024, 3:24 p.m. UTC | #61
On Fri, Jan 26, 2024 at 02:56:31PM +0000, Catalin Marinas wrote:
> On Thu, Jan 25, 2024 at 01:43:33PM -0400, Jason Gunthorpe wrote:
> > On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote:
> > > The suggestion that it should not have any interleaving instructions
> > > and use STP came from our CPU architecture team.
> > 
> > I got some more details here.
> > 
> > They point to the ARM publication about write combining
> > 
> > https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf
> > 
> > specifically to the example code using 4x 128 bit NEON stores.
> 
> That's an example but this document doesn't make any statements about
> 64-bit writes.

ARM has consistently left this area as informally specified by
documents like this. This document arose specifically because a
certain implementation choose an architecturally complaint way to do
write combining but it was informally decided that Linux upstream
would not support it. These gaps were discovered during DOE's path
finding Astra supercomputer program about 6 years ago during testing
with mlx5 devices.

The document was specifically intended to guide HPC implementations
expecting to run inside the Linux ecosystem.

Based on all this I'm not surprised that the ecosystem has decided to
focus primarily on consecutive 128 bit writes, absent any other
guidance designers are following what information they have.

Jason
Catalin Marinas Jan. 26, 2024, 4:15 p.m. UTC | #62
On Wed, Jan 24, 2024 at 09:29:24PM -0400, Jason Gunthorpe wrote:
> On Wed, Jan 24, 2024 at 05:54:49PM +0000, Catalin Marinas wrote:
> > On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote:
> > > 2) Userspace does ST4 to MMIO memory, and the VMM can't explode
> > >    because of this. Replacing the ST4 with 8x STR is NOT better,
> > >    that would be a big performance downside, especially for the
> > >    quirky hi-silicon hardware.
> > 
> > I was hoping KVM injects an error into the guest rather than killing it
> > but at a quick look I couldn't find it. The kvm_handle_guest_abort() ->
> > io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions()
> > only understands handled or not (like 1 or 0). Well, maybe I didn't look
> > deep enough.
> 
> It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and
> then when it gets a NISV it always converts it to a data abort to the
> guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a
> correctness issue, not a 'VM userspace can crash the VMM' security
> problem.

The VMM wasn't my concern but rather a guest getting killed or not
functioning correctly (user app killed).

> Thus, IMHO, doing IO emulation for VFIO that doesn't support all the
> instructions actual existing SW uses to do IO is hard to justify. We
> are already on a slow path that only exists for technical correctness,
> it should be perfect. It is perfect on x86 because x86 KVM does SW
> instruction decode and emulation. ARM could too, but doesn't.

It could fall back to instruction decode, either in KVM or the VMM
(strong preference for the latter), but I'd only do this if it's
justified.

I don't think the issue here is VFIO, I doubt we'd ever see emulation
for hardware like mlx5. But we are changing generic kernel functions
like memcpy_toio/__iowrite64_copy() that end up being used in other
drivers (e.g. USB, UART) for emulated devices. If we can keep these
functions as generic as possible for both guest and native runs, that's
great. If the performance difference is significant, we can revisit.
Jason Gunthorpe Jan. 26, 2024, 5:09 p.m. UTC | #63
On Fri, Jan 26, 2024 at 04:15:16PM +0000, Catalin Marinas wrote:
> > It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and
> > then when it gets a NISV it always converts it to a data abort to the
> > guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a
> > correctness issue, not a 'VM userspace can crash the VMM' security
> > problem.
> 
> The VMM wasn't my concern but rather a guest getting killed or not
> functioning correctly (user app killed).

Right, hopefully it is the latter.

> > Thus, IMHO, doing IO emulation for VFIO that doesn't support all the
> > instructions actual existing SW uses to do IO is hard to justify. We
> > are already on a slow path that only exists for technical correctness,
> > it should be perfect. It is perfect on x86 because x86 KVM does SW
> > instruction decode and emulation. ARM could too, but doesn't.
> 
> It could fall back to instruction decode, either in KVM or the VMM
> (strong preference for the latter), but I'd only do this if it's
> justified.

From a performance perspective, if the VMM is doing pure emulation and
wants to memcpy lots of data to emulated vMMIO I'd look at it like this:

  1xST4 transfers 512 bits and requires one vmexit and one
  instruction parse.

  4xSTP is four instruction parses and four vmexits

  8xSTR is no instruction parses and eight vmexits

The instruction parse is not pure overhead, it saves on vmexit's which
are expensive things (at least on x86). I don't have a sense how this
stacks up on arm, but I wouldn't jump to it being horribly
non-performing.

> I don't think the issue here is VFIO, I doubt we'd ever see emulation
> for hardware like mlx5.

Sadly no :(

It can happen in non-production corner cases due to the VFIO MSI emulation.

There is a qemu bug prior to 8.something that causes it to happen at
random, with VFIO, rarely.

There is a non-prodcution debug mode in qemu where all VFIO MMIO is
trapped. The qemu expectation is that this is functionally identical
to non-trapping. (The E in qemu is emulation after all, kind of a core
reason it exists)

Finally, we do actually have an internal simulation tool that does
software emulate mlx5 HW without VFIO.

> But we are changing generic kernel functions
> like memcpy_toio/__iowrite64_copy() that end up being used in other
> drivers (e.g. USB, UART) for emulated devices. 

I didn't touch memcpy_toio, I think given this problem we shouldn't
touch it. I only touched __iowriteXX_copy() which did not look like it
is being used in any drivers with emulation.

Even if I got this wrong we can revert any impacted drivers to use
memcpy_toio() instead.

Jason
Niklas Schnelle Feb. 16, 2024, 12:09 p.m. UTC | #64
On Thu, 2024-01-18 at 16:59 +0100, Niklas Schnelle wrote:
> On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> > 
> > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> > > in bytes like for memcpy_toio().
> > 
> > https://github.com/jgunthorpe/linux/commits/mlx5_wc/
> > 
> > Jason
> > 
> 
> Thanks, the s390 patches:
> 
> 
> s390: Implement __iowrite32_copy() 
> s390: Use the correct count for __iowrite64_copy() 
> s390: Stop using weak symbols for __iowrite64_copy() 
> 
> Look good to me. I.e. you may add my.
> 
> Acked-by Niklas Schnelle <schnelle@linux.ibm.com>
> 
> I did test your patches too and by accident confirmed again that these
> do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
> zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
> driver to detect Write-Combining as supported. Note, as far as I know
> Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
> a few changes for v6.8-rc1.
> 
> Thanks,
> Niklas
> 


Hi Jason,

Since I haven't seen a new version of this yet I was just wondering
what the current plan is. If this is somewhat stuck would you be
willing to send the "s390: Use the correct count for
__iowrite64_copy()" patch separately, preferably with "s390/pci:"
prefix?

Thanks,
Niklas
Jason Gunthorpe Feb. 16, 2024, 12:39 p.m. UTC | #65
On Fri, Feb 16, 2024 at 01:09:10PM +0100, Niklas Schnelle wrote:
> On Thu, 2024-01-18 at 16:59 +0100, Niklas Schnelle wrote:
> > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote:
> > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote:
> > > 
> > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is
> > > > in bytes like for memcpy_toio().
> > > 
> > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/
> > > 
> > > Jason
> > > 
> > 
> > Thanks, the s390 patches:
> > 
> > 
> > s390: Implement __iowrite32_copy() 
> > s390: Use the correct count for __iowrite64_copy() 
> > s390: Stop using weak symbols for __iowrite64_copy() 
> > 
> > Look good to me. I.e. you may add my.
> > 
> > Acked-by Niklas Schnelle <schnelle@linux.ibm.com>
> > 
> > I did test your patches too and by accident confirmed again that these
> > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in
> > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5
> > driver to detect Write-Combining as supported. Note, as far as I know
> > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite
> > a few changes for v6.8-rc1.
> 
> Since I haven't seen a new version of this yet I was just wondering
> what the current plan is. If this is somewhat stuck would you be
> willing to send the "s390: Use the correct count for
> __iowrite64_copy()" patch separately, preferably with "s390/pci:"
> prefix?

I have been waiting for test cycles on the right ARM64 system and it
is taking a long time :(

I'll send your bit so you can take it to -rc

Thanks,
Jason
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 3b694511b98f..73ab91913790 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -135,6 +135,26 @@  extern void __memset_io(volatile void __iomem *, int, size_t);
 #define memcpy_fromio(a,c,l)	__memcpy_fromio((a),(c),(l))
 #define memcpy_toio(c,a,l)	__memcpy_toio((c),(a),(l))
 
+static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from)
+{
+	const u64 *from64 = from;
+
+	/*
+	 * Newer ARM core have sensitive write combining buffers, it is
+	 * important that the stores be contiguous blocks of store instructions.
+	 * Normal memcpy does not work reliably.
+	 */
+	asm volatile("stp %x0, %x1, [%8, #16 * 0]\n"
+		     "stp %x2, %x3, [%8, #16 * 1]\n"
+		     "stp %x4, %x5, [%8, #16 * 2]\n"
+		     "stp %x6, %x7, [%8, #16 * 3]\n"
+		     :
+		     : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]),
+		       "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]),
+		       "rZ"(from64[6]), "rZ"(from64[7]), "r"(to));
+}
+#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from)
+
 /*
  * I/O memory mapping functions.
  */
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index bac63e874c7b..2d6d60ed2128 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1202,6 +1202,36 @@  static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
 }
 #endif
 
+#ifndef memcpy_toio_64
+#define memcpy_toio_64 memcpy_toio_64
+/**
+ * memcpy_toio_64	Copy 64 bytes of data into I/O memory
+ * @dst:		The (I/O memory) destination for the copy
+ * @src:		The (RAM) source for the data
+ * @count:		The number of bytes to copy
+ *
+ * dst and src must be aligned to 8 bytes. This operation copies exactly 64
+ * bytes. It is intended to be used for write combining IO memory. The
+ * architecture should provide an implementation that has a high chance of
+ * generating a single combined transaction.
+ */
+static inline void memcpy_toio_64(volatile void __iomem *addr,
+				  const void *buffer)
+{
+	unsigned int i = 0;
+
+#if BITS_PER_LONG == 64
+	for (; i != 8; i++)
+		__raw_writeq(((const u64 *)buffer)[i],
+			     ((u64 __iomem *)addr) + i);
+#else
+	for (; i != 16; i++)
+		__raw_writel(((const u32 *)buffer)[i],
+			     ((u32 __iomem *)addr) + i);
+#endif
+}
+#endif
+
 extern int devmem_is_allowed(unsigned long pfn);
 
 #endif /* __KERNEL__ */