Message ID | c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | Add and use memcpy_toio_64() | expand |
On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote: > From: Jason Gunthorpe <jgg@nvidia.com> > > The kernel supports write combining IO memory which is commonly used to > generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism > is pretty tolerant and a simple C loop will suffice to generate a 64 byte > TLP. > > However modern ARM64 CPUs are quite sensitive and a compiler generated > loop is not enough to reliably generate a 64 byte TLP. Especially given > the ARM64 issue that writel() does not codegen anything other than "[xN]" > as the address calculation. > > These newer CPUs require an orderly consecutive block of stores to work > reliably. This is best done with four STP integer instructions (perhaps > ST64B in future), or a single ST4 vector instruction. > > Provide a new generic function memcpy_toio_64() which should reliably > generate the needed instructions for the architecture, assuming address > alignment. As the usual need for this operation is performance sensitive a > fast inline implementation is preferred. There is *no* architectural sequence that is guaranteed to reliably generate a 64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU *always* merged adjacent stores, we can take an interrupt mid-sequence that would prevent that). What's the actual requirement here? Is this just for performance? Mark. > Implement an optimized version on ARM that is a block of 4 STP > instructions. > > The generic implementation is just a simple loop. x86-64 (clang 16) > compiles this into an unrolled loop of 16 movq pairs. > > Cc: Arnd Bergmann <arnd@arndb.de> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > Cc: linux-arch@vger.kernel.org > Cc: linux-arm-kernel@lists.infradead.org > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > arch/arm64/include/asm/io.h | 20 ++++++++++++++++++++ > include/asm-generic/io.h | 30 ++++++++++++++++++++++++++++++ > 2 files changed, 50 insertions(+) > > diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h > index 3b694511b98f..73ab91913790 100644 > --- a/arch/arm64/include/asm/io.h > +++ b/arch/arm64/include/asm/io.h > @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t); > #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) > #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) > > +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from) > +{ > + const u64 *from64 = from; > + > + /* > + * Newer ARM core have sensitive write combining buffers, it is > + * important that the stores be contiguous blocks of store instructions. > + * Normal memcpy does not work reliably. > + */ > + asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" > + "stp %x2, %x3, [%8, #16 * 1]\n" > + "stp %x4, %x5, [%8, #16 * 2]\n" > + "stp %x6, %x7, [%8, #16 * 3]\n" > + : > + : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]), > + "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]), > + "rZ"(from64[6]), "rZ"(from64[7]), "r"(to)); > +} > +#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from) > + > /* > * I/O memory mapping functions. > */ > diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h > index bac63e874c7b..2d6d60ed2128 100644 > --- a/include/asm-generic/io.h > +++ b/include/asm-generic/io.h > @@ -1202,6 +1202,36 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, > } > #endif > > +#ifndef memcpy_toio_64 > +#define memcpy_toio_64 memcpy_toio_64 > +/** > + * memcpy_toio_64 Copy 64 bytes of data into I/O memory > + * @dst: The (I/O memory) destination for the copy > + * @src: The (RAM) source for the data > + * @count: The number of bytes to copy > + * > + * dst and src must be aligned to 8 bytes. This operation copies exactly 64 > + * bytes. It is intended to be used for write combining IO memory. The > + * architecture should provide an implementation that has a high chance of > + * generating a single combined transaction. > + */ > +static inline void memcpy_toio_64(volatile void __iomem *addr, > + const void *buffer) > +{ > + unsigned int i = 0; > + > +#if BITS_PER_LONG == 64 > + for (; i != 8; i++) > + __raw_writeq(((const u64 *)buffer)[i], > + ((u64 __iomem *)addr) + i); > +#else > + for (; i != 16; i++) > + __raw_writel(((const u32 *)buffer)[i], > + ((u32 __iomem *)addr) + i); > +#endif > +} > +#endif > + > extern int devmem_is_allowed(unsigned long pfn); > > #endif /* __KERNEL__ */ > -- > 2.42.0 > >
On Fri, Nov 24, 2023 at 10:16:15AM +0000, Mark Rutland wrote: > On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > > > The kernel supports write combining IO memory which is commonly used to > > generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism > > is pretty tolerant and a simple C loop will suffice to generate a 64 byte > > TLP. > > > > However modern ARM64 CPUs are quite sensitive and a compiler generated > > loop is not enough to reliably generate a 64 byte TLP. Especially given > > the ARM64 issue that writel() does not codegen anything other than "[xN]" > > as the address calculation. > > > > These newer CPUs require an orderly consecutive block of stores to work > > reliably. This is best done with four STP integer instructions (perhaps > > ST64B in future), or a single ST4 vector instruction. > > > > Provide a new generic function memcpy_toio_64() which should reliably > > generate the needed instructions for the architecture, assuming address > > alignment. As the usual need for this operation is performance sensitive a > > fast inline implementation is preferred. > > There is *no* architectural sequence that is guaranteed to reliably generate a > 64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU > *always* merged adjacent stores, we can take an interrupt mid-sequence that > would prevent that). WC is not guaranteed on any arch, that is well known. The HW has means to handle fragmented TLPs, it just hurts performance when it happens. "reliable" here means we'd like to see something like a > 90% chance of the large TLP instead of the < 1% chance with the C loop. Future ARM CPUs have the ST64B instruction which does provide the architectural guarantee, and x86 has a similar guaranteed instruction now too. > What's the actual requirement here? Is this just for performance? Yes, just performance. Jason
On 23/11/2023 7:04 pm, Leon Romanovsky wrote: > From: Jason Gunthorpe <jgg@nvidia.com> > > The kernel supports write combining IO memory which is commonly used to > generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism > is pretty tolerant and a simple C loop will suffice to generate a 64 byte > TLP. > > However modern ARM64 CPUs are quite sensitive and a compiler generated > loop is not enough to reliably generate a 64 byte TLP. Especially given > the ARM64 issue that writel() does not codegen anything other than "[xN]" > as the address calculation. > > These newer CPUs require an orderly consecutive block of stores to work > reliably. This is best done with four STP integer instructions (perhaps > ST64B in future), or a single ST4 vector instruction. > > Provide a new generic function memcpy_toio_64() which should reliably > generate the needed instructions for the architecture, assuming address > alignment. As the usual need for this operation is performance sensitive a > fast inline implementation is preferred. > > Implement an optimized version on ARM that is a block of 4 STP > instructions. > > The generic implementation is just a simple loop. x86-64 (clang 16) > compiles this into an unrolled loop of 16 movq pairs. > > Cc: Arnd Bergmann <arnd@arndb.de> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > Cc: linux-arch@vger.kernel.org > Cc: linux-arm-kernel@lists.infradead.org > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > arch/arm64/include/asm/io.h | 20 ++++++++++++++++++++ > include/asm-generic/io.h | 30 ++++++++++++++++++++++++++++++ > 2 files changed, 50 insertions(+) > > diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h > index 3b694511b98f..73ab91913790 100644 > --- a/arch/arm64/include/asm/io.h > +++ b/arch/arm64/include/asm/io.h > @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t); > #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) > #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) > > +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from) > +{ > + const u64 *from64 = from; > + > + /* > + * Newer ARM core have sensitive write combining buffers, it is > + * important that the stores be contiguous blocks of store instructions. > + * Normal memcpy does not work reliably. > + */ > + asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" > + "stp %x2, %x3, [%8, #16 * 1]\n" > + "stp %x4, %x5, [%8, #16 * 2]\n" > + "stp %x6, %x7, [%8, #16 * 3]\n" > + : > + : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]), > + "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]), > + "rZ"(from64[6]), "rZ"(from64[7]), "r"(to)); Is this correct for big-endian? LDP/STP are kinda tricksy in that regard. Thanks, Robin. > +} > +#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from) > + > /* > * I/O memory mapping functions. > */ > diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h > index bac63e874c7b..2d6d60ed2128 100644 > --- a/include/asm-generic/io.h > +++ b/include/asm-generic/io.h > @@ -1202,6 +1202,36 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, > } > #endif > > +#ifndef memcpy_toio_64 > +#define memcpy_toio_64 memcpy_toio_64 > +/** > + * memcpy_toio_64 Copy 64 bytes of data into I/O memory > + * @dst: The (I/O memory) destination for the copy > + * @src: The (RAM) source for the data > + * @count: The number of bytes to copy > + * > + * dst and src must be aligned to 8 bytes. This operation copies exactly 64 > + * bytes. It is intended to be used for write combining IO memory. The > + * architecture should provide an implementation that has a high chance of > + * generating a single combined transaction. > + */ > +static inline void memcpy_toio_64(volatile void __iomem *addr, > + const void *buffer) > +{ > + unsigned int i = 0; > + > +#if BITS_PER_LONG == 64 > + for (; i != 8; i++) > + __raw_writeq(((const u64 *)buffer)[i], > + ((u64 __iomem *)addr) + i); > +#else > + for (; i != 16; i++) > + __raw_writel(((const u32 *)buffer)[i], > + ((u32 __iomem *)addr) + i); > +#endif > +} > +#endif > + > extern int devmem_is_allowed(unsigned long pfn); > > #endif /* __KERNEL__ */
On Fri, Nov 24, 2023 at 12:58:11PM +0000, Robin Murphy wrote: > > diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h > > index 3b694511b98f..73ab91913790 100644 > > --- a/arch/arm64/include/asm/io.h > > +++ b/arch/arm64/include/asm/io.h > > @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t); > > #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) > > #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) > > +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from) > > +{ > > + const u64 *from64 = from; > > + > > + /* > > + * Newer ARM core have sensitive write combining buffers, it is > > + * important that the stores be contiguous blocks of store instructions. > > + * Normal memcpy does not work reliably. > > + */ > > + asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" > > + "stp %x2, %x3, [%8, #16 * 1]\n" > > + "stp %x4, %x5, [%8, #16 * 2]\n" > > + "stp %x6, %x7, [%8, #16 * 3]\n" > > + : > > + : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]), > > + "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]), > > + "rZ"(from64[6]), "rZ"(from64[7]), "r"(to)); > > Is this correct for big-endian? LDP/STP are kinda tricksy in that regard. Uh.. I didn't think about it at all.. By no means do I have any skill reading the ARM documents, but I think it is OK, it says: Mem[address, dbytes, AccType_NORMAL] = data1; Mem[address+dbytes, dbytes, AccType_NORMAL] = data2; So I understand that as Mem[%8, #16 * 0, 8, AccType_NORMAL] = from64[0] Mem[%8, #16 * 0 + 1 , 8, AccType_NORMAL] = from64[1] Mem[%8, #16 * 1, 8, AccType_NORMAL] = from64[2] Mem[%8, #16 * 1 + 1, 8, AccType_NORMAL] = from64[3] .. Which is the same on BE/LE? But I don't know the pitfall to watch for here. This is memcpy so we don't have to swap, the order of the bits in the register doesn't matter. Thanks, Jason
On Thu, 2023-11-23 at 21:04 +0200, Leon Romanovsky wrote: > From: Jason Gunthorpe <jgg@nvidia.com> > > The kernel supports write combining IO memory which is commonly used to > generate 64 byte TLPs in a PCIe environment. On many CPUs this mechanism > is pretty tolerant and a simple C loop will suffice to generate a 64 byte > TLP. > > However modern ARM64 CPUs are quite sensitive and a compiler generated > loop is not enough to reliably generate a 64 byte TLP. Especially given > the ARM64 issue that writel() does not codegen anything other than "[xN]" > as the address calculation. > > These newer CPUs require an orderly consecutive block of stores to work > reliably. This is best done with four STP integer instructions (perhaps > ST64B in future), or a single ST4 vector instruction. > > Provide a new generic function memcpy_toio_64() which should reliably > generate the needed instructions for the architecture, assuming address > alignment. As the usual need for this operation is performance sensitive a > fast inline implementation is preferred. > > Implement an optimized version on ARM that is a block of 4 STP > instructions. > > The generic implementation is just a simple loop. x86-64 (clang 16) > compiles this into an unrolled loop of 16 movq pairs. > > Cc: Arnd Bergmann <arnd@arndb.de> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will@kernel.org> > Cc: linux-arch@vger.kernel.org > Cc: linux-arm-kernel@lists.infradead.org > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- ---8<--- > +#ifndef memcpy_toio_64 > +#define memcpy_toio_64 memcpy_toio_64 > +/** > + * memcpy_toio_64 Copy 64 bytes of data into I/O memory > + * @dst: The (I/O memory) destination for the copy > + * @src: The (RAM) source for the data > + * @count: The number of bytes to copy > + * > + * dst and src must be aligned to 8 bytes. This operation copies exactly 64 > + * bytes. It is intended to be used for write combining IO memory. The > + * architecture should provide an implementation that has a high chance of > + * generating a single combined transaction. > + */ > +static inline void memcpy_toio_64(volatile void __iomem *addr, > + const void *buffer) > +{ > + unsigned int i = 0; > + > +#if BITS_PER_LONG == 64 > + for (; i != 8; i++) > + __raw_writeq(((const u64 *)buffer)[i], > + ((u64 __iomem *)addr) + i); > +#else > + for (; i != 16; i++) > + __raw_writel(((const u32 *)buffer)[i], > + ((u32 __iomem *)addr) + i); > +#endif What's the reasoning behind not using the existing memcpy_toio() here? For s390 the above generic variant would do 8 of our special PCI store instructions while memcpy_toio() is defined to zpci_memcpy_toio() which can do the same as a single PCI store block instruction. Now of course we could provide our own memcpy_toio_64() but that would end up the same as just doing memcpy_toio(addr, buffer, 64) here. > +} > +#endif > + > extern int devmem_is_allowed(unsigned long pfn); > > #endif /* __KERNEL__ */
On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote: > What's the reasoning behind not using the existing memcpy_toio() > here? Going forward CPUs are implementing an instruction to do a 64 byte aligned store, this is a wrapper for exactly that operation. memcpy_toio() is much more general, it allows unaligned buffers and non-multiples of 64. Adapting the general version to generate the optimized version in the cases it can is complex and has a codegen penalty.. > For s390 the above generic variant would do 8 of our special PCI store > instructions while memcpy_toio() is defined to zpci_memcpy_toio() which > can do the same as a single PCI store block instruction. Now of course > we could provide our own memcpy_toio_64() but that would end up the > same as just doing memcpy_toio(addr, buffer, 64) here. This is probably better? Jason
On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote: > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote: > > > What's the reasoning behind not using the existing memcpy_toio() > > here? > > Going forward CPUs are implementing an instruction to do a 64 byte > aligned store, this is a wrapper for exactly that operation. > > memcpy_toio() is much more general, it allows unaligned buffers and > non-multiples of 64. Adapting the general version to generate the > optimized version in the cases it can is complex and has a codegen > penalty.. I think you misunderstood me. I understand why you want a separate memcpy_toio_64(). I just wonder if its generic implementation shouldn't just be a define or inline wrapper for memcpy_toio(addr, buffer, 64). For s390 that would already result in a single PCI store block which for us is much much better than 8 consecutive __raw_writeq(). Our zpci_memcpy_toio() still has some extra code to ensure alignment and break it up in supported sizes that we could get rid of with our own memcpy_toio_64() of course. I suspect though that since it's all inline functions the compiler seeing the constant 64 might already eliminate some of the extra code. Also seeing the second patch of course that would no longer really test for write combining for us which we can also do but I think that's okay and you're probably going to use memcpy_toio_64() in more places and there we really want the PCI store block. Thanks, Niklas
On Fri, 2023-11-24 at 15:48 +0100, Niklas Schnelle wrote: > On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote: > > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote: > > ---8<--- > > Also seeing the second patch of course that would no longer really test > for write combining for us which we can also do but I think that's okay > and you're probably going to use memcpy_toio_64() in more places and > there we really want the PCI store block. Disregard this part I didn't properly align commit message and code. I thought you replaced the check with memcpy_toio_64(). Thanks, Niklas
On Fri, Nov 24, 2023 at 03:48:22PM +0100, Niklas Schnelle wrote: > On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote: > > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote: > > > > > What's the reasoning behind not using the existing memcpy_toio() > > > here? > > > > Going forward CPUs are implementing an instruction to do a 64 byte > > aligned store, this is a wrapper for exactly that operation. > > > > memcpy_toio() is much more general, it allows unaligned buffers and > > non-multiples of 64. Adapting the general version to generate the > > optimized version in the cases it can is complex and has a codegen > > penalty.. > > I think you misunderstood me. I understand why you want a separate > memcpy_toio_64(). I just wonder if its generic implementation shouldn't > just be a define or inline wrapper for memcpy_toio(addr, buffer, 64). Oh, yes, I totally did. I'm worried that x86 will less reliably generate write combining with it's memcpy_toio implemention. It codegens byte copies for that function :( > Also seeing the second patch of course that would no longer really test > for write combining for us which we can also do but I think that's okay > and you're probably going to use memcpy_toio_64() in more places and > there we really want the PCI store block. Right now we don't have in-kernel performance use cases for write combining for mlx5. Userspace uses the WC and we already have the special 390 instructions for batching in rdma-core already, IIRC. So it would be appropriate for s390 to use a consistent path. Jason
On 24/11/2023 1:45 pm, Jason Gunthorpe wrote: > On Fri, Nov 24, 2023 at 12:58:11PM +0000, Robin Murphy wrote: >>> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h >>> index 3b694511b98f..73ab91913790 100644 >>> --- a/arch/arm64/include/asm/io.h >>> +++ b/arch/arm64/include/asm/io.h >>> @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t); >>> #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) >>> #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) >>> +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from) >>> +{ >>> + const u64 *from64 = from; >>> + >>> + /* >>> + * Newer ARM core have sensitive write combining buffers, it is >>> + * important that the stores be contiguous blocks of store instructions. >>> + * Normal memcpy does not work reliably. >>> + */ >>> + asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" >>> + "stp %x2, %x3, [%8, #16 * 1]\n" >>> + "stp %x4, %x5, [%8, #16 * 2]\n" >>> + "stp %x6, %x7, [%8, #16 * 3]\n" >>> + : >>> + : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]), >>> + "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]), >>> + "rZ"(from64[6]), "rZ"(from64[7]), "r"(to)); >> >> Is this correct for big-endian? LDP/STP are kinda tricksy in that regard. > > Uh.. I didn't think about it at all.. > > By no means do I have any skill reading the ARM documents, but I think > it is OK, it says: > > Mem[address, dbytes, AccType_NORMAL] = data1; > Mem[address+dbytes, dbytes, AccType_NORMAL] = data2; > > So I understand that as > > Mem[%8, #16 * 0, 8, AccType_NORMAL] = from64[0] > Mem[%8, #16 * 0 + 1 , 8, AccType_NORMAL] = from64[1] > Mem[%8, #16 * 1, 8, AccType_NORMAL] = from64[2] > Mem[%8, #16 * 1 + 1, 8, AccType_NORMAL] = from64[3] > .. > > Which is the same on BE/LE? > > But I don't know the pitfall to watch for here. This is memcpy so we > don't have to swap, the order of the bits in the register doesn't > matter. Indeed you're right - all the way back to Armv7 LDRD/STRD, I always get caught out by remembering the path which does an endian-dependent swap of the target registers, but forgetting that that's there to *counteract* the byteswap in Mem[] itself. Cheers, Robin.
On Fri, 2023-11-24 at 10:55 -0400, Jason Gunthorpe wrote: > On Fri, Nov 24, 2023 at 03:48:22PM +0100, Niklas Schnelle wrote: > > On Fri, 2023-11-24 at 10:20 -0400, Jason Gunthorpe wrote: > > > On Fri, Nov 24, 2023 at 03:10:29PM +0100, Niklas Schnelle wrote: > > > > > > > What's the reasoning behind not using the existing memcpy_toio() > > > > here? > > > > > > Going forward CPUs are implementing an instruction to do a 64 byte > > > aligned store, this is a wrapper for exactly that operation. > > > > > > memcpy_toio() is much more general, it allows unaligned buffers and > > > non-multiples of 64. Adapting the general version to generate the > > > optimized version in the cases it can is complex and has a codegen > > > penalty.. > > > > I think you misunderstood me. I understand why you want a separate > > memcpy_toio_64(). I just wonder if its generic implementation shouldn't > > just be a define or inline wrapper for memcpy_toio(addr, buffer, 64). > > Oh, yes, I totally did. > > I'm worried that x86 will less reliably generate write combining with > it's memcpy_toio implemention. It codegens byte copies for that > function :( Oh ok I see what you mean. > > > Also seeing the second patch of course that would no longer really test > > for write combining for us which we can also do but I think that's okay > > and you're probably going to use memcpy_toio_64() in more places and > > there we really want the PCI store block. > > Right now we don't have in-kernel performance use cases for write > combining for mlx5. Is the code in patch 2 performance critical? > > Userspace uses the WC and we already have the special 390 instructions > for batching in rdma-core already, IIRC. Yes, I added that support to rdma-core :-) > > So it would be appropriate for s390 to use a consistent path. > > Jason This should be as easy as adding #define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64) to arch/s390/include/asm/io.h. I'm wondering if we should do that as part of this series. It's not as good as a special case but probably better than the existing loop. I don't think we have any existing in-kernel users of memcpy_toio() on s390 so far though so I'd like to give this some extra testing. Could you share instructions on how to exercise the code path of patch 2 on a ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA? Thanks, Niklas
On Fri, Nov 24, 2023 at 04:59:38PM +0100, Niklas Schnelle wrote: > This should be as easy as adding > > #define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64) > > to arch/s390/include/asm/io.h. I'm wondering if we should do that as > part of this series. It's not as good as a special case but probably > better than the existing loop. Makes sense > I don't think we have any existing in-kernel users of memcpy_toio() on > s390 so far though so I'd like to give this some extra testing. Could > you share instructions on how to exercise the code path of patch 2 on a > ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA? Simply boot and look at pr_debug from mlx5 to see if writecombining is on or off - you want to see on. Thanks, Jason
On Fri, Nov 24, 2023 at 08:23:52AM -0400, Jason Gunthorpe wrote: > On Fri, Nov 24, 2023 at 10:16:15AM +0000, Mark Rutland wrote: > > On Thu, Nov 23, 2023 at 09:04:31PM +0200, Leon Romanovsky wrote: > > > From: Jason Gunthorpe <jgg@nvidia.com> [...] > > > Provide a new generic function memcpy_toio_64() which should reliably > > > generate the needed instructions for the architecture, assuming address > > > alignment. As the usual need for this operation is performance sensitive a > > > fast inline implementation is preferred. > > > > There is *no* architectural sequence that is guaranteed to reliably generate a > > 64-byte TLP, and this sequence won't guarnatee that (e.g. even if the CPU > > *always* merged adjacent stores, we can take an interrupt mid-sequence that > > would prevent that). > > WC is not guaranteed on any arch, that is well known. > > The HW has means to handle fragmented TLPs, it just hurts performance > when it happens. "reliable" here means we'd like to see something like > a > 90% chance of the large TLP instead of the < 1% chance with the C > loop. > > Future ARM CPUs have the ST64B instruction which does provide the > architectural guarantee, and x86 has a similar guaranteed instruction > now too. > > > What's the actual requirement here? Is this just for performance? > > Yes, just performance. Do you have any rough numbers (percentage)? It's highly microarchitecture-dependent until we get the ST64B instruction. More of a bike-shedding, I wonder whether the __iowrite*_copy() semantics are better suited for what you need in terms of ordering (not that mempcy_toio() to Normal NC memory gives us any ordering).
On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote: > > > What's the actual requirement here? Is this just for performance? > > > > Yes, just performance. > > Do you have any rough numbers (percentage)? It's highly > microarchitecture-dependent until we get the ST64B instruction. The current C code is an open coded store loop. The kernel does 250 tries and measures if any one of them succeeds to combine. On x86, and older ARM cores we see that 100% of the time at least 1 in 250 tries succeeds. With the new CPU cores we see more like 9 out of 10 time there are 0 in 250 tries that succeed. Ie we can go thousands of times without seeing any successful WC combine. The STP block brings it back to 100% of the time 1 in 250 succeed. This is a statistical lower bound, based on what we see performance wise it almost always works. However, in userspace we have long been using ST4 to create a single-instruction 64 byte store on ARM64. As far as I know this is highly reliable. I don't have direct data on the STP configuration. > More of a bike-shedding, I wonder whether the __iowrite*_copy() > semantics are better suited for what you need in terms of ordering (not > that mempcy_toio() to Normal NC memory gives us any ordering). I have the same remark I gave to Niklas, this does not require alignment or an exact 64 byte size. It was clearly made to support WC stores since Pathscale did it, but I don't see this mapping nicely to the future 64 byte store instructions are we getting. We could name it __iowrite512_copy() if that makes more sense? Thanks, Jason
On Fri, 2023-11-24 at 12:06 -0400, Jason Gunthorpe wrote: > On Fri, Nov 24, 2023 at 04:59:38PM +0100, Niklas Schnelle wrote: > > > This should be as easy as adding > > > > #define memcpy_toio_64(to, from) zpci_memcpy_toio(to, from, 64) > > > > to arch/s390/include/asm/io.h. I'm wondering if we should do that as > > part of this series. It's not as good as a special case but probably > > better than the existing loop. > > Makes sense Ok, I overlooked the obvious. Let's make that: #define memcpy_toio_64(dst, src) zpci_write_block(dst, src, 64) > > > I don't think we have any existing in-kernel users of memcpy_toio() on > > s390 so far though so I'd like to give this some extra testing. Could > > you share instructions on how to exercise the code path of patch 2 on a > > ConnectX-5 or 6? Is this exercised e.g. when using NVMe-oF RDMA? > > Simply boot and look at pr_debug from mlx5 to see if writecombining is > on or off - you want to see on. > > Thanks, > Jason With the above zpci_write_block(dst, src, 64) we get a PCI store block without any extra alignment treatment i.e. exactly what we want for memcpy_toio_64(). If the alignment is wrong the PCI store block instruction will fail the PCI function will be isolated and we log an error so I don't see a need for checks there either. On an aside it looks like our zpci_memcpy_toio() is wrongly looking for tighter than 8 byte alignment on the source address and would issue a series of 8 stores. Still looking into that. I also tested this with our only privileged (kernel only) PCI stores and that works too. Also it turns out the writeq() loop we had so far does not produce the needed 64 byte TLP on s390 either so this actually makes us newly pass this test. Thanks, Niklas
On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote: > Also it turns out the writeq() loop we had so far does not produce the > needed 64 byte TLP on s390 either so this actually makes us newly pass > this test. Ooh, that is a significant problem - the userspace code won't be used unless this test passes. So we need this on S390 to fix a bug as well :\ Thanks, Jason
On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote: > On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote: > > > Also it turns out the writeq() loop we had so far does not produce the > > needed 64 byte TLP on s390 either so this actually makes us newly pass > > this test. > > Ooh, that is a significant problem - the userspace code won't be used > unless this test passes. So we need this on S390 to fix a bug as well > :\ > > Thanks, > Jason > Yes ;-( In the meantime I also found out that zpci_write_block(dst, src, 64) is not correct for all cases because the PCI store block requires the (pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to be double word aligned. In rdma-core this is neatly handled by the get_max_write_size() but the kernel variant of that zpci_get_max_write_size() isn't just a lot harder to read and likely less efficient but also too strict thus breaking the 64 byte write up needlessly. In total we have 5 conditions for the PCI block stores: 1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space 2. The length must not exceed the maximum write size 3. The length must be a multiple of 8 4. The src needs to be double word aligned 5. The dst needs to be double word aligned So I think a good solution would be to improve zpci_memcpy_toio() with an enhanced zpci_get_max_write_size() based on the code in rdma-core extended to also handle the alignment and length restrictions which in rdma-core are already assumed (see comment there). Then we can use zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the compiler to optimize out the unnecessary checks (2, 3 and possibly 4, 5). So yeah this is getting a bit more complicated than originally thought. Let me cook up a patch. Thanks, Niklas
On Mon, Nov 27, 2023 at 09:45:05AM -0400, Jason Gunthorpe wrote: > On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote: > > > > What's the actual requirement here? Is this just for performance? > > > > > > Yes, just performance. > > > > Do you have any rough numbers (percentage)? It's highly > > microarchitecture-dependent until we get the ST64B instruction. > > The current C code is an open coded store loop. The kernel does 250 > tries and measures if any one of them succeeds to combine. > > On x86, and older ARM cores we see that 100% of the time at least 1 in > 250 tries succeeds. > > With the new CPU cores we see more like 9 out of 10 time there are 0 > in 250 tries that succeed. Ie we can go thousands of times without > seeing any successful WC combine. > > The STP block brings it back to 100% of the time 1 in 250 succeed. That's a bit confusing to me: 1 in 250 succeeding is still pretty rare. But I guess what your benchmark says is that at least 1 succeeded to write-combine and it might as well be all 250 tries. It's more interesting to see if there's actual performance gain in real-world traffic, not just some artificial benchmark (I may have misunderstood your numbers above). > However, in userspace we have long been using ST4 to create a > single-instruction 64 byte store on ARM64. As far as I know this is > highly reliable. I don't have direct data on the STP configuration. Personally I'd optimise the mempcy_toio() arm64 implementation to do STPs if the alignment is right (like we do for classic memcpy()). There's a slight overhead for alignment checking but I suspect it would be lost as long as you can get the write-combining. Not sure whether the interspersed reads in memcpy_toio() would somehow prevent the write-combining. A memcpy_toio_64() can use the new ST64B instruction if available or fall back to memcpy_toio() on arm64. It should also have the DGH instruction (io_stop_wc()) but only if falling back to classic memcpy_toio(). We don't need DGH with ST64B. > > More of a bike-shedding, I wonder whether the __iowrite*_copy() > > semantics are better suited for what you need in terms of ordering (not > > that mempcy_toio() to Normal NC memory gives us any ordering). > > I have the same remark I gave to Niklas, this does not require > alignment or an exact 64 byte size. It was clearly made to support WC > stores since Pathscale did it, but I don't see this mapping nicely to > the future 64 byte store instructions are we getting. As above, I'd suggest just using memcpy_toio() as a fallback if ST64B is not available. > We could name it __iowrite512_copy() if that makes more sense? I've been thinking at the __iowrite*_copy() and these also take a 'count' argument. I assume in this instance we don't really need one, so it's just additional overhead (more like API clutter, I doubt it makes much difference for performance). I'd say just stick to the mempcy_toio_64() but have the io_stop_wc() inside this function as we won't need it with ST64B. Well, unless someone has a better name for this function.
On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote: > On Mon, Nov 27, 2023 at 09:45:05AM -0400, Jason Gunthorpe wrote: > > On Mon, Nov 27, 2023 at 12:42:41PM +0000, Catalin Marinas wrote: > > > > > What's the actual requirement here? Is this just for performance? > > > > > > > > Yes, just performance. > > > > > > Do you have any rough numbers (percentage)? It's highly > > > microarchitecture-dependent until we get the ST64B instruction. > > > > The current C code is an open coded store loop. The kernel does 250 > > tries and measures if any one of them succeeds to combine. > > > > On x86, and older ARM cores we see that 100% of the time at least 1 in > > 250 tries succeeds. > > > > With the new CPU cores we see more like 9 out of 10 time there are 0 > > in 250 tries that succeed. Ie we can go thousands of times without > > seeing any successful WC combine. > > > > The STP block brings it back to 100% of the time 1 in 250 succeed. > > That's a bit confusing to me: 1 in 250 succeeding is still pretty rare. > But I guess what your benchmark says is that at least 1 succeeded to > write-combine and it might as well be all 250 tries. It's more > interesting to see if there's actual performance gain in real-world > traffic, not just some artificial benchmark (I may have misunderstood > your numbers above). Yes, I just don't have better data available to say that 250/250 succeeded, but we expect that is the case. We have now something like 20 years experiance with write combining performance on x86 systems. It brings real world gains in real word HPC applications. Indeed, the reason this even came up was because one of our existing applications was performing unexpectedly badly on these ARM64 servers. We even have data showing that having the CPU do all the write combining steps and then fail to get writecombining is notably slower than just assuming no write combining. It is why we go through the trouble to test the physical HW. > > However, in userspace we have long been using ST4 to create a > > single-instruction 64 byte store on ARM64. As far as I know this is > > highly reliable. I don't have direct data on the STP configuration. > > Personally I'd optimise the mempcy_toio() arm64 implementation to do > STPs if the alignment is right (like we do for classic memcpy()). > There's a slight overhead for alignment checking but I suspect it would > be lost as long as you can get the write-combining. Not sure whether the > interspersed reads in memcpy_toio() would somehow prevent the > write-combining. I understand on these new CPUs anything other than a block of contiguous STPs is risky to break the WC. I was told we should not have any loads between them. So we can't just update memcpy_toio to optimize a 128 bit store variant like memcpy might. We actually need a special case just for 64 byte. IMHO it does not look good as the chance any existing callers can use this optmized 64B path is probably small, but everyone has to pay the costs to check for it. I also would not do this on x86 - Pathscale apparently decided the needed special __iowrite*_copy() things to actually make this work on xome x86 systems - I'm very leary to change x86 stuff away from the 64 bit copy loopw we know works already on x86. IMHO encoding the alignment expectation in the API is best, especially since this is typically a performance path. > A memcpy_toio_64() can use the new ST64B instruction if available or > fall back to memcpy_toio() on arm64. It should also have the DGH > instruction (io_stop_wc()) but only if falling back to classic > memcpy_toio(). We don't need DGH with ST64B. I'm told it is problematic, something about ST64B not working with NORMAL_NC. We could fold the DGH into the helper though. IHMO I'd like to see how ST64B actually gets implemented before doing that. If the note about the NORMAL_NC is true then we need a lot more infrastructure to actually use it. Also in a future ST64B world we are going to see HW start relying on large TLPs, not just being an optional performance win. To my mind it makes more sense that there is an API that guarantees a large TLP or oops. We really don't want an automatic fallback to memcpy. Jason
On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote: > On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote: > > Personally I'd optimise the mempcy_toio() arm64 implementation to do > > STPs if the alignment is right (like we do for classic memcpy()). > > There's a slight overhead for alignment checking but I suspect it would > > be lost as long as you can get the write-combining. Not sure whether the > > interspersed reads in memcpy_toio() would somehow prevent the > > write-combining. > > I understand on these new CPUs anything other than a block of > contiguous STPs is risky to break the WC. I was told we should not > have any loads between them. Classic memcpy does similar tricks with four LDPs in a row before starting to issue the STPs (though there are new LDPs for the next data in-between). But that was tuned for cacheable memory, not sure if something similar would behave well on Normal-NC memory. > So we can't just update memcpy_toio to optimize a 128 bit store > variant like memcpy might. We actually need a special case just for 64 > byte. > > IMHO it does not look good as the chance any existing callers can use > this optmized 64B path is probably small, but everyone has to pay the > costs to check for it. I don't think the cost of the check is noticeable and there are several places where the copy goes beyond 64 bytes. It may be worth a try. > I also would not do this on x86 - Pathscale apparently decided the > needed special __iowrite*_copy() things to actually make this work on > xome x86 systems - I'm very leary to change x86 stuff away from the 64 > bit copy loopw we know works already on x86. > > IMHO encoding the alignment expectation in the API is best, especially > since this is typically a performance path. The slight downside of a __iowrite512_copy() API is that, if we follow the 32/64 semantics, it would need the source buffer aligned. Maybe we can document it to 64-bit alignment only rather than 512. > > A memcpy_toio_64() can use the new ST64B instruction if available or > > fall back to memcpy_toio() on arm64. It should also have the DGH > > instruction (io_stop_wc()) but only if falling back to classic > > memcpy_toio(). We don't need DGH with ST64B. > > I'm told it is problematic, something about ST64B not working with > NORMAL_NC. Last time I checked it was meant to work on Normal-NC (not cacheable though). That's on page 285 of the Arm ARM J.a. > Also in a future ST64B world we are going to see HW start relying on > large TLPs, not just being an optional performance win. To my mind it > makes more sense that there is an API that guarantees a large TLP or > oops. We really don't want an automatic fallback to memcpy. We can't guarantee those large TLPs without the ST64B instructions, so it needs to be more of a QoS aspect than correctness.
On Tue, Dec 05, 2023 at 05:21:27PM +0000, Catalin Marinas wrote: > On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote: > > On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote: > > > Personally I'd optimise the mempcy_toio() arm64 implementation to do > > > STPs if the alignment is right (like we do for classic memcpy()). > > > There's a slight overhead for alignment checking but I suspect it would > > > be lost as long as you can get the write-combining. Not sure whether the > > > interspersed reads in memcpy_toio() would somehow prevent the > > > write-combining. > > > > I understand on these new CPUs anything other than a block of > > contiguous STPs is risky to break the WC. I was told we should not > > have any loads between them. > > Classic memcpy does similar tricks with four LDPs in a row before > starting to issue the STPs (though there are new LDPs for the next > data in-between). But that was tuned for cacheable memory, not sure > if something similar would behave well on Normal-NC memory. Can we conclude a direction here? 1) I don't want to mess with x86 so we keep a dedicated API Are we agreed to call it __iowrite512_copy() and note its special alignment limitation? 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and implement some quad STP optimization for this case? 3) A future ST64B and the x86 version would be put under __iowrite512_copy()? 4) A future ST64B would come with some kind of 'must do 64b copy or oops' to support the future HW that must have this instruction? eg we already see on Intel that HW must use ENQCMD and nothing else. Agreed? Niklas, is this OK for S390 too? > > I'm told it is problematic, something about ST64B not working with > > NORMAL_NC. > > Last time I checked it was meant to work on Normal-NC (not cacheable > though). That's on page 285 of the Arm ARM J.a. This is a relief to hear! Thanks, Jason
On Tue, Dec 05, 2023 at 01:51:27PM -0400, Jason Gunthorpe wrote: > On Tue, Dec 05, 2023 at 05:21:27PM +0000, Catalin Marinas wrote: > > On Mon, Dec 04, 2023 at 02:23:30PM -0400, Jason Gunthorpe wrote: > > > On Mon, Dec 04, 2023 at 05:31:47PM +0000, Catalin Marinas wrote: > > > > Personally I'd optimise the mempcy_toio() arm64 implementation to do > > > > STPs if the alignment is right (like we do for classic memcpy()). > > > > There's a slight overhead for alignment checking but I suspect it would > > > > be lost as long as you can get the write-combining. Not sure whether the > > > > interspersed reads in memcpy_toio() would somehow prevent the > > > > write-combining. > > > > > > I understand on these new CPUs anything other than a block of > > > contiguous STPs is risky to break the WC. I was told we should not > > > have any loads between them. > > > > Classic memcpy does similar tricks with four LDPs in a row before > > starting to issue the STPs (though there are new LDPs for the next > > data in-between). But that was tuned for cacheable memory, not sure > > if something similar would behave well on Normal-NC memory. > > Can we conclude a direction here? > > 1) I don't want to mess with x86 so we keep a dedicated API > Are we agreed to call it __iowrite512_copy() and note its special > alignment limitation? Sounds fine to me. > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and > implement some quad STP optimization for this case? We can have the generic __iowrite512_copy() do memcpy_toio() and have the arm64 implement an optimised version. What I'm not entirely sure of is the DGH (whatever the io_* barrier name is). I'd put it in the same __iowrite512_copy() function and remove it from the driver code. Otherwise when ST64B is added, we have an unnecessary DGH in the driver. If this does not match the other __iowrite*_copy() semantics, we can come up with another name. But start with this for now and document the function. > 3) A future ST64B and the x86 version would be put under > __iowrite512_copy()? Yes, arch-specific override. > 4) A future ST64B would come with some kind of 'must do 64b copy or > oops' to support the future HW that must have this instruction? eg > we already see on Intel that HW must use ENQCMD and nothing else. I don't agree with the oops part. We can't guarantee it on arm64, ST64B I think is optional in the architecture. If you do need such guarantees, we'd need the driver to probe for the feature (e.g. arch_has_...()) and invoke a new macro. You can't have the __iowrite512_copy() that worked fine suddenly giving an error just because some driver wants a guaranteed atomic 64 byte write.
On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote: > > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and > > implement some quad STP optimization for this case? > > We can have the generic __iowrite512_copy() do memcpy_toio() and have > the arm64 implement an optimised version. > > What I'm not entirely sure of is the DGH (whatever the io_* barrier name > is). I'd put it in the same __iowrite512_copy() function and remove it > from the driver code. Otherwise when ST64B is added, we have an > unnecessary DGH in the driver. If this does not match the other > __iowrite*_copy() semantics, we can come up with another name. But start > with this for now and document the function. I think the iowrite is only used for WC and the DGH is functionally harmless for non-WC, so it makes sense. In this case we should just remove the DGH macro from the generic architecture code and tell people to use iowrite - since we now understand that callers basically have to in order to use DGH on new ARM CPUs. > > 3) A future ST64B and the x86 version would be put under > > __iowrite512_copy()? > > Yes, arch-specific override. > > > 4) A future ST64B would come with some kind of 'must do 64b copy or > > oops' to support the future HW that must have this instruction? eg > > we already see on Intel that HW must use ENQCMD and nothing else. > > I don't agree with the oops part. We can't guarantee it on arm64, ST64B > I think is optional in the architecture. If you do need such guarantees, > we'd need the driver to probe for the feature (e.g. arch_has_...()) and > invoke a new macro. Yes, exactly. The driver must check. The new macro should oops if it is invoked wrong, the same way enqcmd will oops if invoked wrong on Intel. Jason
On Tue, Dec 05, 2023 at 03:51:30PM -0400, Jason Gunthorpe wrote: > On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote: > > > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and > > > implement some quad STP optimization for this case? > > > > We can have the generic __iowrite512_copy() do memcpy_toio() and have > > the arm64 implement an optimised version. > > > > What I'm not entirely sure of is the DGH (whatever the io_* barrier name > > is). I'd put it in the same __iowrite512_copy() function and remove it > > from the driver code. Otherwise when ST64B is added, we have an > > unnecessary DGH in the driver. If this does not match the other > > __iowrite*_copy() semantics, we can come up with another name. But start > > with this for now and document the function. > > I think the iowrite is only used for WC and the DGH is functionally > harmless for non-WC, so it makes sense. > > In this case we should just remove the DGH macro from the generic > architecture code and tell people to use iowrite - since we now > understand that callers basically have to in order to use DGH on new > ARM CPUs. That works for me but what would the semantics be for __iowrite64_copy() for example? Is there a DGH at the end of the whole write or after each iteration? I'd go with the former since e.g. hns3_tx_push_bd() does that (and doesn't seem to be a 64 byte copy). Similarly for __iowrite512_copy(), if you want the DGH after each iteration you should only pass a count of 1.
On Wed, Dec 06, 2023 at 11:09:18AM +0000, Catalin Marinas wrote: > On Tue, Dec 05, 2023 at 03:51:30PM -0400, Jason Gunthorpe wrote: > > On Tue, Dec 05, 2023 at 07:34:45PM +0000, Catalin Marinas wrote: > > > > 2) You want to #define __iowrite512_copy() to memcpy_toio() on ARM and > > > > implement some quad STP optimization for this case? > > > > > > We can have the generic __iowrite512_copy() do memcpy_toio() and have > > > the arm64 implement an optimised version. > > > > > > What I'm not entirely sure of is the DGH (whatever the io_* barrier name > > > is). I'd put it in the same __iowrite512_copy() function and remove it > > > from the driver code. Otherwise when ST64B is added, we have an > > > unnecessary DGH in the driver. If this does not match the other > > > __iowrite*_copy() semantics, we can come up with another name. But start > > > with this for now and document the function. > > > > I think the iowrite is only used for WC and the DGH is functionally > > harmless for non-WC, so it makes sense. > > > > In this case we should just remove the DGH macro from the generic > > architecture code and tell people to use iowrite - since we now > > understand that callers basically have to in order to use DGH on new > > ARM CPUs. > > That works for me but what would the semantics be for __iowrite64_copy() > for example? Is there a DGH at the end of the whole write or after each > iteration? End of the iowrite_copy function call. The purpose of DGH is to reduce latency through write combining buffers by providing a hint to the HW to close them. __iowrite64_copy can be reasonably thought of as trying to push the argument into a single TLP. > I'd go with the former since e.g. hns3_tx_push_bd() does > that (and doesn't seem to be a 64 byte copy). sizeof(struct hns3_desc) == 32, HNS3_MAX_PUSH_BD_NUM == 2, so it is 64 bytes. Indeed, I already know this HW and it functions similar to mlx5. In userspace it uses the ST4 instruction, in fact HNS was the team that did that change citing measured improvements on their SOC. Changing this to be the STP block will likely be an improvement. Jason
On Tue, Nov 28, 2023 at 05:28:36PM +0100, Niklas Schnelle wrote: > On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote: > > On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote: > > > > > Also it turns out the writeq() loop we had so far does not produce the > > > needed 64 byte TLP on s390 either so this actually makes us newly pass > > > this test. > > > > Ooh, that is a significant problem - the userspace code won't be used > > unless this test passes. So we need this on S390 to fix a bug as well > > :\ > > Yes ;-( > > In the meantime I also found out that zpci_write_block(dst, src, 64) is > not correct for all cases because the PCI store block requires the > (pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to > be double word aligned. In rdma-core this is neatly handled by the > get_max_write_size() but the kernel variant of that > zpci_get_max_write_size() isn't just a lot harder to read and likely > less efficient but also too strict thus breaking the 64 byte write up > needlessly. > > In total we have 5 conditions for the PCI block stores: > > 1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space > 2. The length must not exceed the maximum write size > 3. The length must be a multiple of 8 > 4. The src needs to be double word aligned > 5. The dst needs to be double word aligned > > So I think a good solution would be to improve zpci_memcpy_toio() with > an enhanced zpci_get_max_write_size() based on the code in rdma-core > extended to also handle the alignment and length restrictions which in > rdma-core are already assumed (see comment there). Then we can use > zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the > compiler to optimize out the unnecessary checks (2, 3 and possibly 4, > 5). > > So yeah this is getting a bit more complicated than originally > thought. Let me cook up a patch. Did you come up with something? Jason
Hey Catalin, I'm just revising this and I'm wondering if you know why ARM64 has this: #define __raw_writeq __raw_writeq static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) { asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); } Instead of #define __raw_writeq __raw_writeq static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) { asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); } ?? Like x86 has. The codegen for a 64 byte unrolled copy loop is way better with "m" on gcc: "r" constraint (gcc 13.2.0): .L3: ldr x3, [x1] str x3, [x0] ldr x3, [x1, 8] add x4, x0, 8 str x3, [x4] ldr x3, [x1, 16] add x4, x0, 16 str x3, [x4] ldr x3, [x1, 24] add x4, x0, 24 str x3, [x4] ldr x3, [x1, 32] add x4, x0, 32 str x3, [x4] ldr x3, [x1, 40] add x4, x0, 40 str x3, [x4] ldr x3, [x1, 48] add x4, x0, 48 str x3, [x4] ldr x3, [x1, 56] add x4, x0, 56 str x3, [x4] add x1, x1, 64 add x0, x0, 64 cmp x2, x1 bhi .L3 "m" constraint: .L3: ldp x10, x9, [x1] ldp x8, x7, [x1, 16] ldp x6, x5, [x1, 32] ldp x4, x3, [x1, 48] str x10, [x0] str x9, [x0, 8] str x8, [x0, 16] str x7, [x0, 24] str x6, [x0, 32] str x5, [x0, 40] str x4, [x0, 48] str x3, [x0, 56] add x1, x1, 64 add x0, x0, 64 cmp x2, x1 bhi .L3 clang 17 doesn't do any better either way, it doesn't seem to do anything with 'm', but I guess it could.. clang 17 (either): .LBB0_2: // =>This Inner Loop Header: Depth=1 ldp x9, x10, [x1] add x14, x0, #8 add x18, x0, #40 ldp x11, x12, [x1, #16] add x2, x0, #48 add x3, x0, #56 ldp x13, x15, [x1, #32] ldp x16, x17, [x1, #48] str x9, [x0] str x10, [x14] add x9, x0, #16 add x10, x0, #24 add x14, x0, #32 str x11, [x9] str x12, [x10] str x13, [x14] str x15, [x18] str x16, [x2] str x17, [x3] add x1, x1, #64 add x0, x0, #64 cmp x1, x8 b.lo .LBB0_2 It doesn't matter for this series, but it seems like something ARM64 might want to look at to improve.. Jason
On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > Hey Catalin, > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > #define __raw_writeq __raw_writeq > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > { > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > } > > Instead of > > #define __raw_writeq __raw_writeq > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > { > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > } > > ?? Like x86 has. I believe this is for the same reason as doing so in all of our other IO accessors. We've deliberately ensured that our IO accessors use a single base register with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT when reporting a stage-2 abort, which a hypervisor may use for emulating IO. Mark. > > The codegen for a 64 byte unrolled copy loop is way better with "m" on gcc: > > "r" constraint (gcc 13.2.0): > > .L3: > ldr x3, [x1] > str x3, [x0] > ldr x3, [x1, 8] > add x4, x0, 8 > str x3, [x4] > ldr x3, [x1, 16] > add x4, x0, 16 > str x3, [x4] > ldr x3, [x1, 24] > add x4, x0, 24 > str x3, [x4] > ldr x3, [x1, 32] > add x4, x0, 32 > str x3, [x4] > ldr x3, [x1, 40] > add x4, x0, 40 > str x3, [x4] > ldr x3, [x1, 48] > add x4, x0, 48 > str x3, [x4] > ldr x3, [x1, 56] > add x4, x0, 56 > str x3, [x4] > add x1, x1, 64 > add x0, x0, 64 > cmp x2, x1 > bhi .L3 > > "m" constraint: > > .L3: > ldp x10, x9, [x1] > ldp x8, x7, [x1, 16] > ldp x6, x5, [x1, 32] > ldp x4, x3, [x1, 48] > str x10, [x0] > str x9, [x0, 8] > str x8, [x0, 16] > str x7, [x0, 24] > str x6, [x0, 32] > str x5, [x0, 40] > str x4, [x0, 48] > str x3, [x0, 56] > add x1, x1, 64 > add x0, x0, 64 > cmp x2, x1 > bhi .L3 > > clang 17 doesn't do any better either way, it doesn't seem to do > anything with 'm', but I guess it could.. > > clang 17 (either): > > .LBB0_2: // =>This Inner Loop Header: Depth=1 > ldp x9, x10, [x1] > add x14, x0, #8 > add x18, x0, #40 > ldp x11, x12, [x1, #16] > add x2, x0, #48 > add x3, x0, #56 > ldp x13, x15, [x1, #32] > ldp x16, x17, [x1, #48] > str x9, [x0] > str x10, [x14] > add x9, x0, #16 > add x10, x0, #24 > add x14, x0, #32 > str x11, [x9] > str x12, [x10] > str x13, [x14] > str x15, [x18] > str x16, [x2] > str x17, [x3] > add x1, x1, #64 > add x0, x0, #64 > cmp x1, x8 > b.lo .LBB0_2 > > It doesn't matter for this series, but it seems like something ARM64 > might want to look at to improve.. > > Jason >
On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > Hey Catalin, > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > #define __raw_writeq __raw_writeq > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > { > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > } > > > > Instead of > > > > #define __raw_writeq __raw_writeq > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > { > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > } > > > > ?? Like x86 has. > > I believe this is for the same reason as doing so in all of our other IO > accessors. > > We've deliberately ensured that our IO accessors use a single base register > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > when reporting a stage-2 abort, which a hypervisor may use for > emulating IO. Wow, harming bare metal performace to accommodate imperfect emulation sounds like a horrible reason :( So what happens with this patch where IO is done with STP? Are you going to tell me I can't do it because of this? Jason
On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > Hey Catalin, > > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > #define __raw_writeq __raw_writeq > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > { > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > } > > > > > > Instead of > > > > > > #define __raw_writeq __raw_writeq > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > { > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > } > > > > > > ?? Like x86 has. > > > > I believe this is for the same reason as doing so in all of our other IO > > accessors. > > > > We've deliberately ensured that our IO accessors use a single base register > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > when reporting a stage-2 abort, which a hypervisor may use for > > emulating IO. > > Wow, harming bare metal performace to accommodate imperfect emulation > sounds like a horrible reason :( > > So what happens with this patch where IO is done with STP? Are you > going to tell me I can't do it because of this? I should also point out that userspace IO doesn't follow such a restriction since nobody ever knew, and things like RDMA stack and DPDK use ST4 and probably other non-trivial instructions for IO from userspace. I'm fearful you saying today's hypervisors don't work right on ARM if IO is trapped, which does happen for some legimiate reasons and a few illegitimate ones?? Jason
On Tue, 2024-01-16 at 13:33 -0400, Jason Gunthorpe wrote: > On Tue, Nov 28, 2023 at 05:28:36PM +0100, Niklas Schnelle wrote: > > On Mon, 2023-11-27 at 13:51 -0400, Jason Gunthorpe wrote: > > > On Mon, Nov 27, 2023 at 06:43:11PM +0100, Niklas Schnelle wrote: > > > > > > > Also it turns out the writeq() loop we had so far does not produce the > > > > needed 64 byte TLP on s390 either so this actually makes us newly pass > > > > this test. > > > > > > Ooh, that is a significant problem - the userspace code won't be used > > > unless this test passes. So we need this on S390 to fix a bug as well > > > :\ > > > > Yes ;-( > > > > In the meantime I also found out that zpci_write_block(dst, src, 64) is > > not correct for all cases because the PCI store block requires the > > (pseudo-)MMIO write not to cross a 4K boundary and we need src/dst to > > be double word aligned. In rdma-core this is neatly handled by the > > get_max_write_size() but the kernel variant of that > > zpci_get_max_write_size() isn't just a lot harder to read and likely > > less efficient but also too strict thus breaking the 64 byte write up > > needlessly. > > > > In total we have 5 conditions for the PCI block stores: > > > > 1. The dst+len must not cross a 4K boundary in the (pseudo-)MMIO space > > 2. The length must not exceed the maximum write size > > 3. The length must be a multiple of 8 > > 4. The src needs to be double word aligned > > 5. The dst needs to be double word aligned > > > > So I think a good solution would be to improve zpci_memcpy_toio() with > > an enhanced zpci_get_max_write_size() based on the code in rdma-core > > extended to also handle the alignment and length restrictions which in > > rdma-core are already assumed (see comment there). Then we can use > > zpci_memcpy_toio(dst, src, 64) for memcpy_toio_64() and rely on the > > compiler to optimize out the unnecessary checks (2, 3 and possibly 4, > > 5). > > > > So yeah this is getting a bit more complicated than originally > > thought. Let me cook up a patch. > > Did you come up with something? > > Jason Hi Jason, Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio() titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that I tested with this series plus the following define added to arch/s390/include/asm/io.h: #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64) It's already in the s390 tree's feature branch and linux-next. Thanks, Niklas
On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote: > Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio() > titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that > I tested with this series plus the following define added to > arch/s390/include/asm/io.h: > > #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64) > > It's already in the s390 tree's feature branch and linux-next. Great! Jason
On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > Hey Catalin, > > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > #define __raw_writeq __raw_writeq > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > { > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > } > > > > > > Instead of > > > > > > #define __raw_writeq __raw_writeq > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > { > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > } > > > > > > ?? Like x86 has. > > > > I believe this is for the same reason as doing so in all of our other IO > > accessors. > > > > We've deliberately ensured that our IO accessors use a single base register > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > when reporting a stage-2 abort, which a hypervisor may use for > > emulating IO. > > Wow, harming bare metal performace to accommodate imperfect emulation > sounds like a horrible reason :( Having working functionality everywhere is a very good reason. :) > So what happens with this patch where IO is done with STP? Are you > going to tell me I can't do it because of this? I'm not personally going to make that judgement, but it's certainly something for Catalin and Will to consider (and I've added Marc in case he has any opinion). Mark.
On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > Hey Catalin, > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > #define __raw_writeq __raw_writeq > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > { > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > } > > > > Instead of > > > > #define __raw_writeq __raw_writeq > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > { > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > } > > > > ?? Like x86 has. > > I believe this is for the same reason as doing so in all of our other IO > accessors. > > We've deliberately ensured that our IO accessors use a single base register > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > when reporting a stage-2 abort, which a hypervisor may use for emulating IO. FWIW, IIUC the immediate-offset forms *without* writeback can still be reported usefully in ESR_ELx, so I believe that we could use the "o" constraint for the __raw_write*() functions, e.g. static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) { asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr)); } However, the __raw_read*() functions would still need to use "r" due to ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE. Mark.
On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote: > > I believe this is for the same reason as doing so in all of our other IO > > accessors. > > > > We've deliberately ensured that our IO accessors use a single base register > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO. > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the > __raw_write*() functions, e.g. > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > { > asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr)); > } "o" works well in the same simple memcpy loop: add x2, x1, w2, uxtw 3 cmp x1, x2 bcs .L1 .L3: ldp x10, x9, [x1] ldp x8, x7, [x1, 16] ldp x6, x5, [x1, 32] ldp x4, x3, [x1, 48] str x10, [x0] str x9, [x0, 8] str x8, [x0, 16] str x7, [x0, 24] str x6, [x0, 32] str x5, [x0, 40] str x4, [x0, 48] str x3, [x0, 56] add x1, x1, 64 add x0, x0, 64 cmp x2, x1 bhi .L3 .L1: ret Seems intersting to pursue? Jason
On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote: > > > > I believe this is for the same reason as doing so in all of our other IO > > > accessors. > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO. > > > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported > > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the > > __raw_write*() functions, e.g. > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > { > > asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr)); > > } > > "o" works well in the same simple memcpy loop: > > add x2, x1, w2, uxtw 3 > cmp x1, x2 > bcs .L1 > .L3: > ldp x10, x9, [x1] > ldp x8, x7, [x1, 16] > ldp x6, x5, [x1, 32] > ldp x4, x3, [x1, 48] > str x10, [x0] > str x9, [x0, 8] > str x8, [x0, 16] > str x7, [x0, 24] > str x6, [x0, 32] > str x5, [x0, 40] > str x4, [x0, 48] > str x3, [x0, 56] > add x1, x1, 64 > add x0, x0, 64 > cmp x2, x1 > bhi .L3 > .L1: > ret > > Seems intersting to pursue? I've seen the compiler struggle with plain "o" in the past ("Impossible constraint in asm") so we might want "Qo" if we go down this route. Will
On Wed, Jan 17, 2024 at 09:26:13AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote: > > Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio() > > titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that > > I tested with this series plus the following define added to > > arch/s390/include/asm/io.h: > > > > #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64) > > > > It's already in the s390 tree's feature branch and linux-next. > > Great! Is this wrong too? /* combine single writes by using store-block insn */ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) { zpci_memcpy_toio(to, from, count); } * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units * @to: destination, in MMIO space (must be 64-bit aligned) * @from: source (must be 64-bit aligned) * @count: number of 64-bit quantities to copy ^^^^^^^^^^^^^^^^^^^^^^^^^ Ie it should be zpci_memcpy_toio(to, from, count * 8); Right? (I'll fix it) Jason
On Wed, 2024-01-17 at 13:55 -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 09:26:13AM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 17, 2024 at 02:20:00PM +0100, Niklas Schnelle wrote: > > > Sorry I haven't replied. Yes, I have a fix for zpci_memcpy_toio() > > > titled "s390/pci: fix max size calculation in zpci_memcpy_toio()" that > > > I tested with this series plus the following define added to > > > arch/s390/include/asm/io.h: > > > > > > #define memcpy_toio_64 zpci_memcpy_toio(dst, src, 64) > > > > > > It's already in the s390 tree's feature branch and linux-next. > > > > Great! > > Is this wrong too? > > /* combine single writes by using store-block insn */ > void __iowrite64_copy(void __iomem *to, const void *from, size_t count) > { > zpci_memcpy_toio(to, from, count); > } > > * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units > * @to: destination, in MMIO space (must be 64-bit aligned) > * @from: source (must be 64-bit aligned) > * @count: number of 64-bit quantities to copy > ^^^^^^^^^^^^^^^^^^^^^^^^^ > > Ie it should be > > zpci_memcpy_toio(to, from, count * 8); > > Right? > > (I'll fix it) > > Jason Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is in bytes like for memcpy_toio(). Thanks, Niklas
On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > in bytes like for memcpy_toio(). https://github.com/jgunthorpe/linux/commits/mlx5_wc/ Jason
On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > > in bytes like for memcpy_toio(). > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > > Jason > Thanks, the s390 patches: s390: Implement __iowrite32_copy() s390: Use the correct count for __iowrite64_copy() s390: Stop using weak symbols for __iowrite64_copy() Look good to me. I.e. you may add my. Acked-by Niklas Schnelle <schnelle@linux.ibm.com> I did test your patches too and by accident confirmed again that these do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 driver to detect Write-Combining as supported. Note, as far as I know Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite a few changes for v6.8-rc1. Thanks, Niklas
On Wed, Jan 17, 2024 at 04:05:29PM +0000, Will Deacon wrote: > On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote: > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > accessors. > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO. > > > > > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported > > > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the > > > __raw_write*() functions, e.g. > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > { > > > asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr)); > > > } > > > > "o" works well in the same simple memcpy loop: > > > > add x2, x1, w2, uxtw 3 > > cmp x1, x2 > > bcs .L1 > > .L3: > > ldp x10, x9, [x1] > > ldp x8, x7, [x1, 16] > > ldp x6, x5, [x1, 32] > > ldp x4, x3, [x1, 48] > > str x10, [x0] > > str x9, [x0, 8] > > str x8, [x0, 16] > > str x7, [x0, 24] > > str x6, [x0, 32] > > str x5, [x0, 40] > > str x4, [x0, 48] > > str x3, [x0, 56] > > add x1, x1, 64 > > add x0, x0, 64 > > cmp x2, x1 > > bhi .L3 > > .L1: > > ret > > > > Seems intersting to pursue? > > I've seen the compiler struggle with plain "o" in the past ("Impossible > constraint in asm") so we might want "Qo" if we go down this route. I'll stick a patch in 0-day and lets see if there are explosions. "Qo" generates the same assembly. So to summarize: - We don't like "m" because something about virtualization traps breaks with post/pre indexed forms like: str x1, [x0, 8]! And "m" will allow the compiler to emit that. - o selects only base register plus offset so it is OK - Q allows base register only (no offset) on some compilers that won't allow o for 0 offset - read side stays at 'r' due to an alternates errata workaround requiring ldar which doesn't accept the same effective address as ldr. Jason
On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote: > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > > > in bytes like for memcpy_toio(). > > > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > > > > Jason > > > > Thanks, the s390 patches: > > > s390: Implement __iowrite32_copy() > s390: Use the correct count for __iowrite64_copy() > s390: Stop using weak symbols for __iowrite64_copy() > > Look good to me. I.e. you may add my. > > Acked-by Niklas Schnelle <schnelle@linux.ibm.com> Great, thanks. I'll post this once rc1 comes out > I did test your patches too and by accident confirmed again that these > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 > driver to detect Write-Combining as supported. Note, as far as I know > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite > a few changes for v6.8-rc1. OK, but we can still run these two things in parallel? Jason
On Thu, 2024-01-18 at 12:21 -0400, Jason Gunthorpe wrote: > On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote: > > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > > > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > > > > in bytes like for memcpy_toio(). > > > > > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > > > > > > Jason > > > > > > > Thanks, the s390 patches: > > > > > > s390: Implement __iowrite32_copy() > > s390: Use the correct count for __iowrite64_copy() > > s390: Stop using weak symbols for __iowrite64_copy() > > > > Look good to me. I.e. you may add my. > > > > Acked-by Niklas Schnelle <schnelle@linux.ibm.com> > > Great, thanks. I'll post this once rc1 comes out > > > I did test your patches too and by accident confirmed again that these > > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in > > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 > > driver to detect Write-Combining as supported. Note, as far as I know > > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite > > a few changes for v6.8-rc1. > > OK, but we can still run these two things in parallel? > > Jason Sure, it's not worse without my patch than what we had before and clearly __iowrite64_copy() has been completely broken for ages without anyone noticing and is fixed by your patches even without my fix for the too strict issue in that at least it then copies what it is supposed to copy even if it does so with 8*8 byte stores. Thanks, Niklas
On Thu, 2024-01-18 at 17:25 +0100, Niklas Schnelle wrote: > On Thu, 2024-01-18 at 12:21 -0400, Jason Gunthorpe wrote: > > On Thu, Jan 18, 2024 at 04:59:47PM +0100, Niklas Schnelle wrote: > > > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > > > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > ---8<--- > > > > > > > I did test your patches too and by accident confirmed again that these > > > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in > > > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 > > > driver to detect Write-Combining as supported. Note, as far as I know > > > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite > > > a few changes for v6.8-rc1. > > > > OK, but we can still run these two things in parallel? > > > > Jason > > Sure, it's not worse without my patch than what we had before and > clearly __iowrite64_copy() has been completely broken for ages without > anyone noticing and is fixed by your patches even without my fix for > the too strict issue in that at least it then copies what it is > supposed to copy even if it does so with 8*8 byte stores. > > Thanks, > Niklas > FYI Alexander ended up doing a second v6.8-rc1 pull request and my change has now landed in Linus' tree as commit 80df7d6af7f6 ("s390/pci: fix max size calculation in zpci_memcpy_toio()"). Thanks, Niklas
(fixed Marc's email address) On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote: > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > > > #define __raw_writeq __raw_writeq > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > { > > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > > } > > > > > > > > Instead of > > > > > > > > #define __raw_writeq __raw_writeq > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > { > > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > > } > > > > > > > > ?? Like x86 has. > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > accessors. > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > when reporting a stage-2 abort, which a hypervisor may use for > > > emulating IO. > > > > Wow, harming bare metal performace to accommodate imperfect emulation > > sounds like a horrible reason :( > > Having working functionality everywhere is a very good reason. :) > > > So what happens with this patch where IO is done with STP? Are you > > going to tell me I can't do it because of this? > > I'm not personally going to make that judgement, but it's certainly something > for Catalin and Will to consider (and I've added Marc in case he has any > opinion). Good point, I missed this part. We definitely can't use STP in the I/O accessors, we'd have a big surprise when running the same code in a guest with emulated I/O. If eight STRs without other operations interleaved give us the write-combining on most CPUs (with Normal NC), we should go with this instead of STP.
On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote: > (fixed Marc's email address) > > On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote: > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > { > > > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > > > } > > > > > > > > > > Instead of > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > { > > > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > > > } > > > > > > > > > > ?? Like x86 has. > > > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > accessors. > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > when reporting a stage-2 abort, which a hypervisor may use for > > > > emulating IO. > > > > > > Wow, harming bare metal performace to accommodate imperfect emulation > > > sounds like a horrible reason :( > > > > Having working functionality everywhere is a very good reason. :) > > > > > So what happens with this patch where IO is done with STP? Are you > > > going to tell me I can't do it because of this? > > > > I'm not personally going to make that judgement, but it's certainly something > > for Catalin and Will to consider (and I've added Marc in case he has any > > opinion). > > Good point, I missed this part. We definitely can't use STP in the I/O > accessors, we'd have a big surprise when running the same code in a > guest with emulated I/O. Unfortunately there is no hard distinction in KVM/qemu for "emulated IO" and "VFIO MMIO". Even devices using VFIO can get funneled down the emulated path for legitimate reasons. Again, userspace is already widely deployed using complex IO accessors. ST4 has been out there for years and at this moment this patch with STP is already being deployed in production environments. Even if you refuse to take STP to mainline it *will* be running in VMs under ARM hypervisors. What exactly do you think should be done about that? I thought the guiding mantra here was that any time KVM does not perfectly emulate bare metal it is a bug. "We can't assume all VMs are Linux!". Indeed we recently had some long and *very* theoretical discussions about possible incompatibilties due to kvm changes in the memory attributes thread. But here it seems to be just shrugging off something so catastrophic as performance IO accessors *that are widely deployed already* don't work reliably in VMs!?!? "Oh well, don't use them"!? Damn I hope it crashes the VM and doesn't corrupt the MMIO. I just debugged a x86 KVM issue with it corrupting VFIO MMIO and that was a total nightmare to find. > If eight STRs without other operations interleaved give us the > write-combining on most CPUs (with Normal NC), we should go with this > instead of STP. __iowrite64_copy() is a performance IO accessor, we should not degrade it because buggy hypervisors might exist that have a problem with STP or other instructions. :( :( Anyhow, I know nothing about whatever this issue is - Mark said: > FWIW, IIUC the immediate-offset forms *without* writeback can still > be reported usefully in ESR_ELx, Which excludes the post/pre increment forms - but does STP and ST4 also have some kind of problem because the emulation path can't know about wider than a 64 bit access? What is the plan for ST64B? Don't get to use that either? Jason
On Wed, 24 Jan 2024 01:27:23 +0000, Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote: > > (fixed Marc's email address) > > > > On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote: > > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > > { > > > > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > > > > } > > > > > > > > > > > > Instead of > > > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > > { > > > > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > > > > } > > > > > > > > > > > > ?? Like x86 has. > > > > > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > > accessors. > > > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > > when reporting a stage-2 abort, which a hypervisor may use for > > > > > emulating IO. > > > > > > > > Wow, harming bare metal performace to accommodate imperfect emulation > > > > sounds like a horrible reason :( > > > > > > Having working functionality everywhere is a very good reason. :) > > > > > > > So what happens with this patch where IO is done with STP? Are you > > > > going to tell me I can't do it because of this? > > > > > > I'm not personally going to make that judgement, but it's certainly something > > > for Catalin and Will to consider (and I've added Marc in case he has any > > > opinion). > > > > Good point, I missed this part. We definitely can't use STP in the I/O > > accessors, we'd have a big surprise when running the same code in a > > guest with emulated I/O. > > Unfortunately there is no hard distinction in KVM/qemu for "emulated > IO" and "VFIO MMIO". Even devices using VFIO can get funneled down the > emulated path for legitimate reasons. > > Again, userspace is already widely deployed using complex IO > accessors. ST4 has been out there for years and at this moment this > patch with STP is already being deployed in production environments. Then you will get to keep the pieces. Good luck. > Even if you refuse to take STP to mainline it *will* be running in VMs > under ARM hypervisors. A hypervisor can't do anything with it. If you cared to read the architecture, you'd know by now. So your VM will be either dead, or dog slow, depending on your hypervisor. In any case, I'm sure it will reflect positively on your favourite software. > What exactly do you think should be done about that? Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see everything slow down. Your call. > I thought the guiding mantra here was that any time KVM does not > perfectly emulate bare metal it is a bug. "We can't assume all VMs are > Linux!". Indeed we recently had some long and *very* theoretical > discussions about possible incompatibilties due to kvm changes in the > memory attributes thread. > > But here it seems to be just shrugging off something so catastrophic > as performance IO accessors *that are widely deployed already* don't > work reliably in VMs!?!? > > "Oh well, don't use them"!? Exactly. You can also take this to the ARM architects and get them to update the architecture to mandate full syndrome information for all load/store instructions, and you'll get something useful in 2034. Maybe. Or you can stop whining and try to get better performance out of what we have today. > Damn I hope it crashes the VM and doesn't corrupt the MMIO. I just > debugged a x86 KVM issue with it corrupting VFIO MMIO and that was a > total nightmare to find. > > > If eight STRs without other operations interleaved give us the > > write-combining on most CPUs (with Normal NC), we should go with this > > instead of STP. > > __iowrite64_copy() is a performance IO accessor, we should not degrade > it because buggy hypervisors might exist that have a problem with STP > or other instructions. :( :( > > Anyhow, I know nothing about whatever this issue is - Mark said: > > > FWIW, IIUC the immediate-offset forms *without* writeback can still > > be reported usefully in ESR_ELx, > > Which excludes the post/pre increment forms - but does STP and ST4 > also have some kind of problem because the emulation path can't know > about wider than a 64 bit access? > > What is the plan for ST64B? Don't get to use that either? ST64 has full syndrome information, making it possible to emulate. In any case, there is no magic there. Everything is documented, and has been for the past... 15 years? M.
On Thu, Jan 18, 2024 at 12:18:43PM -0400, Jason Gunthorpe wrote: > On Wed, Jan 17, 2024 at 04:05:29PM +0000, Will Deacon wrote: > > On Wed, Jan 17, 2024 at 11:28:22AM -0400, Jason Gunthorpe wrote: > > > On Wed, Jan 17, 2024 at 02:07:16PM +0000, Mark Rutland wrote: > > > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > > accessors. > > > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > > when reporting a stage-2 abort, which a hypervisor may use for emulating IO. > > > > > > > > FWIW, IIUC the immediate-offset forms *without* writeback can still be reported > > > > usefully in ESR_ELx, so I believe that we could use the "o" constraint for the > > > > __raw_write*() functions, e.g. > > > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > { > > > > asm volatile("str %x0, %1" : : "rZ" (val), "o" (*(volatile u64 *)addr)); > > > > } > > > > > > "o" works well in the same simple memcpy loop: > > > > > > add x2, x1, w2, uxtw 3 > > > cmp x1, x2 > > > bcs .L1 > > > .L3: > > > ldp x10, x9, [x1] > > > ldp x8, x7, [x1, 16] > > > ldp x6, x5, [x1, 32] > > > ldp x4, x3, [x1, 48] > > > str x10, [x0] > > > str x9, [x0, 8] > > > str x8, [x0, 16] > > > str x7, [x0, 24] > > > str x6, [x0, 32] > > > str x5, [x0, 40] > > > str x4, [x0, 48] > > > str x3, [x0, 56] > > > add x1, x1, 64 > > > add x0, x0, 64 > > > cmp x2, x1 > > > bhi .L3 > > > .L1: > > > ret > > > > > > Seems intersting to pursue? > > > > I've seen the compiler struggle with plain "o" in the past ("Impossible > > constraint in asm") so we might want "Qo" if we go down this route. > > I'll stick a patch in 0-day and lets see if there are explosions. "Qo" > generates the same assembly. > > So to summarize: > - We don't like "m" because something about virtualization > traps breaks with post/pre indexed forms like: > str x1, [x0, 8]! > And "m" will allow the compiler to emit that. > - o selects only base register plus offset so it is OK > - Q allows base register only (no offset) on some compilers that > won't allow o for 0 offset > - read side stays at 'r' due to an alternates errata workaround > requiring ldar which doesn't accept the same effective address > as ldr. FWIW I've sent a patch out with a commit message describing all of the above (with you, Catalin, Marc, and Will Cc'd). It hasn't appeared on lore yet, but it should eventually show up at: https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/ Mark.
On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote: > (fixed Marc's email address) > > On Wed, Jan 17, 2024 at 01:29:06PM +0000, Mark Rutland wrote: > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > { > > > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > > > } > > > > > > > > > > Instead of > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > { > > > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > > > } > > > > > > > > > > ?? Like x86 has. > > > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > accessors. > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > when reporting a stage-2 abort, which a hypervisor may use for > > > > emulating IO. > > > > > > Wow, harming bare metal performace to accommodate imperfect emulation > > > sounds like a horrible reason :( > > > > Having working functionality everywhere is a very good reason. :) > > > > > So what happens with this patch where IO is done with STP? Are you > > > going to tell me I can't do it because of this? > > > > I'm not personally going to make that judgement, but it's certainly something > > for Catalin and Will to consider (and I've added Marc in case he has any > > opinion). > > Good point, I missed this part. We definitely can't use STP in the I/O > accessors, we'd have a big surprise when running the same code in a > guest with emulated I/O. Just to be clear, that means we should drop this patch ("arm64/io: add memcpy_toio_64") for now, right? It would be helpful if we could explciitly say so in direct reply to that: https://lore.kernel.org/linux-arm-kernel/c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org/ ... to avoid any confusion there. > If eight STRs without other operations interleaved give us the > write-combining on most CPUs (with Normal NC), we should go with this > instead of STP. Agreed; I've sent out a patch to allow the offset addressing at: https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/ ... and it should be possible to build atop that to use eight STRs. Mark.
On Wed, Jan 24, 2024 at 11:38:07AM +0000, Mark Rutland wrote: > On Tue, Jan 23, 2024 at 08:38:55PM +0000, Catalin Marinas wrote: > > > On Wed, Jan 17, 2024 at 08:36:18AM -0400, Jason Gunthorpe wrote: > > > > On Wed, Jan 17, 2024 at 12:30:00PM +0000, Mark Rutland wrote: > > > > > On Tue, Jan 16, 2024 at 02:51:21PM -0400, Jason Gunthorpe wrote: > > > > > > I'm just revising this and I'm wondering if you know why ARM64 has this: > > > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > > { > > > > > > asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); > > > > > > } > > > > > > > > > > > > Instead of > > > > > > > > > > > > #define __raw_writeq __raw_writeq > > > > > > static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) > > > > > > { > > > > > > asm volatile("str %x0, %1" : : "rZ" (val), "m" (*(volatile u64 *)addr)); > > > > > > } > > > > > > > > > > > > ?? Like x86 has. > > > > > > > > > > I believe this is for the same reason as doing so in all of our other IO > > > > > accessors. > > > > > > > > > > We've deliberately ensured that our IO accessors use a single base register > > > > > with no offset as this is the only form that HW can represent in ESR_ELx.ISS.SRT > > > > > when reporting a stage-2 abort, which a hypervisor may use for > > > > > emulating IO. > > > > > > > > Wow, harming bare metal performace to accommodate imperfect emulation > > > > sounds like a horrible reason :( > > > > > > Having working functionality everywhere is a very good reason. :) > > > > > > > So what happens with this patch where IO is done with STP? Are you > > > > going to tell me I can't do it because of this? > > > > > > I'm not personally going to make that judgement, but it's certainly something > > > for Catalin and Will to consider (and I've added Marc in case he has any > > > opinion). > > > > Good point, I missed this part. We definitely can't use STP in the I/O > > accessors, we'd have a big surprise when running the same code in a > > guest with emulated I/O. > > Just to be clear, that means we should drop this patch ("arm64/io: add > memcpy_toio_64") for now, right? In its current form yes, but that doesn't mean that memcpy_toio_64() cannot be reworked differently. > It would be helpful if we could explciitly say so in direct reply to that: > > https://lore.kernel.org/linux-arm-kernel/c3ae87aea7660c3d266905c19d10d8de0f9fb779.1700766072.git.leon@kernel.org/ > > ... to avoid any confusion there. I will. > > If eight STRs without other operations interleaved give us the > > write-combining on most CPUs (with Normal NC), we should go with this > > instead of STP. > > Agreed; I've sent out a patch to allow the offset addressing at: > > https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/ > > ... and it should be possible to build atop that to use eight STRs. That's great, thanks.
On Wed, Jan 24, 2024 at 08:26:28AM +0000, Marc Zyngier wrote: > > Even if you refuse to take STP to mainline it *will* be running in VMs > > under ARM hypervisors. > > A hypervisor can't do anything with it. If you cared to read the > architecture, you'd know by now. So your VM will be either dead, or > dog slow, depending on your hypervisor. In any case, I'm sure it will > reflect positively on your favourite software. "Dog slow" is fine. Forcing IO emulation on paths that shouldn't have it is a VMM problem. KVM & qemu have some issues where this can happen infrequently for VFIO MMIO maps. It is just important that it be functionally correct if you get unlucky. The performance path is to not take a fault in the first place. > > What exactly do you think should be done about that? > > Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see > everything slow down. Your call. The issue Mark raised here was that things like STP/etc cannot work in VMs, not that they are slow. The places we are talking about using the STP pattern are all high performance HW drivers, that do not have any existing SW emulation to worry about. ie the VMM will be using VFIO to back the MMIO the acessors target. So, I'm fine if the answer is that VMM's using VFIO need to use KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in userspace if they have a design where VFIO MMIO can infrequently generate faults. That is all VMM design stuff and has nothing to do with the kernel. My objection is this notion we should degrade a performance hot path in drivers to accomodate an ARM VMM issue that should be solved in the VMM. > Or you can stop whining and try to get better performance out of what > we have today. "better performance"!?!? You are telling me I have to destroy one of our important fast paths for HPC workloads to accommodate some theoretical ARM KVM problem? Jason
On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote: > > Just to be clear, that means we should drop this patch ("arm64/io: add > > memcpy_toio_64") for now, right? > > In its current form yes, but that doesn't mean that memcpy_toio_64() > cannot be reworked differently. I gave up on touching memcpy_toio_64(), it doesn't work very well because of the weak alignment Instead I followed your suggestion to fix __iowrite64_copy() There are only a couple of places that use this API: drivers/infiniband/hw/bnxt_re/qplib_rcfw.c: __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4); drivers/mtd/nand/raw/mxc_nand.c: __iowrite32_copy(trg, src, size / 4); drivers/net/ethernet/amazon/ena/ena_eth_com.c: __iowrite64_copy(io_sq->desc_addr.pbuf_dev_addr + dst_offset, drivers/net/ethernet/broadcom/bnxt/bnxt.c: __iowrite64_copy(db, tx_push_buf, 16); drivers/net/ethernet/broadcom/bnxt/bnxt.c: __iowrite32_copy(db + 4, tx_push_buf + 1, drivers/net/ethernet/broadcom/bnxt/bnxt.c: __iowrite64_copy(db, tx_push_buf, push_len); drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c: __iowrite32_copy(bp->bar0 + bar_offset, data, msg_len / 4); drivers/net/ethernet/hisilicon/hns3/hns3_enet.c: __iowrite64_copy(ring->tqp->mem_base, desc, drivers/net/ethernet/hisilicon/hns3/hns3_enet.c: __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET, drivers/net/ethernet/mellanox/mlx4/en_tx.c: __iowrite64_copy(dst, src, bytecnt / 8); drivers/net/ethernet/myricom/myri10ge/myri10ge.c:#define myri10ge_pio_copy(to,from,size) __iowrite64_copy(to,from,size/8) drivers/net/ethernet/sfc/tx.c: __iowrite64_copy(*piobuf, data, block_len >> 3); drivers/net/ethernet/sfc/tx.c: __iowrite64_copy(*piobuf, copy_buf->buf, drivers/net/ethernet/sfc/tx.c: __iowrite64_copy(piobuf, copy_buf->buf, drivers/net/ethernet/sfc/tx.c: __iowrite64_copy(tx_queue->piobuf, skb->data, drivers/net/wireless/mediatek/mt76/mmio.c: __iowrite32_copy(dev->mmio.regs + offset, data, DIV_ROUND_UP(len, 4)); drivers/net/wireless/ralink/rt2x00/rt2x00mmio.h: __iowrite32_copy(rt2x00dev->csr.base + offset, value, length >> 2); drivers/remoteproc/mtk_scp_ipi.c: __iowrite32_copy(dst + i, src + i, (len - i) / 4); drivers/rpmsg/qcom_glink_rpm.c: __iowrite32_copy(pipe->fifo + head, data, drivers/rpmsg/qcom_glink_rpm.c: __iowrite32_copy(pipe->fifo, data + len, drivers/rpmsg/qcom_smd.c: __iowrite32_copy(dst, src, count / sizeof(u32)); drivers/scsi/lpfc/lpfc_compat.h: __iowrite32_copy(dest, src, bytes / sizeof(uint32_t)); drivers/slimbus/qcom-ctrl.c: __iowrite32_copy(ctrl->base + tx_reg, buf, count); drivers/soc/qcom/qcom_aoss.c: __iowrite32_copy(qmp->msgram + qmp->offset + sizeof(u32), drivers/soc/qcom/spm.c: __iowrite32_copy(addr, drv->reg_data->seq, drivers/spi/spi-hisi-sfc-v3xx.c: __iowrite32_copy(to, from, words); sound/soc/intel/atom/sst/sst_loader.c: __iowrite32_copy(dst, src, count / 4); sound/soc/sof/iomem-utils.c: __iowrite32_copy(dest, src, m); At least the networking ones I recognize as performance paths, we don't want to degrade them. __iowrite64_copy() has a sufficient API that the compiler can inline the STP block as this patch did. I experimented with having memcpy_toio_64() invoke __iowrite64_copy(), but it did not look very nice. Maybe there is a possible performance win there, I don't know. > > > If eight STRs without other operations interleaved give us the > > > write-combining on most CPUs (with Normal NC), we should go with this > > > instead of STP. > > > > Agreed; I've sent out a patch to allow the offset addressing at: > > > > https://lore.kernel.org/linux-arm-kernel/20240124111259.874975-1-mark.rutland@arm.com/ > > > > ... and it should be possible to build atop that to use eight STRs. > > That's great, thanks. It is a nice patch but it does not really help this problem. The compiler cannot be trusted to use the new writeq() properly, eg clang doesn't optimize the new constraint at all. Regardless this has to be a fixed inline assembly block of either STR or STP. Jason
On Wed, 24 Jan 2024 13:06:38 +0000, Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Wed, Jan 24, 2024 at 08:26:28AM +0000, Marc Zyngier wrote: > > > > Even if you refuse to take STP to mainline it *will* be running in VMs > > > under ARM hypervisors. > > > > A hypervisor can't do anything with it. If you cared to read the > > architecture, you'd know by now. So your VM will be either dead, or > > dog slow, depending on your hypervisor. In any case, I'm sure it will > > reflect positively on your favourite software. > > "Dog slow" is fine. Forcing IO emulation on paths that shouldn't have > it is a VMM problem. KVM & qemu have some issues where this can happen > infrequently for VFIO MMIO maps. It is just important that it be > functionally correct if you get unlucky. The performance path is to > not take a fault in the first place. > > > > What exactly do you think should be done about that? > > > > Well, you could use KVM_CAP_ARM_NISV_TO_USER in userspace and see > > everything slow down. Your call. > > The issue Mark raised here was that things like STP/etc cannot work in > VMs, not that they are slow. > > The places we are talking about using the STP pattern are all high > performance HW drivers, that do not have any existing SW emulation to > worry about. ie the VMM will be using VFIO to back the MMIO the > acessors target. > > So, I'm fine if the answer is that VMM's using VFIO need to use > KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in > userspace if they have a design where VFIO MMIO can infrequently > generate faults. That is all VMM design stuff and has nothing to do > with the kernel. Which will work a treat with things like CCA, I'm sure. > > My objection is this notion we should degrade a performance hot path > in drivers to accomodate an ARM VMM issue that should be solved in the > VMM. > > > Or you can stop whining and try to get better performance out of what > > we have today. > > "better performance"!?!? You are telling me I have to destroy one of > our important fast paths for HPC workloads to accommodate some > theoretical ARM KVM problem? What I'm saying is that there are way to make it better without breaking your particular toy workload which, as important as it may be to *you*, doesn't cover everybody's use case. Mark did post such an example that has the potential of having that improvement. I'd suggest that you give it a go. But your attitude of "who cares if it breaks as long as it works for me" is not something I can adhere to. M.
On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote: > > So, I'm fine if the answer is that VMM's using VFIO need to use > > KVM_CAP_ARM_NISV_TO_USER and do instruction parsing for emulated IO in > > userspace if they have a design where VFIO MMIO can infrequently > > generate faults. That is all VMM design stuff and has nothing to do > > with the kernel. > > Which will work a treat with things like CCA, I'm sure. CCA shouldn't have emulation or trapping on the MMIO mappings. > > > Or you can stop whining and try to get better performance out of what > > > we have today. > > > > "better performance"!?!? You are telling me I have to destroy one of > > our important fast paths for HPC workloads to accommodate some > > theoretical ARM KVM problem? > > What I'm saying is that there are way to make it better without > breaking your particular toy workload which, as important as it may be > to *you*, doesn't cover everybody's use case. Please, do we need the "toy" stuff? The industry is spending 10's of billions of dollars right now to run "my workload". Currently not widely on ARM servers, but we are all hoping ARM can succeed here, right? I still don't know what you mean by "better". There are several issues now 1) This series, where WC doesn't trigger on new cores. Maybe 8x STR will fix it, but it is not better performance wise than 4x STP. 2) Userspace does ST4 to MMIO memory, and the VMM can't explode because of this. Replacing the ST4 with 8x STR is NOT better, that would be a big performance downside, especially for the quirky hi-silicon hardware. 3) The other series changing the S2 so that WC can work in the VM > Mark did post such an example that has the potential of having that > improvement. I'd suggest that you give it a go. Mark's patch doesn't help this, I've already written and evaluated his patch last week. Unfortunately it needs to be done with explicit inline assembly either STP or STR blocks. I don't know if the 8x STR is workable or not. I need to get someone to test it, but even if it is the userspace IO for this HW will continue to use ST4. So, regardless of the kernel decision, if someone is going to put this HW into a VM then their VMM needs to do *something* to ensure that the VMM does not malfunction when the VM issues STP/ST4 to the VFIO MMIO. There are good choices for the VMM here - ensuring it never has to process a S2 VFIO MMIO fault, always resuming and never emulating VFIO MMIO, or correctly handling an emulated S2 fault from a STP/ST4 instruction via instruction parsing. Therefore we can assume that working VMM's will exist. Indeed I would go farther and say that mlx5 HW in a VM must have a working VMM. So the question is only how pessimistic should the arch code for __iowrite64_copy() be. My view is that it is only used in a small number of drivers and if a VMM creates vPCI devices for those drivers then the VMM should be expected to bring proper vMMIO support too. I do not like this notion that all drivers using __iowrite64_copy() should have sub-optimal bare metal performance because a VMM *might* exist that has a problem. > But your attitude of "who cares if it breaks as long as it works for > me" is not something I can adhere to. In my world failing to reach performance is a "break" as well. So you have a server that is "broken" because its performance is degraded vs an unknown VMM that is "broken" because it wants to emulate IO (without implementing instruction parsing) for a device with a __iowrite64_copy() using driver. My server really does exist. I'm not so sure about the other case. Jason
On Wed, Jan 24, 2024 at 09:27:19AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote: > > > > Just to be clear, that means we should drop this patch ("arm64/io: add > > > memcpy_toio_64") for now, right? > > > > In its current form yes, but that doesn't mean that memcpy_toio_64() > > cannot be reworked differently. > > I gave up on touching memcpy_toio_64(), it doesn't work very well > because of the weak alignment > > Instead I followed your suggestion to fix __iowrite64_copy() I forgot the details. Was it to introduce an __iowrite512_copy() function or to simply use __iowrite64_copy() with a count of 8? > There are only a couple of places that use this API: [...] > __iowrite64_copy() has a sufficient API that the compiler can inline > the STP block as this patch did. > > I experimented with having memcpy_toio_64() invoke __iowrite64_copy(), > but it did not look very nice. Maybe there is a possible performance > win there, I don't know. Just invoking __iowrite64_copy() with a count of 8 wouldn't work well even if we have the writeq generating STR with an offset (well, it also increments the pointers, so I don't think Mark's optimisation would help). The copy implies loads and these would be interleaved with stores and potentially get in the way of write combining. An __iowrite512_copy() or maybe even an optimised __iowrite64_copy() for count 8 could do the loads first followed by the stores. You can use a special path in __iowrite64_copy() with __builtin_contant_p(). You can try with an arm64 specific __iowrite64_copy() and see how it goes (together with Mark's patch): void __iowrite64_copy(void __iomem *to, const void *from, size_t count) { u64 __iomem *dst = to; const u64 *src = from; const u64 *end = src + count; /* * Try a 64-byte write, the CPUs tend to write-combine them. */ if (__builtin_contant_p(count) && count == 8) { __raw_writeq(*src, dst); __raw_writeq(*(src + 1), dst + 1); __raw_writeq(*(src + 2), dst + 2); __raw_writeq(*(src + 3), dst + 3); __raw_writeq(*(src + 4), dst + 4); __raw_writeq(*(src + 5), dst + 5); __raw_writeq(*(src + 6), dst + 6); __raw_writeq(*(src + 7), dst + 7); return; } while (src < end) __raw_writeq(*src++, dst++); } EXPORT_SYMBOL_GPL(__iowrite64_copy); What we don't have is inlining of __iowrite64_copy() but if we need that we can move away from a weak symbol to a static inline. Give this a go and see if it you get write-combining in your hardware. If the loads interleaves with stores get in the way, maybe we can resort to inline asm.
On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote: > > What I'm saying is that there are way to make it better without > > breaking your particular toy workload which, as important as it may be > > to *you*, doesn't cover everybody's use case. > > Please, do we need the "toy" stuff? The industry is spending 10's of > billions of dollars right now to run "my workload". Currently not > widely on ARM servers, but we are all hoping ARM can succeed here, > right? > > I still don't know what you mean by "better". There are several issues > now > > 1) This series, where WC doesn't trigger on new cores. Maybe 8x STR > will fix it, but it is not better performance wise than 4x STP. It would be good to know. If the performance difference is significant, we can revisit. I'm not keen on using alternatives here without backing it up by numbers (do we even have a way to detect whether Linux is running natively or not? we may have to invent something). > 2) Userspace does ST4 to MMIO memory, and the VMM can't explode > because of this. Replacing the ST4 with 8x STR is NOT better, > that would be a big performance downside, especially for the > quirky hi-silicon hardware. I was hoping KVM injects an error into the guest rather than killing it but at a quick look I couldn't find it. The kvm_handle_guest_abort() -> io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions() only understands handled or not (like 1 or 0). Well, maybe I didn't look deep enough.
On Wed, Jan 24, 2024 at 05:22:05PM +0000, Catalin Marinas wrote: > On Wed, Jan 24, 2024 at 09:27:19AM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 24, 2024 at 12:40:29PM +0000, Catalin Marinas wrote: > > > > > > Just to be clear, that means we should drop this patch ("arm64/io: add > > > > memcpy_toio_64") for now, right? > > > > > > In its current form yes, but that doesn't mean that memcpy_toio_64() > > > cannot be reworked differently. > > > > I gave up on touching memcpy_toio_64(), it doesn't work very well > > because of the weak alignment > > > > Instead I followed your suggestion to fix __iowrite64_copy() > > I forgot the details. Was it to introduce an __iowrite512_copy() > function or to simply use __iowrite64_copy() with a count of 8? Count of 8 > Just invoking __iowrite64_copy() with a count of 8 wouldn't work well > even if we have the writeq generating STR with an offset (well, it also > increments the pointers, so I don't think Mark's optimisation would > help). The copy implies loads and these would be interleaved with stores > and potentially get in the way of write combining. An > __iowrite512_copy() or maybe even an optimised __iowrite64_copy() for > count 8 could do the loads first followed by the stores. You can use a > special path in __iowrite64_copy() with __builtin_contant_p(). I did exactly the latter like this: static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, const u64 *from, size_t count) { switch (count) { case 8: asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" "stp %x2, %x3, [%8, #16 * 1]\n" "stp %x4, %x5, [%8, #16 * 2]\n" "stp %x6, %x7, [%8, #16 * 3]\n" : : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]), "rZ"(from[6]), "rZ"(from[7]), "r"(to)); break; case 4: asm volatile("stp %x0, %x1, [%4, #16 * 0]\n" "stp %x2, %x3, [%4, #16 * 1]\n" : : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), "rZ"(from[3]), "r"(to)); break; case 2: asm volatile("stp %x0, %x1, [%2, #16 * 0]\n" : : "rZ"(from[0]), "rZ"(from[1]), "r"(to)); break; case 1: __raw_writeq(*from, to); break; default: BUILD_BUG(); } } void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count); static inline void __const_iowrite64_copy(void __iomem *to, const void *from, size_t count) { if (count == 8 || count == 4 || count == 2 || count == 1) { __const_memcpy_toio_aligned64(to, from, count); dgh(); } else { __iowrite64_copy_full(to, from, count); } } #define __iowrite64_copy(to, from, count) \ (__builtin_constant_p(count) ? \ __const_iowrite64_copy(to, from, count) : \ __iowrite64_copy_full(to, from, count)) And the out of line __iowrite64_copy_full() generates good assembly that loops 8/4/2/1 sized blocks. I was going to send it out yesterday but am waiting for some conclusion on the STP. https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > void __iowrite64_copy(void __iomem *to, const void *from, > size_t count) > { > u64 __iomem *dst = to; > const u64 *src = from; > const u64 *end = src + count; > > /* > * Try a 64-byte write, the CPUs tend to write-combine them. > */ > if (__builtin_contant_p(count) && count == 8) { > __raw_writeq(*src, dst); > __raw_writeq(*(src + 1), dst + 1); > __raw_writeq(*(src + 2), dst + 2); > __raw_writeq(*(src + 3), dst + 3); > __raw_writeq(*(src + 4), dst + 4); > __raw_writeq(*(src + 5), dst + 5); > __raw_writeq(*(src + 6), dst + 6); > __raw_writeq(*(src + 7), dst + 7); > return; > } I already looked at this, clang with the "Qo" constraint does: ffffffc08086e6ec: f9400029 ldr x9, [x1] ffffffc08086e6f0: 91002008 add x8, x0, #0x8 ffffffc08086e6f4: f9000009 str x9, [x0] ffffffc08086e6f8: f9400429 ldr x9, [x1, #8] ffffffc08086e6fc: f9000109 str x9, [x8] ffffffc08086e700: 91004008 add x8, x0, #0x10 ffffffc08086e704: f9400829 ldr x9, [x1, #16] ffffffc08086e708: f9000109 str x9, [x8] ffffffc08086e70c: 91006008 add x8, x0, #0x18 ffffffc08086e710: f9400c29 ldr x9, [x1, #24] ffffffc08086e714: f9000109 str x9, [x8] ffffffc08086e718: 91008008 add x8, x0, #0x20 ffffffc08086e71c: f9401029 ldr x9, [x1, #32] ffffffc08086e720: f9000109 str x9, [x8] ffffffc08086e724: 9100a008 add x8, x0, #0x28 ffffffc08086e728: f9401429 ldr x9, [x1, #40] ffffffc08086e72c: f9000109 str x9, [x8] ffffffc08086e730: 9100c008 add x8, x0, #0x30 ffffffc08086e734: f9401829 ldr x9, [x1, #48] ffffffc08086e738: f9000109 str x9, [x8] ffffffc08086e73c: f9401c28 ldr x8, [x1, #56] ffffffc08086e740: 9100e009 add x9, x0, #0x38 ffffffc08086e744: f9000128 str x8, [x9] Which is not good. Gcc is a better, but not perfect. > What we don't have is inlining of __iowrite64_copy() but if we need that > we can move away from a weak symbol to a static inline. Yes I did this as well. It helps s390 and x86 nicely too. > Give this a go and see if it you get write-combining in your hardware. > If the loads interleaves with stores get in the way, maybe we can resort > to inline asm. For reference the actual assembly (see post_send_nop()) that fails is: 13534: d503201f nop 13538: 93407ea1 sxtw x1, w21 1353c: f100403f cmp x1, #0x10 13540: 54000488 b.hi 135d0 <post_send_nop.isra.0+0x260> // b.pmore 13544: a9408a63 ldp x3, x2, [x19, #8] 13548: f84086c4 ldr x4, [x22], #8 1354c: f9400042 ldr x2, [x2] 13550: 8b030283 add x3, x20, x3 13554: 8b030042 add x2, x2, x3 13558: f9000044 str x4, [x2] 1355c: 91002294 add x20, x20, #0x8 13560: 11000ab5 add w21, w21, #0x2 13564: f101029f cmp x20, #0x40 13568: 54fffe81 b.ne 13538 <post_send_nop.isra.0+0x1c8> // b.any 1356c: d50320df hint #0x6 Not very good code the compiler wrote (the main issue is that it reloads the dest pointer every iteration), but still, all those loads are coming from memory that was recently touched so should be in-cache most of the time. So it isn't like we are sitting around waiting for a lengthy dcache fill and timing out the WC buffer. However, it is 136 instructions, so it feels like the issue may be the write combining buffer auto-flushes in less. Maybe it auto-flushes after 128/64/32/16/8 cycles now. I know there has been a tension to reduce WC latency vs maximum aggregation. The suggestion that it should not have any interleaving instructions and use STP came from our CPU architecture team. The assembly I have been able to get tested from this series that did works is this: ffffffc08086ec84: d5033e9f dsb st ffffffc08086ec88: f941de6b ldr x11, [x19, #952] ffffffc08086ec8c: f941da6c ldr x12, [x19, #944] ffffffc08086ec90: f940016b ldr x11, [x11] ffffffc08086ec94: 8b0c016b add x11, x11, x12 ffffffc08086ec98: a9002969 stp x9, x10, [x11] ffffffc08086ec9c: a9012168 stp x8, x8, [x11, #16] ffffffc08086eca0: a9022168 stp x8, x8, [x11, #32] ffffffc08086eca4: a9032168 stp x8, x8, [x11, #48] ffffffc08086eca8: d50320df hint #0x6 The revised __iowrite64_copy() version also creates this assembly. The ST4 based thing in userspace also works. Remember there are two related topics here.. mlx5 would like high frequency of large TLP generation, but doesn't care about raw performance. If the 24 instructions clang generates does that then great. hns/broadcom/others need the large TLP and care about performance. In that case the stp block is the best we can do in the kernel as st4 is off the table. I would like the architecture code to do a good job for performance since it is a generic API for all drivers. Regarding the 8x STR option, I have to get that tested. Jason
On Wed, Jan 24, 2024 at 05:54:49PM +0000, Catalin Marinas wrote: > On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 24, 2024 at 01:32:22PM +0000, Marc Zyngier wrote: > > > What I'm saying is that there are way to make it better without > > > breaking your particular toy workload which, as important as it may be > > > to *you*, doesn't cover everybody's use case. > > > > Please, do we need the "toy" stuff? The industry is spending 10's of > > billions of dollars right now to run "my workload". Currently not > > widely on ARM servers, but we are all hoping ARM can succeed here, > > right? > > > > I still don't know what you mean by "better". There are several issues > > now > > > > 1) This series, where WC doesn't trigger on new cores. Maybe 8x STR > > will fix it, but it is not better performance wise than 4x STP. > > It would be good to know. If the performance difference is significant, > we can revisit. I'm not keen on using alternatives here without backing > it up by numbers (do we even have a way to detect whether Linux is > running natively or not? we may have to invent something). I don't have a setup to measure performance, mlx5 is not using it in a performance path. The other drivers in the tree are. I feel bad about hobbling them. > > 2) Userspace does ST4 to MMIO memory, and the VMM can't explode > > because of this. Replacing the ST4 with 8x STR is NOT better, > > that would be a big performance downside, especially for the > > quirky hi-silicon hardware. > > I was hoping KVM injects an error into the guest rather than killing it > but at a quick look I couldn't find it. The kvm_handle_guest_abort() -> > io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions() > only understands handled or not (like 1 or 0). Well, maybe I didn't look > deep enough. It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and then when it gets a NISV it always converts it to a data abort to the guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a correctness issue, not a 'VM userspace can crash the VMM' security problem. The reason we've never seen this fault in any of our testing is because the whole system is designed to have qemu back vMMIO space that is under hot path use by only a VFIO memslot. ie it never drops the memslot and forces emulation. (KVM has no issue to handle a S2 abort if a memslot is present, obviously) VFIO IO emulation is used to cover corner cases and establish a slow technical correctness. It is not fast path. Avoid this if you want any sort of performance. Thus, IMHO, doing IO emulation for VFIO that doesn't support all the instructions actual existing SW uses to do IO is hard to justify. We are already on a slow path that only exists for technical correctness, it should be perfect. It is perfect on x86 because x86 KVM does SW instruction decode and emulation. ARM could too, but doesn't. To put it in a practical example, I predict that if someone steps outside our "engineered" box and runs a 64k page size hypervisor kernel with a mlx5 device that is not engineered for 64K page size they will get a MMIO BAR layout where the 64k page that covers the MSI items will overlap with hot path addresses. The existing user space stack could issue ST4's to hot path addresses within that emulated 64k of vMMIO and explode. 4k page size hypervisors avoid this because the typical mlx5 device has a BAR layout with a 4k granule in mind. Jason
On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote: > The suggestion that it should not have any interleaving instructions > and use STP came from our CPU architecture team. I got some more details here. They point to the ARM publication about write combining https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf specifically to the example code using 4x 128 bit NEON stores. They point at the actual CPU design and say it is optimized for 128 bit stores (STP and ST4 included, it seems). 64 bit stores trigger some different behavior. I have no way to know if it will be OK for other drivers that expect this to be a performance path in the kernel. Are you *sure* you want to do this str version? If it works for mlx5 I will send the patch and the other companies can come later with performance data. Jason
On Thu, Jan 25, 2024 at 01:43:33PM -0400, Jason Gunthorpe wrote: > On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote: > > The suggestion that it should not have any interleaving instructions > > and use STP came from our CPU architecture team. > > I got some more details here. > > They point to the ARM publication about write combining > > https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf > > specifically to the example code using 4x 128 bit NEON stores. That's an example but this document doesn't make any statements about 64-bit writes. > They point at the actual CPU design and say it is optimized for 128 > bit stores (STP and ST4 included, it seems). > > 64 bit stores trigger some different behavior. This is highly microarchitecture specific. The best bet in the future is the ST64B instruction but in the meantime it's pretty much guessing. > I have no way to know if it will be OK for other drivers that expect > this to be a performance path in the kernel. > > Are you *sure* you want to do this str version? If it works for mlx5 I > will send the patch and the other companies can come later with > performance data. Yeah, I'd stick to the STR for now, it makes things simpler as we don't have to care about what emulation does.
On Fri, Jan 26, 2024 at 02:56:31PM +0000, Catalin Marinas wrote: > On Thu, Jan 25, 2024 at 01:43:33PM -0400, Jason Gunthorpe wrote: > > On Wed, Jan 24, 2024 at 03:26:34PM -0400, Jason Gunthorpe wrote: > > > The suggestion that it should not have any interleaving instructions > > > and use STP came from our CPU architecture team. > > > > I got some more details here. > > > > They point to the ARM publication about write combining > > > > https://community.arm.com/cfs-file/__key/telligent-evolution-components-attachments/13-150-00-00-00-00-10-12/Understanding_5F00_Write_5F00_Combining_5F00_on_5F00_Arm_5F00_V.1.0.pdf > > > > specifically to the example code using 4x 128 bit NEON stores. > > That's an example but this document doesn't make any statements about > 64-bit writes. ARM has consistently left this area as informally specified by documents like this. This document arose specifically because a certain implementation choose an architecturally complaint way to do write combining but it was informally decided that Linux upstream would not support it. These gaps were discovered during DOE's path finding Astra supercomputer program about 6 years ago during testing with mlx5 devices. The document was specifically intended to guide HPC implementations expecting to run inside the Linux ecosystem. Based on all this I'm not surprised that the ecosystem has decided to focus primarily on consecutive 128 bit writes, absent any other guidance designers are following what information they have. Jason
On Wed, Jan 24, 2024 at 09:29:24PM -0400, Jason Gunthorpe wrote: > On Wed, Jan 24, 2024 at 05:54:49PM +0000, Catalin Marinas wrote: > > On Wed, Jan 24, 2024 at 11:52:25AM -0400, Jason Gunthorpe wrote: > > > 2) Userspace does ST4 to MMIO memory, and the VMM can't explode > > > because of this. Replacing the ST4 with 8x STR is NOT better, > > > that would be a big performance downside, especially for the > > > quirky hi-silicon hardware. > > > > I was hoping KVM injects an error into the guest rather than killing it > > but at a quick look I couldn't find it. The kvm_handle_guest_abort() -> > > io_mem_abort() ends up returning -ENOSYS while handle_trap_exceptions() > > only understands handled or not (like 1 or 0). Well, maybe I didn't look > > deep enough. > > It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and > then when it gets a NISV it always converts it to a data abort to the > guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a > correctness issue, not a 'VM userspace can crash the VMM' security > problem. The VMM wasn't my concern but rather a guest getting killed or not functioning correctly (user app killed). > Thus, IMHO, doing IO emulation for VFIO that doesn't support all the > instructions actual existing SW uses to do IO is hard to justify. We > are already on a slow path that only exists for technical correctness, > it should be perfect. It is perfect on x86 because x86 KVM does SW > instruction decode and emulation. ARM could too, but doesn't. It could fall back to instruction decode, either in KVM or the VMM (strong preference for the latter), but I'd only do this if it's justified. I don't think the issue here is VFIO, I doubt we'd ever see emulation for hardware like mlx5. But we are changing generic kernel functions like memcpy_toio/__iowrite64_copy() that end up being used in other drivers (e.g. USB, UART) for emulated devices. If we can keep these functions as generic as possible for both guest and native runs, that's great. If the performance difference is significant, we can revisit.
On Fri, Jan 26, 2024 at 04:15:16PM +0000, Catalin Marinas wrote: > > It looks to me like qemu turns on the KVM_CAP_ARM_NISV_TO_USER and > > then when it gets a NISV it always converts it to a data abort to the > > guest. See kvm_arm_handle_dabt_nisv() in qemu. So it is just a > > correctness issue, not a 'VM userspace can crash the VMM' security > > problem. > > The VMM wasn't my concern but rather a guest getting killed or not > functioning correctly (user app killed). Right, hopefully it is the latter. > > Thus, IMHO, doing IO emulation for VFIO that doesn't support all the > > instructions actual existing SW uses to do IO is hard to justify. We > > are already on a slow path that only exists for technical correctness, > > it should be perfect. It is perfect on x86 because x86 KVM does SW > > instruction decode and emulation. ARM could too, but doesn't. > > It could fall back to instruction decode, either in KVM or the VMM > (strong preference for the latter), but I'd only do this if it's > justified. From a performance perspective, if the VMM is doing pure emulation and wants to memcpy lots of data to emulated vMMIO I'd look at it like this: 1xST4 transfers 512 bits and requires one vmexit and one instruction parse. 4xSTP is four instruction parses and four vmexits 8xSTR is no instruction parses and eight vmexits The instruction parse is not pure overhead, it saves on vmexit's which are expensive things (at least on x86). I don't have a sense how this stacks up on arm, but I wouldn't jump to it being horribly non-performing. > I don't think the issue here is VFIO, I doubt we'd ever see emulation > for hardware like mlx5. Sadly no :( It can happen in non-production corner cases due to the VFIO MSI emulation. There is a qemu bug prior to 8.something that causes it to happen at random, with VFIO, rarely. There is a non-prodcution debug mode in qemu where all VFIO MMIO is trapped. The qemu expectation is that this is functionally identical to non-trapping. (The E in qemu is emulation after all, kind of a core reason it exists) Finally, we do actually have an internal simulation tool that does software emulate mlx5 HW without VFIO. > But we are changing generic kernel functions > like memcpy_toio/__iowrite64_copy() that end up being used in other > drivers (e.g. USB, UART) for emulated devices. I didn't touch memcpy_toio, I think given this problem we shouldn't touch it. I only touched __iowriteXX_copy() which did not look like it is being used in any drivers with emulation. Even if I got this wrong we can revert any impacted drivers to use memcpy_toio() instead. Jason
On Thu, 2024-01-18 at 16:59 +0100, Niklas Schnelle wrote: > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > > > in bytes like for memcpy_toio(). > > > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > > > > Jason > > > > Thanks, the s390 patches: > > > s390: Implement __iowrite32_copy() > s390: Use the correct count for __iowrite64_copy() > s390: Stop using weak symbols for __iowrite64_copy() > > Look good to me. I.e. you may add my. > > Acked-by Niklas Schnelle <schnelle@linux.ibm.com> > > I did test your patches too and by accident confirmed again that these > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 > driver to detect Write-Combining as supported. Note, as far as I know > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite > a few changes for v6.8-rc1. > > Thanks, > Niklas > Hi Jason, Since I haven't seen a new version of this yet I was just wondering what the current plan is. If this is somewhat stuck would you be willing to send the "s390: Use the correct count for __iowrite64_copy()" patch separately, preferably with "s390/pci:" prefix? Thanks, Niklas
On Fri, Feb 16, 2024 at 01:09:10PM +0100, Niklas Schnelle wrote: > On Thu, 2024-01-18 at 16:59 +0100, Niklas Schnelle wrote: > > On Thu, 2024-01-18 at 10:00 -0400, Jason Gunthorpe wrote: > > > On Thu, Jan 18, 2024 at 02:46:40PM +0100, Niklas Schnelle wrote: > > > > > > > Yes, we need zpci_memcpy_toio(to, from, count * 8) since our count is > > > > in bytes like for memcpy_toio(). > > > > > > https://github.com/jgunthorpe/linux/commits/mlx5_wc/ > > > > > > Jason > > > > > > > Thanks, the s390 patches: > > > > > > s390: Implement __iowrite32_copy() > > s390: Use the correct count for __iowrite64_copy() > > s390: Stop using weak symbols for __iowrite64_copy() > > > > Look good to me. I.e. you may add my. > > > > Acked-by Niklas Schnelle <schnelle@linux.ibm.com> > > > > I did test your patches too and by accident confirmed again that these > > do need commit 80df7d6af7f6 ("s390/pci: fix max size calculation in > > zpci_memcpy_toio()") from the s390 feature branch to get the mlx5 > > driver to detect Write-Combining as supported. Note, as far as I know > > Alexander Gordeev is targeting that one for v6.8-rc2 since we had quite > > a few changes for v6.8-rc1. > > Since I haven't seen a new version of this yet I was just wondering > what the current plan is. If this is somewhat stuck would you be > willing to send the "s390: Use the correct count for > __iowrite64_copy()" patch separately, preferably with "s390/pci:" > prefix? I have been waiting for test cycles on the right ARM64 system and it is taking a long time :( I'll send your bit so you can take it to -rc Thanks, Jason
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 3b694511b98f..73ab91913790 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -135,6 +135,26 @@ extern void __memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) +static inline void __memcpy_toio_64(volatile void __iomem *to, const void *from) +{ + const u64 *from64 = from; + + /* + * Newer ARM core have sensitive write combining buffers, it is + * important that the stores be contiguous blocks of store instructions. + * Normal memcpy does not work reliably. + */ + asm volatile("stp %x0, %x1, [%8, #16 * 0]\n" + "stp %x2, %x3, [%8, #16 * 1]\n" + "stp %x4, %x5, [%8, #16 * 2]\n" + "stp %x6, %x7, [%8, #16 * 3]\n" + : + : "rZ"(from64[0]), "rZ"(from64[1]), "rZ"(from64[2]), + "rZ"(from64[3]), "rZ"(from64[4]), "rZ"(from64[5]), + "rZ"(from64[6]), "rZ"(from64[7]), "r"(to)); +} +#define memcpy_toio_64(to, from) __memcpy_toio_64(to, from) + /* * I/O memory mapping functions. */ diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index bac63e874c7b..2d6d60ed2128 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1202,6 +1202,36 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer, } #endif +#ifndef memcpy_toio_64 +#define memcpy_toio_64 memcpy_toio_64 +/** + * memcpy_toio_64 Copy 64 bytes of data into I/O memory + * @dst: The (I/O memory) destination for the copy + * @src: The (RAM) source for the data + * @count: The number of bytes to copy + * + * dst and src must be aligned to 8 bytes. This operation copies exactly 64 + * bytes. It is intended to be used for write combining IO memory. The + * architecture should provide an implementation that has a high chance of + * generating a single combined transaction. + */ +static inline void memcpy_toio_64(volatile void __iomem *addr, + const void *buffer) +{ + unsigned int i = 0; + +#if BITS_PER_LONG == 64 + for (; i != 8; i++) + __raw_writeq(((const u64 *)buffer)[i], + ((u64 __iomem *)addr) + i); +#else + for (; i != 16; i++) + __raw_writel(((const u32 *)buffer)[i], + ((u32 __iomem *)addr) + i); +#endif +} +#endif + extern int devmem_is_allowed(unsigned long pfn); #endif /* __KERNEL__ */