Message ID | 20230905-optimize_checksum-v2-1-ccd658db743b@rivosinc.com (mailing list archive)
State      | Superseded
Series     | riscv: Add fine-tuned checksum functions
Hey Charlie,

Just a passing thought that I can't really test since I am meant to be
on holidays...

On Tue, Sep 05, 2023 at 09:46:50PM -0700, Charlie Jenkins wrote:
> Provide checksum algorithms that have been designed to leverage riscv
> instructions such as rotate. In 64-bit, can take advantage of the larger
> register to avoid some overflow checking.
>
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> ---
>  arch/riscv/include/asm/checksum.h | 96 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 96 insertions(+)
>
> diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
> new file mode 100644
> index 000000000000..573714b9ea15
> --- /dev/null
> +++ b/arch/riscv/include/asm/checksum.h
[...]
> +	do {
> +		csum += ((const unsigned int *)iph)[pos];
> +#ifdef CONFIG_32BIT
> +		csum += csum < ((const unsigned int *)iph)[pos];
> +#endif // !CONFIG_32BIT

Some of this ifdeffery really should become IS_ENABLED(), there's
nothing in some of them that can't just get removed by the compiler.

> +	} while (++pos < ihl);
> +
> +#ifdef CONFIG_RISCV_ISA_ZBB

This here I can't test since I'm supposed to be AFK, but can this also
be an IS_ENABLED()? I know it is guarding code that the toolchain may
not support, but does it get removed before that matters?

I mainly ask because there's a host of ifdeffery here & the code would
be a lot easier to understand if we could cut it down to a minimum.

> +	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
> +		csum_t fold_temp;
> +
> +		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
> +					      RISCV_ISA_EXT_ZBB, 1)
> +				  :
> +				  :
> +				  :
> +				  : no_zbb);
> +#ifdef CONFIG_32BIT
> +		asm(".option push				\n\
> +		.option arch,+zbb				\n\
> +			rori	%[fold_temp],%[csum],16		\n\
> +			add	%[csum],%[fold_temp],%[csum]	\n\
> +		.option pop"
> +		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
> +#else // !CONFIG_32BIT
> +		asm(".option push				\n\
> +		.option arch,+zbb				\n\
> +			rori	%[fold_temp], %[csum], 32	\n\
> +			add	%[csum], %[fold_temp], %[csum]	\n\
> +			srli	%[csum], %[csum], 32		\n\
> +			roriw	%[fold_temp], %[csum], 16	\n\
> +			addw	%[csum], %[fold_temp], %[csum]	\n\
> +		.option pop"
> +		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
> +#endif // !CONFIG_32BIT

These 3 I think are pretty easy to follow immediately, but...

> +		return ~(csum >> 16);
> +	}
> +	/*
> +	 * ZBB only saves three instructions on 32-bit and five on 64-bit so not
> +	 * worth checking if supported without Alternatives.
> +	 */
> +no_zbb:
> +#endif // CONFIG_RISCV_ISA_ZBB
> +#ifdef CONFIG_32BIT
> +#else // !CONFIG_32BIT
> +	csum += (csum >> 32) | (csum << 32);
> +	csum >>= 16;
> +#endif // !CONFIG_32BIT

...these ones here could be converted too, afaict.

Thanks,
Conor.

> +	return csum_fold((__force __wsum)csum);
> +}
> +
> +#define ip_fast_csum ip_fast_csum
> +
> +#include <asm-generic/checksum.h>
> +
> +#endif // __ASM_RISCV_CHECKSUM_H
>
> --
> 2.42.0
>
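To make the suggestion concrete, the summation loop Conor is pointing at could drop its preprocessor guard along roughly these lines. This is only a sketch of the idea, not code from any posted revision; it assumes the compiler discards the dead comparison when CONFIG_32BIT is not set:

	do {
		csum += ((const unsigned int *)iph)[pos];
		/* 32-bit only: re-add the carry lost by the 32-bit accumulator;
		 * on 64-bit builds this branch is dead code and folds away. */
		if (IS_ENABLED(CONFIG_32BIT))
			csum += csum < ((const unsigned int *)iph)[pos];
	} while (++pos < ihl);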
On Thu, Sep 07, 2023 at 10:40:54AM +0100, Conor Dooley wrote:
> Hey Charlie,
>
> Just a passing thought that I can't really test since I am meant to be
> on holidays...
>
[...]
> Some of this ifdeffery really should become IS_ENABLED(), there's
> nothing in some of them that can't just get removed by the compiler.
>
[...]
> ...these ones here could be converted too, afaict.
>
> Thanks,
> Conor.

That should make it look much cleaner, I will switch over to IS_ENABLED.

- Charlie
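As a further illustration of the same conversion, the 64-bit-only fold at the bottom of ip_fast_csum() might end up shaped something like the fragment below. Again this is only a sketch: the widening to u64 is an assumption made here so the dead branch still compiles cleanly (and without out-of-range shifts) on 32-bit configurations, and the eventual patch may well pick a different shape:

	/* 64-bit only: fold the upper 32 bits down before csum_fold();
	 * the whole block is dead code on 32-bit and gets eliminated. */
	if (!IS_ENABLED(CONFIG_32BIT)) {
		u64 wide = csum;

		wide += (wide >> 32) | (wide << 32);
		csum = wide >> 16;
	}

	return csum_fold((__force __wsum)csum);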
...
> > > +/*
> > > + * Fold a partial checksum without adding pseudo headers
> > > + */
> > > +static inline __sum16 csum_fold(__wsum sum)
> > > +{
> > > +	sum += (sum >> 16) | (sum << 16);
> > > +	return (__force __sum16)(~(sum >> 16));
> > > +}

I'm intrigued, gcc normally compiles that quite well.
The very similar (from arch/arc):
	return (~sum - rol32(sum, 16)) >> 16;
is slightly better on most architectures.
(Especially if the ~sum and rol() can be executed together.)

The only odd archs I saw were sparc32 (carry flag but no rotate)
and arm (barrel shifter on all instructions).

It is better than the current asm for a lot of archs including x64.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
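For readers following along, the fold David describes would look roughly like this if dropped into the posted header. This is a sketch modelled on the arch/arc approach he cites, not verbatim code from either tree; it uses the kernel's rol32() helper from <linux/bitops.h>:

#include <linux/bitops.h>	/* rol32() */
#include <linux/types.h>

static inline __sum16 csum_fold(__wsum sum)
{
	u32 s = (__force u32)sum;

	/* Complement first, subtract the rotated copy, then let the final
	 * shift both truncate to 16 bits and position the folded sum. */
	return (__force __sum16)((~s - rol32(s, 16)) >> 16);
}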
On Sun, Sep 10, 2023 at 09:20:33PM +0000, David Laight wrote:
> ...
> > > > +/*
> > > > + * Fold a partial checksum without adding pseudo headers
> > > > + */
> > > > +static inline __sum16 csum_fold(__wsum sum)
> > > > +{
> > > > +	sum += (sum >> 16) | (sum << 16);
> > > > +	return (__force __sum16)(~(sum >> 16));
> > > > +}
>
> I'm intrigued, gcc normally compiles that quite well.
> The very similar (from arch/arc):
> 	return (~sum - rol32(sum, 16)) >> 16;
> is slightly better on most architectures.
> (Especially if the ~sum and rol() can be executed together.)

Oh wow that solves the problem of needing to zero extend the result after
taking the not. Taking the shift after the not both zero-extends and places
the result in the correct spot. It is a very interesting property that it
is possible to change the order of the operations if the addition becomes
a subtraction. I will make the change. Thank you!

- Charlie

> The only odd archs I saw were sparc32 (carry flag but no rotate)
> and arm (barrel shifter on all instructions).
>
> It is better than the current asm for a lot of archs including x64.
>
> 	David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>
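The equivalence Charlie is describing is easy to convince oneself of with a small, stand-alone user-space check. The harness below is purely hypothetical (it is not part of the patch and the function names are made up for illustration); it compares the rotate-add fold from the posted header against the complement-subtract variant on a handful of values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Fold as in the posted patch: rotate-add, then complement the high half. */
static uint16_t fold_add(uint32_t sum)
{
	sum += rol32(sum, 16);
	return (uint16_t)~(sum >> 16);
}

/* Fold as in David's suggestion: complement, subtract, then shift down. */
static uint16_t fold_sub(uint32_t sum)
{
	return (uint16_t)((~sum - rol32(sum, 16)) >> 16);
}

int main(void)
{
	uint32_t samples[] = { 0, 1, 0xffff, 0x10000, 0x0001ffff,
			       0xdeadbeef, 0xffff0000, 0xffffffff };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		assert(fold_add(samples[i]) == fold_sub(samples[i]));

	puts("both folds agree on the sampled values");
	return 0;
}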
diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
new file mode 100644
index 000000000000..573714b9ea15
--- /dev/null
+++ b/arch/riscv/include/asm/checksum.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * IP checksum routines
+ *
+ * Copyright (C) 2023 Rivos Inc.
+ */
+#ifndef __ASM_RISCV_CHECKSUM_H
+#define __ASM_RISCV_CHECKSUM_H
+
+#include <linux/in6.h>
+#include <linux/uaccess.h>
+
+#ifdef CONFIG_32BIT
+typedef unsigned int csum_t;
+#else
+typedef unsigned long csum_t;
+#endif
+
+/*
+ * Fold a partial checksum without adding pseudo headers
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	sum += (sum >> 16) | (sum << 16);
+	return (__force __sum16)(~(sum >> 16));
+}
+
+#define csum_fold csum_fold
+
+/*
+ * Quickly compute an IP checksum with the assumption that IPv4 headers will
+ * always be in multiples of 32-bits, and have an ihl of at least 5.
+ * @ihl is the number of 32 bit segments and must be greater than or equal to 5.
+ * @iph is assumed to be word aligned.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	csum_t csum = 0;
+	int pos = 0;
+
+	do {
+		csum += ((const unsigned int *)iph)[pos];
+#ifdef CONFIG_32BIT
+		csum += csum < ((const unsigned int *)iph)[pos];
+#endif // !CONFIG_32BIT
+	} while (++pos < ihl);
+
+#ifdef CONFIG_RISCV_ISA_ZBB
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		csum_t fold_temp;
+
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+				  :
+				  :
+				  :
+				  : no_zbb);
+#ifdef CONFIG_32BIT
+		asm(".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp],%[csum],16		\n\
+			add	%[csum],%[fold_temp],%[csum]	\n\
+		.option pop"
+		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+#else // !CONFIG_32BIT
+		asm(".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 32	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			srli	%[csum], %[csum], 32		\n\
+			roriw	%[fold_temp], %[csum], 16	\n\
+			addw	%[csum], %[fold_temp], %[csum]	\n\
+		.option pop"
+		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+#endif // !CONFIG_32BIT
+		return ~(csum >> 16);
+	}
+	/*
+	 * ZBB only saves three instructions on 32-bit and five on 64-bit so not
+	 * worth checking if supported without Alternatives.
+	 */
+no_zbb:
+#endif // CONFIG_RISCV_ISA_ZBB
+#ifdef CONFIG_32BIT
+#else // !CONFIG_32BIT
+	csum += (csum >> 32) | (csum << 32);
+	csum >>= 16;
+#endif // !CONFIG_32BIT
+	return csum_fold((__force __wsum)csum);
+}
+
+#define ip_fast_csum ip_fast_csum
+
+#include <asm-generic/checksum.h>
+
+#endif // __ASM_RISCV_CHECKSUM_H
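For reference, ip_fast_csum() is the helper the IPv4 stack calls when (re)writing a header checksum, which illustrates the iph/ihl contract the comment above describes. The caller below is paraphrased from the networking core's ip_send_check(); the example_ name and stand-alone framing are only for illustration here:

#include <linux/ip.h>		/* struct iphdr */
#include <asm/checksum.h>	/* ip_fast_csum() */

/* Recompute the checksum of a well-formed IPv4 header (ihl >= 5 words). */
static void example_ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}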
Provide checksum algorithms that have been designed to leverage riscv
instructions such as rotate. In 64-bit, can take advantage of the larger
register to avoid some overflow checking.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
---
 arch/riscv/include/asm/checksum.h | 96 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)