@@ -625,6 +625,29 @@ config RISCV_ISA_ZBB
If you don't know what to do here, say Y.
+config TOOLCHAIN_HAS_ZBC
+ bool
+ default y
+ depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbc)
+ depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbc)
+ depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
+ depends on AS_HAS_OPTION_ARCH
+
+config RISCV_ISA_ZBC
+ bool "Zbc extension support for carry-less multiplication instructions"
+ depends on TOOLCHAIN_HAS_ZBC
+ depends on MMU
+ depends on RISCV_ALTERNATIVE
+ default y
+ help
+ Adds support to dynamically detect the presence of the Zbc
+ extension (carry-less multiplication) and enable its usage.
+
+ The Zbc extension can be used to accelerate CRC (cyclic redundancy
+ check) calculations.
+
+ If you don't know what to do here, say Y.
+
config RISCV_ISA_ZICBOM
bool "Zicbom extension support for non-coherent DMA operation"
depends on MMU
@@ -13,6 +13,7 @@ endif
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
+lib-$(CONFIG_RISCV_ISA_ZBC) += crc32.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RISCV_ISA_V) += xor.o
new file mode 100644
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32 implementation with Zbc extension.
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+#include <asm/byteorder.h>
+
+#include <linux/types.h>
+#include <linux/minmax.h>
+#include <linux/crc32poly.h>
+#include <linux/crc32.h>
+#include <linux/byteorder/generic.h>
+
+/*
+ * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
+ * a better understanding of how this math works.
+ *
+ * let "+" denote polynomial add (XOR)
+ * let "-" denote polynomial sub (XOR)
+ * let "*" denote polynomial multiplication
+ * let "/" denote polynomial floor division
+ * let "S" denote the source data, XLEN bits wide
+ * let "P" denote the CRC32 polynomial
+ * let "T" denote 2^(XLEN+32)
+ * let "QT" denote the quotient of T/P, with the bit for 2^XLEN being implicit
+ *
+ * crc32(S, P)
+ * => S * (2^32) - S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) / P * P
+ * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
+ * => lowest 32 bits of: S * (2^32) * quotient / T * P
+ * => lowest 32 bits of: S * quotient / 2^XLEN * P
+ * => lowest 32 bits of: S * (2^XLEN + QT) / 2^XLEN * P
+ * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
+ * => clmul_low_part(clmul_high_part(S, QT) + S, P)
+ *
+ * In terms of the implementations below, the BE case is the more intuitive
+ * one, since the higher-order polynomial bits sit at the more significant
+ * bit positions.
+ */
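+
+/*
+ * Mapping this onto the Zbc helpers below (BE case): clmulh(S, QT) computes
+ * S * QT / 2^XLEN, and XORing S back in accounts for QT's implicit 2^XLEN
+ * bit, giving the Barrett quotient. Because the lowest 32 bits of S * (2^32)
+ * are zero, the lowest 32 bits of quotient * P already equal the remainder,
+ * so one final clmul by P followed by truncation to 32 bits yields the CRC.
+ * The LE helpers perform the same steps on bit-reflected values.
+ */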
+
+#if __riscv_xlen == 64
+/* Slide by XLEN bits per iteration */
+# define STEP_ORDER 3
+
+/* Each polynomial quotient below has an implicit bit for 2^XLEN */
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
+# define CRC32_POLY_QT_LE 0x5a72d812fb808b20
+
+/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
+# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8
+
+/*
+ * Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format; it should be
+ * the bit-reversed version of CRC32_POLY_QT_LE.
+ */
+# define CRC32_POLY_QT_BE 0x04d101df481b4e5a
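+
+/*
+ * For XLEN == 64, the quotient constants can be reproduced with a plain
+ * carry-less long division of 2^96 by the polynomial. A rough userspace-style
+ * sketch (not built here; the LE constants are the bit-reversed counterparts,
+ * and CRC32C uses its own polynomial):
+ *
+ *   u64 rem = 0, quot = 0;
+ *   for (int i = 96; i >= 0; i--) {
+ *           rem = (rem << 1) | (i == 96);  // numerator is x^96
+ *           quot <<= 1;
+ *           if (rem & (1ULL << 32)) {
+ *                   rem ^= 0x104c11db7ULL; // CRC32 polynomial incl. x^32
+ *                   quot |= 1;
+ *           }
+ *   }
+ *   // quot now holds CRC32_POLY_QT_BE; the leading quotient bit has been
+ *   // shifted out, matching the "implicit bit for 2^XLEN" convention above.
+ */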
+
+static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+ return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+ u32 crc;
+
+ /* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ "slli %0, %0, 1\n"
+ "xor %0, %0, %1\n"
+ "clmulr %0, %0, %3\n"
+ "srli %0, %0, 32\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (poly_qt),
+ "r" ((u64)poly << 32)
+ :);
+ return crc;
+}
+
+static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+ return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
+}
+
+#elif __riscv_xlen == 32
+# define STEP_ORDER 2
+/*
+ * Each quotient below still has an implicit bit for 2^XLEN. The BE value is
+ * the upper half of its RV64 analog, while the bit-reversed LE values are
+ * the lower halves.
+ */
+# define CRC32_POLY_QT_LE 0xfb808b20
+# define CRC32C_POLY_QT_LE 0x6f5389f8
+# define CRC32_POLY_QT_BE 0x04d101df
+
+static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
+{
+ return crc ^ (__force u32)__cpu_to_le32(*ptr);
+}
+
+static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
+{
+ u32 crc;
+
+ /* We don't have a "clmulrh" insn, so use clmul + slli instead. */
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ "slli %0, %0, 1\n"
+ "xor %0, %0, %1\n"
+ "clmulr %0, %0, %3\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (poly_qt),
+ "r" (poly)
+ :);
+ return crc;
+}
+
+static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
+{
+ return crc ^ (__force u32)__cpu_to_be32(*ptr);
+}
+
+#else
+# error "Unexpected __riscv_xlen"
+#endif
+
+static inline u32 crc32_be_zbc(unsigned long s)
+{
+ u32 crc;
+
+ asm volatile (".option push\n"
+ ".option arch,+zbc\n"
+ "clmulh %0, %1, %2\n"
+ "xor %0, %0, %1\n"
+ "clmul %0, %0, %3\n"
+ ".option pop\n"
+ : "=&r" (crc)
+ : "r" (s),
+ "r" (CRC32_POLY_QT_BE),
+ "r" (CRC32_POLY_BE)
+ :);
+ return crc;
+}
+
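+/* Bytes consumed per main-loop iteration (XLEN / 8) and its alignment mask */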
+#define STEP (1 << STEP_ORDER)
+#define OFFSET_MASK (STEP - 1)
+
+typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);
+
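+/*
+ * Fold fewer than STEP bytes into a single XLEN-bit chunk and run one Zbc
+ * reduction over it; used for the unaligned head and the tail of a buffer.
+ */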
+static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
+ size_t len, u32 poly,
+ unsigned long poly_qt)
+{
+ size_t bits = len * 8;
+ unsigned long s = 0;
+ u32 crc_low = 0;
+
+ for (int i = 0; i < len; i++)
+ s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);
+
+ s ^= (unsigned long)crc << (__riscv_xlen - bits);
+ if (__riscv_xlen == 32 || len < sizeof(u32))
+ crc_low = crc >> bits;
+
+ crc = crc32_le_zbc(s, poly, poly_qt);
+ crc ^= crc_low;
+
+ return crc;
+}
+
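+/*
+ * Common driver for the little-endian variants: when Zbc is available,
+ * process the unaligned head, then whole XLEN-bit strides, then the tail
+ * bytes; otherwise branch to the generic fallback passed in as crc_fb.
+ */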
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, u32 poly,
+ unsigned long poly_qt,
+ fallback crc_fb)
+{
+ size_t offset, head_len, tail_len;
+ unsigned long const *p_ul;
+ unsigned long s;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBC, 1)
+ : : : : legacy);
+
+ /* Handle the unaligned head. */
+ offset = (unsigned long)p & OFFSET_MASK;
+ if (offset && len) {
+ head_len = min(STEP - offset, len);
+ crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
+ p += head_len;
+ len -= head_len;
+ }
+
+ tail_len = len & OFFSET_MASK;
+ len = len >> STEP_ORDER;
+ p_ul = (unsigned long const *)p;
+
+ for (int i = 0; i < len; i++) {
+ s = crc32_le_prep(crc, p_ul);
+ crc = crc32_le_zbc(s, poly, poly_qt);
+ p_ul++;
+ }
+
+ /* Handle the tail bytes. */
+ p = (unsigned char const *)p_ul;
+ if (tail_len)
+ crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);
+
+ return crc;
+
+legacy:
+ return crc_fb(crc, p, len);
+}
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+ return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
+ crc32_le_base);
+}
+
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+ return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
+ CRC32C_POLY_QT_LE, __crc32c_le_base);
+}
+
+static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
+ size_t len)
+{
+ size_t bits = len * 8;
+ unsigned long s = 0;
+ u32 crc_low = 0;
+
+ for (int i = 0; i < len; i++)
+ s = *p++ | (s << 8);
+
+ if (__riscv_xlen == 32 || len < sizeof(u32)) {
+ s ^= crc >> (32 - bits);
+ crc_low = crc << bits;
+ } else {
+ s ^= (unsigned long)crc << (bits - 32);
+ }
+
+ crc = crc32_be_zbc(s);
+ crc ^= crc_low;
+
+ return crc;
+}
+
+u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+{
+ size_t offset, head_len, tail_len;
+ unsigned long const *p_ul;
+ unsigned long s;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBC, 1)
+ : : : : legacy);
+
+ /* Handle the unaligned head. */
+ offset = (unsigned long)p & OFFSET_MASK;
+ if (offset && len) {
+ head_len = min(STEP - offset, len);
+ crc = crc32_be_unaligned(crc, p, head_len);
+ p += head_len;
+ len -= head_len;
+ }
+
+ tail_len = len & OFFSET_MASK;
+ len = len >> STEP_ORDER;
+ p_ul = (unsigned long const *)p;
+
+ for (int i = 0; i < len; i++) {
+ s = crc32_be_prep(crc, p_ul);
+ crc = crc32_be_zbc(s);
+ p_ul++;
+ }
+
+ /* Handle the tail bytes. */
+ p = (unsigned char const *)p_ul;
+ if (tail_len)
+ crc = crc32_be_unaligned(crc, p, tail_len);
+
+ return crc;
+
+legacy:
+ return crc32_be_base(crc, p, len);
+}
@@ -9,7 +9,9 @@
#include <linux/bitrev.h>
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
+u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
+u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len);
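+/*
+ * The *_base() variants are the generic implementations; architecture code
+ * that provides accelerated crc32_le()/crc32_be()/__crc32c_le() routines can
+ * call them as a fallback.
+ */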
/**
* crc32_le_combine - Combine two crc32 check values into one. For two
@@ -37,6 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
}
u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len);
/**
* __crc32c_le_combine - Combine two crc32c check values into one. For two