@@ -2,4 +2,15 @@
menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
+# GHASH for riscv64. Only offered when the kernel is built 64-bit with
+# RISCV_ISA_ZBC enabled; the Makefile then links the Zbc assembly into the
+# module next to the C glue code.
+config CRYPTO_GHASH_RISCV64
+ tristate "Hash functions: GHASH"
+ depends on 64BIT && RISCV_ISA_ZBC
+ select CRYPTO_HASH
+ select CRYPTO_LIB_GF128MUL
+ help
+ GCM GHASH function (NIST SP800-38D)
+
+ Architecture: riscv64 using one of:
+ - ZBC extension
+
endmenu
@@ -2,3 +2,17 @@
#
# linux/arch/riscv/crypto/Makefile
#
+
+# Composite module: the C glue code plus, when Zbc support is configured,
+# the assembly generated from the perlasm script below.
+obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
+ghash-riscv64-y := ghash-riscv64-glue.o
+ifdef CONFIG_RISCV_ISA_ZBC
+ghash-riscv64-y += ghash-riscv64-zbc.o
+endif
+
+# Run the perl script to produce the .S file: "void" is passed as the first
+# (flavour) argument and the output path as the last argument.
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $(<) void $(@)
+
+$(obj)/ghash-riscv64-zbc.S: $(src)/ghash-riscv64-zbc.pl
+ $(call cmd,perlasm)
+
+# The generated assembly is a build product; list it for removal on clean.
+clean-files += ghash-riscv64-zbc.S
new file mode 100644
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * GHASH routines for RISC-V using the Zbc (carry-less multiply) extension,
+ * adapted from the GHASH routines supporting VMX instructions on the Power 8
+ *
+ * Copyright (C) 2015, 2019 International Business Machines Inc.
+ *
+ * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ *
+ * Extended by Daniel Axtens <dja@axtens.net> to replace the fallback
+ * mechanism. The new approach is based on arm64 code, which is:
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+
+/*
+ * Zbc (optional with zbkb improvements)
+ *
+ * Assembly entry points generated from ghash-riscv64-zbc.pl. Each one folds
+ * len bytes at inp (len must be a multiple of the block size) into the
+ * running hash value Xi, using the preprocessed key stored in Htable.
+ */
+void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len);
+void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len);
+
+/* Per-transform (per-key) state, filled in by the setkey handlers. */
+struct riscv64_ghash_ctx {
+ /* block routine selected at setkey time; invoked by update and final */
+ void (*ghash_func)(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len);
+
+ /* preprocessed key written by gcm_init_rv64i_*() and read by the asm */
+ u128 htable[16];
+ /* raw copy of the caller's key; NOTE(review): only written in this file */
+ be128 key;
+};
+
+/* Per-request state kept in the shash descriptor. */
+struct riscv64_ghash_desc_ctx {
+ /* running hash value (Xi); zeroed by riscv64_ghash_init() */
+ u64 shash[2];
+ /* partial input block carried over between update calls */
+ u8 buffer[GHASH_DIGEST_SIZE];
+ /* number of valid bytes currently held in buffer */
+ int bytes;
+};
+
+/*
+ * Start a new hash computation: clear the running hash value and mark the
+ * partial-block buffer as empty. Always returns 0.
+ */
+static int riscv64_ghash_init(struct shash_desc *desc)
+{
+	struct riscv64_ghash_desc_ctx *state = shash_desc_ctx(desc);
+
+	memset(state->shash, 0, GHASH_DIGEST_SIZE);
+	state->bytes = 0;
+
+	return 0;
+}
+
+#ifdef CONFIG_RISCV_ISA_ZBC
+
+/*
+ * RISCV64_ZBC_SETKEY(VARIANT, GHASH) - generate a setkey handler.
+ *
+ * Declares gcm_init_rv64i_<VARIANT>() and defines
+ * riscv64_zbc_ghash_setkey_<VARIANT>(), which validates the key length,
+ * stores the raw key, lets the assembly preprocess the key into ctx->htable
+ * and selects gcm_ghash_rv64i_<GHASH>() as the block routine.
+ *
+ * The generated handler returns 0 on success, or -EINVAL when keylen is not
+ * GHASH_BLOCK_SIZE.
+ */
+#define RISCV64_ZBC_SETKEY(VARIANT, GHASH) \
+void gcm_init_rv64i_ ## VARIANT(u128 Htable[16], const u64 Xi[2]); \
+static int riscv64_zbc_ghash_setkey_ ## VARIANT(struct crypto_shash *tfm, \
+						const u8 *key, \
+						unsigned int keylen) \
+{ \
+	struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm)); \
+	u64 k[2]; \
+ \
+	/* Validate first: a short key must not be read past its end. */ \
+	if (keylen != GHASH_BLOCK_SIZE) \
+		return -EINVAL; \
+ \
+	/* key is a byte pointer, so copy it rather than assume alignment. */ \
+	memcpy(k, key, sizeof(k)); \
+	k[0] = cpu_to_be64(k[0]); \
+	k[1] = cpu_to_be64(k[1]); \
+ \
+	memcpy(&ctx->key, key, GHASH_BLOCK_SIZE); \
+	gcm_init_rv64i_ ## VARIANT(ctx->htable, k); \
+ \
+	ctx->ghash_func = gcm_ghash_rv64i_ ## GHASH; \
+ \
+	return 0; \
+}
+
+/*
+ * Feed srclen bytes at src into the hash.
+ *
+ * Input is consumed in GHASH_DIGEST_SIZE blocks. A partial block left over
+ * from an earlier call is completed first; any trailing bytes that do not
+ * fill a block are kept in dctx->buffer for the next update or final call.
+ * All block processing goes through ctx->ghash_func so that the routine
+ * selected at setkey time is used on every path. Always returns 0.
+ */
+static int riscv64_zbc_ghash_update(struct shash_desc *desc,
+				    const u8 *src, unsigned int srclen)
+{
+	unsigned int len;
+	struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	if (dctx->bytes) {
+		/* Still not enough for a full block: just accumulate. */
+		if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
+			memcpy(dctx->buffer + dctx->bytes, src,
+			       srclen);
+			dctx->bytes += srclen;
+			return 0;
+		}
+
+		/* Top up the buffered block and hash it. */
+		memcpy(dctx->buffer + dctx->bytes, src,
+		       GHASH_DIGEST_SIZE - dctx->bytes);
+
+		ctx->ghash_func(dctx->shash, ctx->htable,
+				dctx->buffer, GHASH_DIGEST_SIZE);
+
+		src += GHASH_DIGEST_SIZE - dctx->bytes;
+		srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+		dctx->bytes = 0;
+	}
+
+	/* Largest prefix of the remaining input that is whole blocks. */
+	len = srclen & ~(GHASH_DIGEST_SIZE - 1);
+
+	if (len) {
+		ctx->ghash_func(dctx->shash, ctx->htable,
+				src, len);
+		src += len;
+		srclen -= len;
+	}
+
+	/* Stash the tail for later. */
+	if (srclen) {
+		memcpy(dctx->buffer, src, srclen);
+		dctx->bytes = srclen;
+	}
+	return 0;
+}
+
+/*
+ * Finish the hash: zero-pad and process any buffered partial block, then
+ * copy the GHASH_DIGEST_SIZE-byte running hash value to out. Always
+ * returns 0.
+ */
+static int riscv64_zbc_ghash_final(struct shash_desc *desc, u8 *out)
+{
+	struct riscv64_ghash_desc_ctx *state = shash_desc_ctx(desc);
+	struct riscv64_ghash_ctx *tctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
+	int pos;
+
+	if (state->bytes != 0) {
+		/* Pad the tail of the buffered block with zeroes. */
+		pos = state->bytes;
+		while (pos < GHASH_DIGEST_SIZE)
+			state->buffer[pos++] = 0;
+
+		tctx->ghash_func(state->shash, tctx->htable,
+				 state->buffer, GHASH_DIGEST_SIZE);
+		state->bytes = 0;
+	}
+
+	memcpy(out, state->shash, GHASH_DIGEST_SIZE);
+	return 0;
+}
+
+RISCV64_ZBC_SETKEY(zbc, zbc);
+
+/*
+ * "ghash" using the plain Zbc routines for both key setup and hashing.
+ * Kept file-local (static): in this file it is only referenced by
+ * riscv64_ghash_mod_init().
+ *
+ * NOTE(review): descsize also reserves sizeof(struct ghash_desc_ctx), yet
+ * the handlers in this file only access struct riscv64_ghash_desc_ctx -
+ * confirm whether the extra space is needed.
+ */
+static struct shash_alg riscv64_zbc_ghash_alg = {
+	.digestsize = GHASH_DIGEST_SIZE,
+	.init = riscv64_ghash_init,
+	.update = riscv64_zbc_ghash_update,
+	.final = riscv64_zbc_ghash_final,
+	.setkey = riscv64_zbc_ghash_setkey_zbc,
+	.descsize = sizeof(struct riscv64_ghash_desc_ctx)
+		    + sizeof(struct ghash_desc_ctx),
+	.base = {
+		.cra_name = "ghash",
+		.cra_driver_name = "riscv64_zbc_ghash",
+		.cra_priority = 250,
+		.cra_blocksize = GHASH_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
+		.cra_module = THIS_MODULE,
+	},
+};
+
+RISCV64_ZBC_SETKEY(zbc__zbb, zbc);
+
+/*
+ * "ghash" using the Zbb-assisted key setup (gcm_init_rv64i_zbc__zbb) together
+ * with the plain Zbc hashing routine. Kept file-local (static): in this file
+ * it is only referenced by riscv64_ghash_mod_init().
+ *
+ * NOTE(review): descsize also reserves sizeof(struct ghash_desc_ctx), yet
+ * the handlers in this file only access struct riscv64_ghash_desc_ctx -
+ * confirm whether the extra space is needed.
+ */
+static struct shash_alg riscv64_zbc_zbb_ghash_alg = {
+	.digestsize = GHASH_DIGEST_SIZE,
+	.init = riscv64_ghash_init,
+	.update = riscv64_zbc_ghash_update,
+	.final = riscv64_zbc_ghash_final,
+	.setkey = riscv64_zbc_ghash_setkey_zbc__zbb,
+	.descsize = sizeof(struct riscv64_ghash_desc_ctx)
+		    + sizeof(struct ghash_desc_ctx),
+	.base = {
+		.cra_name = "ghash",
+		.cra_driver_name = "riscv64_zbc_zbb_ghash",
+		.cra_priority = 251,
+		.cra_blocksize = GHASH_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
+		.cra_module = THIS_MODULE,
+	},
+};
+
+RISCV64_ZBC_SETKEY(zbc__zbkb, zbc__zbkb);
+
+/*
+ * "ghash" using the Zbkb-assisted routines for both key setup and hashing.
+ * Kept file-local (static): in this file it is only referenced by
+ * riscv64_ghash_mod_init().
+ *
+ * NOTE(review): descsize also reserves sizeof(struct ghash_desc_ctx), yet
+ * the handlers in this file only access struct riscv64_ghash_desc_ctx -
+ * confirm whether the extra space is needed.
+ */
+static struct shash_alg riscv64_zbc_zbkb_ghash_alg = {
+	.digestsize = GHASH_DIGEST_SIZE,
+	.init = riscv64_ghash_init,
+	.update = riscv64_zbc_ghash_update,
+	.final = riscv64_zbc_ghash_final,
+	.setkey = riscv64_zbc_ghash_setkey_zbc__zbkb,
+	.descsize = sizeof(struct riscv64_ghash_desc_ctx)
+		    + sizeof(struct ghash_desc_ctx),
+	.base = {
+		.cra_name = "ghash",
+		.cra_driver_name = "riscv64_zbc_zbkb_ghash",
+		.cra_priority = 252,
+		.cra_blocksize = GHASH_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
+		.cra_module = THIS_MODULE,
+	},
+};
+
+#endif /* CONFIG_RISCV_ISA_ZBC */
+
+/* Capacity of the registration table below. */
+#define RISCV64_DEFINED_GHASHES 7
+
+/* Algorithms registered so far, in registration order, and their count. */
+static struct shash_alg *riscv64_ghashes[RISCV64_DEFINED_GHASHES];
+static int num_riscv64_ghashes;
+
+/*
+ * Register one algorithm and remember it for later unregistration.
+ *
+ * On failure every algorithm registered through this helper so far is
+ * unregistered again (newest first), the table is emptied and the error
+ * from crypto_register_shash() is returned. Returns 0 on success.
+ */
+static int __init riscv64_ghash_register(struct shash_alg *ghash)
+{
+	int err = crypto_register_shash(ghash);
+
+	if (err >= 0) {
+		pr_debug("Registered RISC-V ghash %s\n", ghash->base.cra_driver_name);
+		riscv64_ghashes[num_riscv64_ghashes++] = ghash;
+		return 0;
+	}
+
+	/* Roll back: drop what was registered before, newest first. */
+	while (num_riscv64_ghashes > 0)
+		crypto_unregister_shash(riscv64_ghashes[--num_riscv64_ghashes]);
+
+	return err;
+}
+
+/*
+ * Module init: register every GHASH variant the running CPU supports.
+ *
+ * The plain Zbc variant is registered whenever Zbc is available; the Zbb and
+ * Zbkb variants are added on top when those extensions are reported too.
+ * Returns 0 on success (including when nothing was registered) or the error
+ * from the first failed registration.
+ */
+static int __init riscv64_ghash_mod_init(void)
+{
+#ifdef CONFIG_RISCV_ISA_ZBC
+	int err;
+
+	if (!riscv_isa_extension_available(NULL, ZBC))
+		return 0;
+
+	err = riscv64_ghash_register(&riscv64_zbc_ghash_alg);
+	if (err < 0)
+		return err;
+
+	if (riscv_isa_extension_available(NULL, ZBB)) {
+		err = riscv64_ghash_register(&riscv64_zbc_zbb_ghash_alg);
+		if (err < 0)
+			return err;
+	}
+
+	if (riscv_isa_extension_available(NULL, ZBKB)) {
+		err = riscv64_ghash_register(&riscv64_zbc_zbkb_ghash_alg);
+		if (err < 0)
+			return err;
+	}
+#endif
+
+	return 0;
+}
+
+/* Module exit: unregister all recorded algorithms, newest first. */
+static void __exit riscv64_ghash_mod_fini(void)
+{
+	int idx = num_riscv64_ghashes;
+
+	while (--idx >= 0)
+		crypto_unregister_shash(riscv64_ghashes[idx]);
+
+	num_riscv64_ghashes = 0;
+}
+
+module_init(riscv64_ghash_mod_init);
+module_exit(riscv64_ghash_mod_fini);
+
+/* Module metadata. GHASH is the hash used by GCM (Galois/Counter Mode). */
+MODULE_DESCRIPTION("GCM GHASH (accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
new file mode 100644
@@ -0,0 +1,400 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use strict;
+use warnings;
+
+use FindBin qw($Bin);
+use lib "$Bin";
+use lib "$Bin/../../perlasm";
+use riscv;
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Redirect the generated assembly into $output when one was given. Fail
+# loudly if the file cannot be opened instead of silently writing to the
+# inherited STDOUT and leaving the build without its output file.
+if ($output) {
+    open(STDOUT, '>', $output) or die "can't open $output: $!";
+}
+
+# Accumulates the generated assembly text; printed at the end of the script.
+my $code=<<___;
+.text
+___
+
+################################################################################
+# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
+# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
+# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
+#
+# input: H: 128-bit H - secret parameter E(K, 0^128)
+# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
+# gcm_ghash_rv64i_zbc*
+#
+# All callers of this function reverse the byte order unconditionally
+# on little-endian machines, so we need to reverse the byte order back.
+# Additionally, we reverse the bits within each byte.
+
+# gcm_init_rv64i_zbc: variant that needs no bit-manipulation instructions for
+# the key setup. Loads the two 64-bit halves of H, reverses the bits of each
+# byte with the brev8_rv64i helper sequence and stores both halves
+# byte-reversed into the first 16 bytes of Htable via sd_rev8_rv64i.
+# Registers: a0 = Htable, a1 = H, a2/a3 = the two halves, t0-t2 = scratch.
+{
+my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_init_rv64i_zbc
+.type gcm_init_rv64i_zbc,\@function
+gcm_init_rv64i_zbc:
+ ld $VAL0,0($H)
+ ld $VAL1,8($H)
+ @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
+ @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
+ @{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
+ @{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
+ ret
+.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
+___
+}
+
+# gcm_init_rv64i_zbc__zbb: same key setup as gcm_init_rv64i_zbc, but the byte
+# reversal uses the rev8 instruction followed by plain sd stores instead of
+# the byte-wise sd_rev8_rv64i sequence. The per-byte bit reversal still uses
+# the brev8_rv64i helper sequence.
+# Registers: a0 = Htable, a1 = H, a2/a3 = the two halves, t0-t2 = scratch.
+{
+my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_init_rv64i_zbc__zbb
+.type gcm_init_rv64i_zbc__zbb,\@function
+gcm_init_rv64i_zbc__zbb:
+ ld $VAL0,0($H)
+ ld $VAL1,8($H)
+ @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
+ @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
+ @{[rev8 $VAL0, $VAL0]}
+ @{[rev8 $VAL1, $VAL1]}
+ sd $VAL0,0($Htable)
+ sd $VAL1,8($Htable)
+ ret
+.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
+___
+}
+
+# gcm_init_rv64i_zbc__zbkb: key setup using the brev8 and rev8 instructions
+# directly, so no scratch registers or constant table are needed. Loads the
+# two halves of H, bit-reverses each byte, byte-reverses each half and stores
+# them into the first 16 bytes of Htable.
+# Registers: a0 = Htable, a1 = H, t0/t1 = the two halves.
+{
+my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_init_rv64i_zbc__zbkb
+.type gcm_init_rv64i_zbc__zbkb,\@function
+gcm_init_rv64i_zbc__zbkb:
+ ld $TMP0,0($H)
+ ld $TMP1,8($H)
+ @{[brev8 $TMP0, $TMP0]}
+ @{[brev8 $TMP1, $TMP1]}
+ @{[rev8 $TMP0, $TMP0]}
+ @{[rev8 $TMP1, $TMP1]}
+ sd $TMP0,0($Htable)
+ sd $TMP1,8($Htable)
+ ret
+.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
+___
+}
+
+################################################################################
+# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
+# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
+#
+# input: Xi: current hash value
+# Htable: copy of H
+# output: Xi: next hash value Xi
+#
+# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
+# extensions. Using the no-Karatsuba approach and clmul for the final reduction.
+# This results in an implementation with minimized number of instructions.
+# HW with clmul latencies higher than 2 cycles might observe a performance
+# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
+# might observe a performance improvement with additionally converting the
+# reduction to shift&xor. For a full discussion of these estimates see
+# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
+# gcm_gmult_rv64i_zbc: one multiplication of Xi by the key, in place.
+# Xi is bit-reversed per byte with the brev8_rv64i helper sequence on the way
+# in and out. The product is accumulated in z3:z2:z1:z0 from clmul/clmulh
+# pairs and then folded twice with the reduction constant from Lpolymod.
+# Registers: a0 = Xi, a1 = Htable, a4/a5 = x0/x1 (Xi), a6/a7 = y0/y1 (key),
+# t0-t3 = z0-z3, t4/t5 = temporaries, t6 = reduction constant.
+# Note that z0-z2 double as scratch for brev8_rv64i outside the multiply.
+{
+my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
+my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_gmult_rv64i_zbc
+.type gcm_gmult_rv64i_zbc,\@function
+gcm_gmult_rv64i_zbc:
+ # Load Xi and bit-reverse it
+ ld $x0, 0($Xi)
+ ld $x1, 8($Xi)
+ @{[brev8_rv64i $x0, $z0, $z1, $z2]}
+ @{[brev8_rv64i $x1, $z0, $z1, $z2]}
+
+ # Load the key (already bit-reversed)
+ ld $y0, 0($Htable)
+ ld $y1, 8($Htable)
+
+ # Load the reduction constant
+ la $polymod, Lpolymod
+ lbu $polymod, 0($polymod)
+
+ # Multiplication (without Karatsuba)
+ @{[clmulh $z3, $x1, $y1]}
+ @{[clmul $z2, $x1, $y1]}
+ @{[clmulh $t1, $x0, $y1]}
+ @{[clmul $z1, $x0, $y1]}
+ xor $z2, $z2, $t1
+ @{[clmulh $t1, $x1, $y0]}
+ @{[clmul $t0, $x1, $y0]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $x0, $y0]}
+ @{[clmul $z0, $x0, $y0]}
+ xor $z1, $z1, $t1
+
+ # Reduction with clmul
+ @{[clmulh $t1, $z3, $polymod]}
+ @{[clmul $t0, $z3, $polymod]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $z2, $polymod]}
+ @{[clmul $t0, $z2, $polymod]}
+ xor $x1, $z1, $t1
+ xor $x0, $z0, $t0
+
+ # Bit-reverse Xi back and store it
+ @{[brev8_rv64i $x0, $z0, $z1, $z2]}
+ @{[brev8_rv64i $x1, $z0, $z1, $z2]}
+ sd $x0, 0($Xi)
+ sd $x1, 8($Xi)
+ ret
+.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
+___
+}
+
+# gcm_gmult_rv64i_zbc__zbkb: same multiplication and reduction as
+# gcm_gmult_rv64i_zbc, but the per-byte bit reversal of Xi uses the brev8
+# instruction in place instead of the brev8_rv64i helper sequence.
+# Registers: a0 = Xi, a1 = Htable, a4/a5 = x0/x1 (Xi), a6/a7 = y0/y1 (key),
+# t0-t3 = z0-z3, t4/t5 = temporaries, t6 = reduction constant.
+{
+my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
+my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_gmult_rv64i_zbc__zbkb
+.type gcm_gmult_rv64i_zbc__zbkb,\@function
+gcm_gmult_rv64i_zbc__zbkb:
+ # Load Xi and bit-reverse it
+ ld $x0, 0($Xi)
+ ld $x1, 8($Xi)
+ @{[brev8 $x0, $x0]}
+ @{[brev8 $x1, $x1]}
+
+ # Load the key (already bit-reversed)
+ ld $y0, 0($Htable)
+ ld $y1, 8($Htable)
+
+ # Load the reduction constant
+ la $polymod, Lpolymod
+ lbu $polymod, 0($polymod)
+
+ # Multiplication (without Karatsuba)
+ @{[clmulh $z3, $x1, $y1]}
+ @{[clmul $z2, $x1, $y1]}
+ @{[clmulh $t1, $x0, $y1]}
+ @{[clmul $z1, $x0, $y1]}
+ xor $z2, $z2, $t1
+ @{[clmulh $t1, $x1, $y0]}
+ @{[clmul $t0, $x1, $y0]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $x0, $y0]}
+ @{[clmul $z0, $x0, $y0]}
+ xor $z1, $z1, $t1
+
+ # Reduction with clmul
+ @{[clmulh $t1, $z3, $polymod]}
+ @{[clmul $t0, $z3, $polymod]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $z2, $polymod]}
+ @{[clmul $t0, $z2, $polymod]}
+ xor $x1, $z1, $t1
+ xor $x0, $z0, $t0
+
+ # Bit-reverse Xi back and store it
+ @{[brev8 $x0, $x0]}
+ @{[brev8 $x1, $x1]}
+ sd $x0, 0($Xi)
+ sd $x1, 8($Xi)
+ ret
+.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
+___
+}
+
+################################################################################
+# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
+# const u8 *inp, size_t len);
+# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
+# const u8 *inp, size_t len);
+#
+# input: Xi: current hash value
+# Htable: copy of H
+# inp: pointer to input data
+# len: length of input data in bytes (multiple of block size)
+# output: Xi: Xi+1 (next hash value Xi)
+# gcm_ghash_rv64i_zbc: hash len bytes of input into Xi, 16 bytes per loop
+# iteration. Each iteration XORs one bit-reversed input block into Xi, then
+# multiplies by the key and reduces exactly as in gcm_gmult_rv64i_zbc.
+# The length is decremented before the multiply and tested at the bottom of
+# the loop, so at least one block is always processed.
+# Registers: a0 = Xi, a1 = Htable, a2 = inp, a3 = len, a4/a5 = x0/x1 (Xi),
+# a6/a7 = y0/y1 (key), t0-t3 = z0-z3, t4/t5 = temporaries (also the loaded
+# input block), t6 = reduction constant.
+{
+my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
+my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_ghash_rv64i_zbc
+.type gcm_ghash_rv64i_zbc,\@function
+gcm_ghash_rv64i_zbc:
+ # Load Xi and bit-reverse it
+ ld $x0, 0($Xi)
+ ld $x1, 8($Xi)
+ @{[brev8_rv64i $x0, $z0, $z1, $z2]}
+ @{[brev8_rv64i $x1, $z0, $z1, $z2]}
+
+ # Load the key (already bit-reversed)
+ ld $y0, 0($Htable)
+ ld $y1, 8($Htable)
+
+ # Load the reduction constant
+ la $polymod, Lpolymod
+ lbu $polymod, 0($polymod)
+
+Lstep:
+ # Load the input data, bit-reverse them, and XOR them with Xi
+ ld $t0, 0($inp)
+ ld $t1, 8($inp)
+ add $inp, $inp, 16
+ add $len, $len, -16
+ @{[brev8_rv64i $t0, $z0, $z1, $z2]}
+ @{[brev8_rv64i $t1, $z0, $z1, $z2]}
+ xor $x0, $x0, $t0
+ xor $x1, $x1, $t1
+
+ # Multiplication (without Karatsuba)
+ @{[clmulh $z3, $x1, $y1]}
+ @{[clmul $z2, $x1, $y1]}
+ @{[clmulh $t1, $x0, $y1]}
+ @{[clmul $z1, $x0, $y1]}
+ xor $z2, $z2, $t1
+ @{[clmulh $t1, $x1, $y0]}
+ @{[clmul $t0, $x1, $y0]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $x0, $y0]}
+ @{[clmul $z0, $x0, $y0]}
+ xor $z1, $z1, $t1
+
+ # Reduction with clmul
+ @{[clmulh $t1, $z3, $polymod]}
+ @{[clmul $t0, $z3, $polymod]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $z2, $polymod]}
+ @{[clmul $t0, $z2, $polymod]}
+ xor $x1, $z1, $t1
+ xor $x0, $z0, $t0
+
+ # Iterate over all blocks
+ bnez $len, Lstep
+
+ # Bit-reverse final Xi back and store it
+ @{[brev8_rv64i $x0, $z0, $z1, $z2]}
+ @{[brev8_rv64i $x1, $z0, $z1, $z2]}
+ sd $x0, 0($Xi)
+ sd $x1, 8($Xi)
+ ret
+.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
+___
+}
+
+# gcm_ghash_rv64i_zbc__zbkb: same block loop as gcm_ghash_rv64i_zbc, but the
+# per-byte bit reversal of Xi and of each input block uses the brev8
+# instruction in place instead of the brev8_rv64i helper sequence.
+# Registers: a0 = Xi, a1 = Htable, a2 = inp, a3 = len, a4/a5 = x0/x1 (Xi),
+# a6/a7 = y0/y1 (key), t0-t3 = z0-z3, t4/t5 = temporaries (also the loaded
+# input block), t6 = reduction constant.
+{
+my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
+my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
+
+$code .= <<___;
+.p2align 3
+.globl gcm_ghash_rv64i_zbc__zbkb
+.type gcm_ghash_rv64i_zbc__zbkb,\@function
+gcm_ghash_rv64i_zbc__zbkb:
+ # Load Xi and bit-reverse it
+ ld $x0, 0($Xi)
+ ld $x1, 8($Xi)
+ @{[brev8 $x0, $x0]}
+ @{[brev8 $x1, $x1]}
+
+ # Load the key (already bit-reversed)
+ ld $y0, 0($Htable)
+ ld $y1, 8($Htable)
+
+ # Load the reduction constant
+ la $polymod, Lpolymod
+ lbu $polymod, 0($polymod)
+
+Lstep_zkbk:
+ # Load the input data, bit-reverse them, and XOR them with Xi
+ ld $t0, 0($inp)
+ ld $t1, 8($inp)
+ add $inp, $inp, 16
+ add $len, $len, -16
+ @{[brev8 $t0, $t0]}
+ @{[brev8 $t1, $t1]}
+ xor $x0, $x0, $t0
+ xor $x1, $x1, $t1
+
+ # Multiplication (without Karatsuba)
+ @{[clmulh $z3, $x1, $y1]}
+ @{[clmul $z2, $x1, $y1]}
+ @{[clmulh $t1, $x0, $y1]}
+ @{[clmul $z1, $x0, $y1]}
+ xor $z2, $z2, $t1
+ @{[clmulh $t1, $x1, $y0]}
+ @{[clmul $t0, $x1, $y0]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $x0, $y0]}
+ @{[clmul $z0, $x0, $y0]}
+ xor $z1, $z1, $t1
+
+ # Reduction with clmul
+ @{[clmulh $t1, $z3, $polymod]}
+ @{[clmul $t0, $z3, $polymod]}
+ xor $z2, $z2, $t1
+ xor $z1, $z1, $t0
+ @{[clmulh $t1, $z2, $polymod]}
+ @{[clmul $t0, $z2, $polymod]}
+ xor $x1, $z1, $t1
+ xor $x0, $z0, $t0
+
+ # Iterate over all blocks
+ bnez $len, Lstep_zkbk
+
+ # Bit-reverse final Xi back and store it
+ @{[brev8 $x0, $x0]}
+ @{[brev8 $x1, $x1]}
+ sd $x0, 0($Xi)
+ sd $x1, 8($Xi)
+ ret
+.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
+___
+}
+
+# Constant data referenced by the routines above, emitted after the code:
+#   Lbrev8_const - the three masks loaded by the brev8_rv64i helper sequence
+#   Lpolymod     - the one-byte reduction constant loaded with lbu
+$code .= <<___;
+.p2align 3
+Lbrev8_const:
+ .dword 0xAAAAAAAAAAAAAAAA
+ .dword 0xCCCCCCCCCCCCCCCC
+ .dword 0xF0F0F0F0F0F0F0F0
+.size Lbrev8_const,.-Lbrev8_const
+
+Lpolymod:
+ .byte 0x87
+.size Lpolymod,.-Lpolymod
+___
+
+# Write out the accumulated assembly (STDOUT may have been redirected to the
+# output file above) and make sure the final flush succeeded.
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";
new file mode 100644
@@ -0,0 +1,230 @@
+#! /usr/bin/env perl
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use strict;
+use warnings;
+
+# $have_stacktrace is 1 when Devel::StackTrace could be loaded and 0
+# otherwise; read_reg consults it before building a trace for its errors.
+my $have_stacktrace = eval { require Devel::StackTrace; 1 } ? 1 : 0;
+
+# Architectural register names x0..x31, in index order.
+my @regs = map("x$_",(0..31));
+# ABI aliases, ordered so that the alias at position N names register xN.
+my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
+ map("a$_",(0..7)),
+ map("s$_",(2..11)),
+ map("t$_",(3..6))
+);
+
+# Maps either spelling (xN or ABI alias) to the architectural xN name.
+my %reglookup;
+@reglookup{@regs} = @regs;
+@reglookup{@regaliases} = @regs;
+
+# Translate a register name (xN or an ABI alias, matched case-insensitively)
+# into its numeric index 0..31. Dies on a name that is not in %reglookup or
+# that does not resolve to an xN name; the message carries a stack trace
+# when Devel::StackTrace is available.
+sub read_reg {
+    my $reg = lc shift;
+
+    exists $reglookup{$reg}
+        or die("Unknown register ".$reg."\n".
+               ($have_stacktrace ? Devel::StackTrace->new->as_string : ""));
+
+    # Pull the numeric part out of the architectural name.
+    my ($index) = $reglookup{$reg} =~ /^x([0-9]+)$/;
+    defined $index
+        or die("Could not process register ".$reg."\n".
+               ($have_stacktrace ? Devel::StackTrace->new->as_string : ""));
+
+    return $index;
+}
+
+# Helper functions
+
+sub brev8_rv64i {
+    # brev8 without the `brev8` instruction (which is only in Zbkb).
+    # Returns an assembly sequence that reverses the bits within each byte of
+    # the first argument, in place. The other three arguments are scratch
+    # registers; the last one holds the address of Lbrev8_const.
+    my ($val, $t0, $t1, $brev8_const) = @_;
+    return <<___;
+ la $brev8_const, Lbrev8_const
+
+ ld $t0, 0($brev8_const) # 0xAAAAAAAAAAAAAAAA
+ slli $t1, $val, 1
+ and $t1, $t1, $t0
+ and $val, $val, $t0
+ srli $val, $val, 1
+ or $val, $t1, $val
+
+ ld $t0, 8($brev8_const) # 0xCCCCCCCCCCCCCCCC
+ slli $t1, $val, 2
+ and $t1, $t1, $t0
+ and $val, $val, $t0
+ srli $val, $val, 2
+ or $val, $t1, $val
+
+ ld $t0, 16($brev8_const) # 0xF0F0F0F0F0F0F0F0
+ slli $t1, $val, 4
+ and $t1, $t1, $t0
+ and $val, $val, $t0
+ srli $val, $val, 4
+ or $val, $t1, $val
+___
+}
+
+sub sd_rev8_rv64i {
+    # rev8 without the `rev8` instruction (which is only in Zbb or Zbkb).
+    # Returns an assembly sequence that stores the 64-bit value byte-reversed
+    # at $off($addr) using eight byte stores and one scratch register.
+    my ($val, $addr, $off, $tmp) = @_;
+    my ($off0, $off1, $off2, $off3, $off4, $off5, $off6, $off7) =
+        map { $off + $_ } 0 .. 7;
+    return <<___;
+ sb $val, $off7($addr)
+ srli $tmp, $val, 8
+ sb $tmp, $off6($addr)
+ srli $tmp, $val, 16
+ sb $tmp, $off5($addr)
+ srli $tmp, $val, 24
+ sb $tmp, $off4($addr)
+ srli $tmp, $val, 32
+ sb $tmp, $off3($addr)
+ srli $tmp, $val, 40
+ sb $tmp, $off2($addr)
+ srli $tmp, $val, 48
+ sb $tmp, $off1($addr)
+ srli $tmp, $val, 56
+ sb $tmp, $off0($addr)
+___
+}
+
+# Scalar crypto instructions
+
+sub aes64ds {
+    # aes64ds rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0011101_00000_00000_000_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64dsm {
+    # aes64dsm rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0011111_00000_00000_000_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64es {
+    # aes64es rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0011001_00000_00000_000_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64esm {
+    # aes64esm rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0011011_00000_00000_000_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64im {
+    # aes64im rd, rs1 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXXXXXXX_ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1) = map { read_reg($_) } @_[0, 1];
+    my $template = 0b001100000000_00000_001_00000_0010011;
+    my $insn = $template | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64ks1i {
+    # aes64ks1i rd, rs1, rnum (RV64), emitted as a raw .word directive.
+    # rnum is used as given and placed at bit 20.
+    # Bit layout: XXXXXXXX_rnum_ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1) = map { read_reg($_) } @_[0, 1];
+    my $rnum = $_[2];
+    my $template = 0b00110001_0000_00000_001_00000_0010011;
+    my $insn = $template | ($rnum << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub aes64ks2 {
+    # aes64ks2 rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0111111_00000_00000_000_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub brev8 {
+    # brev8 rd, rs, emitted as a raw .word directive.
+    my ($rd, $rs) = map { read_reg($_) } @_[0, 1];
+    my $template = 0b011010000111_00000_101_00000_0010011;
+    my $insn = $template | ($rs << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub clmul {
+    # clmul rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0000101_00000_00000_001_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub clmulh {
+    # clmulh rd, rs1, rs2 (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
+    my ($rd, $rs1, $rs2) = map { read_reg($_) } @_[0 .. 2];
+    my $template = 0b0000101_00000_00000_011_00000_0110011;
+    my $insn = $template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+sub rev8 {
+    # rev8 rd, rs (RV64), emitted as a raw .word directive.
+    # Bit layout: XXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
+    my ($rd, $rs) = map { read_reg($_) } @_[0, 1];
+    my $template = 0b011010111000_00000_101_00000_0010011;
+    my $insn = $template | ($rs << 15) | ($rd << 7);
+    return ".word $insn";
+}
+
+1;