diff mbox

[V7,4/7] crypto: AES CBC by8 encryption

Message ID 1501035000-6283-5-git-send-email-megha.dey@linux.intel.com (mailing list archive)
State Changes Requested
Delegated to: Herbert Xu
Headers show

Commit Message

Dey, Megha July 26, 2017, 2:09 a.m. UTC
This patch introduces the assembly routine to do a by8 AES CBC
encryption in support of the AES CBC multi-buffer implementation.

It encrypts 8 data streams of the same key size simultaneously.

Originally-by: Chandramouli Narayanan <mouli_7982@yahoo.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Acked-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S | 775 ++++++++++++++++++++++++++++
 1 file changed, 775 insertions(+)
 create mode 100644 arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
diff mbox

Patch

diff --git a/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S b/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
new file mode 100644
index 0000000..2130574
--- /dev/null
+++ b/arch/x86/crypto/aes-cbc-mb/aes_cbc_enc_x8.S
@@ -0,0 +1,775 @@ 
+/*
+ *	AES CBC by8 multibuffer optimization (x86_64)
+ *	This file implements 128/192/256 bit AES CBC encryption
+ *
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/linkage.h>
+
+/* stack size needs to be an odd multiple of 8 for alignment */
+
+#define AES_KEYSIZE_128        16
+#define AES_KEYSIZE_192        24
+#define AES_KEYSIZE_256        32
+
+#define XMM_SAVE_SIZE	16*10
+#define GPR_SAVE_SIZE	8*9
+#define STACK_SIZE	(XMM_SAVE_SIZE + GPR_SAVE_SIZE)
+
+#define GPR_SAVE_REG	%rsp
+#define GPR_SAVE_AREA	%rsp + XMM_SAVE_SIZE
+#define LEN_AREA_OFFSET	XMM_SAVE_SIZE + 8*8
+#define LEN_AREA_REG	%rsp
+#define LEN_AREA	%rsp + XMM_SAVE_SIZE + 8*8
+
+#define IN_OFFSET	0
+#define OUT_OFFSET	8*8
+#define KEYS_OFFSET	16*8
+#define IV_OFFSET	24*8
+
+
+#define IDX	%rax
+#define TMP	%rbx
+#define ARG	%rdi
+#define LEN	%rsi
+
+#define KEYS0	%r14
+#define KEYS1	%r15
+#define KEYS2	%rbp
+#define KEYS3	%rdx
+#define KEYS4	%rcx
+#define KEYS5	%r8
+#define KEYS6	%r9
+#define KEYS7	%r10
+
+#define IN0	%r11
+#define IN2	%r12
+#define IN4	%r13
+#define IN6	LEN
+
+#define XDATA0	%xmm0
+#define XDATA1	%xmm1
+#define XDATA2	%xmm2
+#define XDATA3	%xmm3
+#define XDATA4	%xmm4
+#define XDATA5	%xmm5
+#define XDATA6	%xmm6
+#define XDATA7	%xmm7
+
+#define XKEY0_3	%xmm8
+#define XKEY1_4	%xmm9
+#define XKEY2_5	%xmm10
+#define XKEY3_6	%xmm11
+#define XKEY4_7	%xmm12
+#define XKEY5_8	%xmm13
+#define XKEY6_9	%xmm14
+#define XTMP	%xmm15
+
+#define	MOVDQ movdqu /* assume buffers not aligned */
+#define CONCAT(a, b)	a##b
+#define INPUT_REG_SUFX	1	/* IN */
+#define XDATA_REG_SUFX	2	/* XDAT */
+#define KEY_REG_SUFX	3	/* KEY */
+#define XMM_REG_SUFX	4	/* XMM */
+
+/*
+ * To avoid positional parameter errors while compiling
+ * three registers need to be passed
+ */
+.text
+
+.macro pxor2 x, y, z
+	MOVDQ	(\x,\y), XTMP
+	pxor	XTMP, \z
+.endm
+
+.macro inreg n
+	.if (\n == 0)
+		reg_IN = IN0
+	.elseif (\n == 2)
+		reg_IN = IN2
+	.elseif (\n == 4)
+		reg_IN = IN4
+	.elseif (\n == 6)
+		reg_IN = IN6
+	.else
+		error "inreg: incorrect register number"
+	.endif
+.endm
+.macro xdatareg n
+	.if (\n == 0)
+		reg_XDAT = XDATA0
+	.elseif (\n == 1)
+		reg_XDAT = XDATA1
+	.elseif (\n == 2)
+		reg_XDAT = XDATA2
+	.elseif (\n == 3)
+		reg_XDAT = XDATA3
+	.elseif (\n == 4)
+		reg_XDAT = XDATA4
+	.elseif (\n == 5)
+		reg_XDAT = XDATA5
+	.elseif (\n == 6)
+		reg_XDAT = XDATA6
+	.elseif (\n == 7)
+		reg_XDAT = XDATA7
+	.endif
+.endm
+.macro xkeyreg n
+	.if (\n == 0)
+		reg_KEY = KEYS0
+	.elseif (\n == 1)
+		reg_KEY = KEYS1
+	.elseif (\n == 2)
+		reg_KEY = KEYS2
+	.elseif (\n == 3)
+		reg_KEY = KEYS3
+	.elseif (\n == 4)
+		reg_KEY = KEYS4
+	.elseif (\n == 5)
+		reg_KEY = KEYS5
+	.elseif (\n == 6)
+		reg_KEY = KEYS6
+	.elseif (\n == 7)
+		reg_KEY = KEYS7
+	.endif
+.endm
+.macro xmmreg n
+	.if (\n >= 0) && (\n < 16)
+		/* Valid register number */
+		reg_XMM = %xmm\n
+	.else
+		error "xmmreg: incorrect register number"
+	.endif
+.endm
+
+/*
+ * suffix - register suffix
+ * set up the register name using the loop index I
+ */
+.macro define_reg suffix
+.altmacro
+	.if (\suffix == INPUT_REG_SUFX)
+		inreg  %I
+	.elseif (\suffix == XDATA_REG_SUFX)
+		xdatareg  %I
+	.elseif (\suffix == KEY_REG_SUFX)
+		xkeyreg  %I
+	.elseif (\suffix == XMM_REG_SUFX)
+		xmmreg  %I
+	.else
+		error "define_reg: unknown register suffix"
+	.endif
+.noaltmacro
+.endm
+
+/*
+ * aes_cbc_enc_x8 key_len
+ * macro to encode data for 128bit, 192bit and 256bit keys
+ */
+
+.macro aes_cbc_enc_x8 key_len
+
+	sub	$STACK_SIZE, %rsp
+
+	mov	%rbx, (XMM_SAVE_SIZE + 8*0)(GPR_SAVE_REG)
+	mov	%rbp, (XMM_SAVE_SIZE + 8*3)(GPR_SAVE_REG)
+	mov	%r12, (XMM_SAVE_SIZE + 8*4)(GPR_SAVE_REG)
+	mov	%r13, (XMM_SAVE_SIZE + 8*5)(GPR_SAVE_REG)
+	mov	%r14, (XMM_SAVE_SIZE + 8*6)(GPR_SAVE_REG)
+	mov	%r15, (XMM_SAVE_SIZE + 8*7)(GPR_SAVE_REG)
+
+	mov	$16, IDX
+	shl	$4, LEN	/* LEN = LEN * 16 */
+	/* LEN is now in terms of bytes */
+	mov	LEN, (LEN_AREA_OFFSET)(LEN_AREA_REG)
+
+	/* Run through storing arguments in IN0,2,4,6 */
+	I = 0
+	.rept 4
+		define_reg	INPUT_REG_SUFX
+		mov	(IN_OFFSET + 8*I)(ARG), reg_IN
+		I = (I + 2)
+	.endr
+
+	/* load 1 .. 8 blocks of plain text into XDATA0..XDATA7 */
+	I = 0
+	.rept 4
+		mov		(IN_OFFSET + 8*(I+1))(ARG), TMP
+		define_reg	INPUT_REG_SUFX
+		define_reg	XDATA_REG_SUFX
+		/* load first block of plain text */
+		MOVDQ		(reg_IN), reg_XDAT
+		I = (I + 1)
+		define_reg	XDATA_REG_SUFX
+		/* load next block of plain text */
+		MOVDQ		(TMP), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* Run through XDATA0 .. XDATA7 to perform plaintext XOR IV */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		pxor	(IV_OFFSET + 16*I)(ARG), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	I = 0
+	.rept 8
+		define_reg	KEY_REG_SUFX
+		mov	(KEYS_OFFSET + 8*I)(ARG), reg_KEY
+		I = (I + 1)
+	.endr
+
+	I = 0
+	/* 0..7 ARK */
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		pxor	16*0(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	I = 0
+		/* 1. ENC */
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*1)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	movdqa		16*3(KEYS0), XKEY0_3	/* load round 3 key */
+
+	I = 0
+		/* 2. ENC */
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*2)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	movdqa		16*4(KEYS1), XKEY1_4	/* load round 4 key */
+
+		/* 3. ENC */
+	aesenc		XKEY0_3, XDATA0
+	I = 1
+	.rept 7
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*3)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/*
+	 * FIXME:
+	 * why can't we reorder encrypt DATA0..DATA7 and load 5th round?
+	 */
+	aesenc		(16*4)(KEYS0), XDATA0		/* 4. ENC */
+	movdqa		16*5(KEYS2), XKEY2_5		/* load round 5 key */
+	aesenc		XKEY1_4, XDATA1			/* 4. ENC */
+
+	I = 2
+	.rept 6
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*4)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	aesenc		(16*5)(KEYS0), XDATA0		/* 5. ENC */
+	aesenc		(16*5)(KEYS1), XDATA1		/* 5. ENC */
+	movdqa		16*6(KEYS3), XKEY3_6		/* load round 6 key */
+	aesenc		XKEY2_5, XDATA2			/* 5. ENC */
+	I = 3
+	.rept 5
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*5)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	aesenc		(16*6)(KEYS0), XDATA0		/* 6. ENC */
+	aesenc		(16*6)(KEYS1), XDATA1		/* 6. ENC */
+	aesenc		(16*6)(KEYS2), XDATA2		/* 6. ENC */
+	movdqa		16*7(KEYS4), XKEY4_7		/* load round 7 key */
+	aesenc		XKEY3_6, XDATA3			/* 6. ENC */
+
+	I = 4
+	.rept 4
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*6)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	I = 0
+	.rept 4
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*7)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	movdqa		16*8(KEYS5), XKEY5_8		/* load round 8 key */
+	aesenc		XKEY4_7, XDATA4			/* 7. ENC */
+	I = 5
+	.rept 3
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*7)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	I = 0
+	.rept 5
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*8)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	movdqa		16*9(KEYS6), XKEY6_9		/* load round 9 key */
+	aesenc		XKEY5_8, XDATA5			/* 8. ENC */
+	aesenc		16*8(KEYS6), XDATA6		/* 8. ENC */
+	aesenc		16*8(KEYS7), XDATA7		/* 8. ENC */
+
+	I = 0
+	.rept 6
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	(16*9)(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	mov		(OUT_OFFSET + 8*0)(ARG), TMP
+	aesenc		XKEY6_9, XDATA6			/* 9. ENC */
+	aesenc		16*9(KEYS7), XDATA7		/* 9. ENC */
+
+		/* 10. ENC (last for 128bit keys) */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		.if (\key_len == AES_KEYSIZE_128)
+			aesenclast	(16*10)(reg_KEY), reg_XDAT
+		.else
+			aesenc	(16*10)(reg_KEY), reg_XDAT
+		.endif
+		I = (I + 1)
+	.endr
+
+	.if (\key_len != AES_KEYSIZE_128)
+		/* 11. ENC */
+		I = 0
+		.rept 8
+			define_reg	XDATA_REG_SUFX
+			define_reg	KEY_REG_SUFX
+			aesenc	(16*11)(reg_KEY), reg_XDAT
+			I = (I + 1)
+		.endr
+
+		/* 12. ENC (last for 192bit key) */
+		I = 0
+		.rept 8
+			define_reg	XDATA_REG_SUFX
+			define_reg	KEY_REG_SUFX
+			.if (\key_len == AES_KEYSIZE_192)
+				aesenclast	(16*12)(reg_KEY), reg_XDAT
+			.else
+				aesenc		(16*12)(reg_KEY), reg_XDAT
+			.endif
+			I = (I + 1)
+		.endr
+
+		/* for 256bit, two more rounds */
+		.if \key_len == AES_KEYSIZE_256
+
+			/* 13. ENC */
+			I = 0
+			.rept 8
+				define_reg	XDATA_REG_SUFX
+				define_reg	KEY_REG_SUFX
+				aesenc	(16*13)(reg_KEY), reg_XDAT
+				I = (I + 1)
+			.endr
+
+			/* 14. ENC last encode for 256bit key */
+			I = 0
+			.rept 8
+				define_reg	XDATA_REG_SUFX
+				define_reg	KEY_REG_SUFX
+				aesenclast	(16*14)(reg_KEY), reg_XDAT
+				I = (I + 1)
+			.endr
+		.endif
+
+	.endif
+
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		MOVDQ	reg_XDAT, (TMP)	/* write back ciphertext */
+		I = (I + 1)
+		.if (I < 8)
+			mov	(OUT_OFFSET + 8*I)(ARG), TMP
+		.endif
+	.endr
+
+	cmp		IDX, LEN_AREA_OFFSET(LEN_AREA_REG)
+	je		.Ldone\key_len
+
+.Lmain_loop\key_len:
+	mov		(IN_OFFSET + 8*1)(ARG), TMP
+	pxor2		IN0, IDX, XDATA0	/* next block of plain text */
+	pxor2		TMP, IDX, XDATA1	/* next block of plain text */
+
+	mov		(IN_OFFSET + 8*3)(ARG), TMP
+	pxor2		IN2, IDX, XDATA2	/* next block of plain text */
+	pxor2		TMP, IDX, XDATA3	/* next block of plain text */
+
+	mov		(IN_OFFSET + 8*5)(ARG), TMP
+	pxor2		IN4, IDX, XDATA4	/* next block of plain text */
+	pxor2		TMP, IDX, XDATA5	/* next block of plain text */
+
+	mov		(IN_OFFSET + 8*7)(ARG), TMP
+	pxor2		IN6, IDX, XDATA6	/* next block of plain text */
+	pxor2		TMP, IDX, XDATA7	/* next block of plain text */
+
+	/* 0. ARK */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		pxor	16*0(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 1. ENC */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*1(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 2. ENC */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*2(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 3. ENC */
+	aesenc		XKEY0_3, XDATA0		/* 3. ENC */
+	I = 1
+	.rept 7
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*3(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 4. ENC */
+	aesenc		16*4(KEYS0), XDATA0	/* 4. ENC */
+	aesenc		XKEY1_4, XDATA1		/* 4. ENC */
+	I = 2
+	.rept 6
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*4(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 5. ENC */
+	aesenc		16*5(KEYS0), XDATA0	/* 5. ENC */
+	aesenc		16*5(KEYS1), XDATA1	/* 5. ENC */
+	aesenc		XKEY2_5, XDATA2		/* 5. ENC */
+
+	I = 3
+	.rept 5
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*5(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 6. ENC */
+	I = 0
+	.rept 3
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*6(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	aesenc		XKEY3_6, XDATA3		/* 6. ENC */
+	I = 4
+	.rept 4
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*6(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 7. ENC */
+	I = 0
+	.rept 4
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*7(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	aesenc		XKEY4_7, XDATA4		/* 7. ENC */
+	I = 5
+	.rept 3
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*7(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+
+	/* 8. ENC */
+	I = 0
+	.rept 5
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*8(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	aesenc		XKEY5_8, XDATA5		/* 8. ENC */
+	aesenc		16*8(KEYS6), XDATA6	/* 8. ENC */
+	aesenc		16*8(KEYS7), XDATA7	/* 8. ENC */
+
+	/* 9. ENC */
+	I = 0
+	.rept 6
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		aesenc	16*9(reg_KEY), reg_XDAT
+		I = (I + 1)
+	.endr
+	mov		(OUT_OFFSET + 8*0)(ARG), TMP
+	aesenc		XKEY6_9, XDATA6		/* 9. ENC */
+	aesenc		16*9(KEYS7), XDATA7	/* 9. ENC */
+
+	/* 10. ENC (last for 128 bit key) */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		define_reg	KEY_REG_SUFX
+		.if (\key_len == AES_KEYSIZE_128)
+			aesenclast	16*10(reg_KEY), reg_XDAT
+		.else
+			aesenc	16*10(reg_KEY), reg_XDAT
+		.endif
+		I = (I + 1)
+	.endr
+
+	.if (\key_len != AES_KEYSIZE_128)
+		/* 11. ENC */
+		I = 0
+		.rept 8
+			define_reg	XDATA_REG_SUFX
+			define_reg	KEY_REG_SUFX
+			aesenc	16*11(reg_KEY), reg_XDAT
+			I = (I + 1)
+		.endr
+
+		/* 12. last ENC for 192bit key */
+		I = 0
+		.rept 8
+			define_reg	XDATA_REG_SUFX
+			define_reg	KEY_REG_SUFX
+			.if (\key_len == AES_KEYSIZE_192)
+				aesenclast	16*12(reg_KEY), reg_XDAT
+			.else
+				aesenc		16*12(reg_KEY), reg_XDAT
+			.endif
+			I = (I + 1)
+		.endr
+
+		.if \key_len == AES_KEYSIZE_256
+			/* for 256bit, two more rounds */
+			/* 13. ENC */
+			I = 0
+			.rept 8
+				define_reg	XDATA_REG_SUFX
+				define_reg	KEY_REG_SUFX
+				aesenc	16*13(reg_KEY), reg_XDAT
+				I = (I + 1)
+			.endr
+
+			/* 14. last ENC for 256bit key */
+			I = 0
+			.rept 8
+				define_reg	XDATA_REG_SUFX
+				define_reg	KEY_REG_SUFX
+				aesenclast	16*14(reg_KEY), reg_XDAT
+				I = (I + 1)
+			.endr
+		.endif
+	.endif
+
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		/* write back cipher text */
+		MOVDQ		reg_XDAT, (TMP , IDX)
+		I = (I + 1)
+		.if (I < 8)
+			mov		(OUT_OFFSET + 8*I)(ARG), TMP
+		.endif
+	.endr
+
+	add	$16, IDX
+	cmp	IDX, LEN_AREA_OFFSET(LEN_AREA_REG)
+	jne	.Lmain_loop\key_len
+
+.Ldone\key_len:
+	/* update IV */
+	I = 0
+	.rept 8
+		define_reg	XDATA_REG_SUFX
+		movdqa	reg_XDAT, (IV_OFFSET + 16*I)(ARG)
+		I = (I + 1)
+	.endr
+
+	/* update IN and OUT */
+	movd	LEN_AREA_OFFSET(LEN_AREA_REG), %xmm0
+	pshufd	$0x44, %xmm0, %xmm0
+
+	I = 1
+	.rept 4
+		define_reg	XMM_REG_SUFX
+		movdqa	(IN_OFFSET + 16*(I-1))(ARG), reg_XMM
+		I = (I + 1)
+	.endr
+
+	paddq	%xmm0, %xmm1
+	paddq	%xmm0, %xmm2
+	paddq	%xmm0, %xmm3
+	paddq	%xmm0, %xmm4
+
+	I = 5
+	.rept 4
+		define_reg	XMM_REG_SUFX
+		movdqa	(OUT_OFFSET + 16*(I-5))(ARG), reg_XMM
+		I = (I + 1)
+	.endr
+
+	I = 1
+	.rept 4
+		define_reg	XMM_REG_SUFX
+		movdqa	reg_XMM, (IN_OFFSET + 16*(I-1))(ARG)
+		I = (I + 1)
+	.endr
+
+	paddq	%xmm0, %xmm5
+	paddq	%xmm0, %xmm6
+	paddq	%xmm0, %xmm7
+	paddq	%xmm0, %xmm8
+
+	I = 5
+	.rept 4
+		define_reg	XMM_REG_SUFX
+		movdqa	reg_XMM, (OUT_OFFSET + 16*(I-5))(ARG)
+		I = (I + 1)
+	.endr
+
+	mov	(XMM_SAVE_SIZE + 8*0)(GPR_SAVE_REG), %rbx
+	mov	(XMM_SAVE_SIZE + 8*3)(GPR_SAVE_REG), %rbp
+	mov	(XMM_SAVE_SIZE + 8*4)(GPR_SAVE_REG), %r12
+	mov	(XMM_SAVE_SIZE + 8*5)(GPR_SAVE_REG), %r13
+	mov	(XMM_SAVE_SIZE + 8*6)(GPR_SAVE_REG), %r14
+	mov	(XMM_SAVE_SIZE + 8*7)(GPR_SAVE_REG), %r15
+
+	add	$STACK_SIZE, %rsp
+
+	ret
+.endm
+
+/*
+ * AES CBC encryption routine supporting 128/192/256 bit keys
+ *
+ * void aes_cbc_enc_128_x8(struct aes_cbc_args_x8 *args, u64 len);
+ * arg 1: rcx : addr of AES_ARGS_x8 structure
+ * arg 2: rdx : len (in units of 16-byte blocks)
+ * void aes_cbc_enc_192_x8(struct aes_cbc_args_x8 *args, u64 len);
+ * arg 1: rcx : addr of aes_cbc_args_x8 structure
+ * arg 2: rdx : len (in units of 16-byte blocks)
+ * void aes_cbc_enc_256_x8(struct aes_cbc_args_x8 *args, u64 len);
+ * arg 1: rcx : addr of aes_cbc_args_x8 structure
+ * arg 2: rdx : len (in units of 16-byte blocks)
+ */
+
+ENTRY(aes_cbc_enc_128_x8)
+
+	aes_cbc_enc_x8 AES_KEYSIZE_128
+
+ENDPROC(aes_cbc_enc_128_x8)
+
+ENTRY(aes_cbc_enc_192_x8)
+
+	aes_cbc_enc_x8 AES_KEYSIZE_192
+
+ENDPROC(aes_cbc_enc_192_x8)
+
+ENTRY(aes_cbc_enc_256_x8)
+
+	aes_cbc_enc_x8 AES_KEYSIZE_256
+
+ENDPROC(aes_cbc_enc_256_x8)