@@ -19,8 +19,9 @@ CFLAGS += $(CFLAGS_xeninclude)
SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
+AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF)
OPMASK := avx512f avx512dq avx512bw
@@ -143,6 +144,14 @@ $(1)-cflags := \
$(foreach flt,$($(1)-flts), \
"-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
endef
+# Per-flavor compiler flags for the AES test cases.  "*-aes" flavors
+# (legacy / VEX encoded, 128-bit only) take $(sse-vecs) as their vector
+# width list, while "*-vaes" flavors take the width list of their base ISA
+# (the %-vaes stem).  $(subst -,$(space),...) plus $(addprefix -m,...)
+# turn e.g. avx2-vaes into "-mavx2 -mvaes".
+define simd-aes-defs
+$(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
+ "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
define simd-gf-defs
$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
"-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
@@ -153,6 +158,7 @@ endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
@@ -203,10 +209,13 @@ $(addsuffix .c,$(FMA)):
$(addsuffix .c,$(SG)):
ln -sf simd-sg.c $@
+$(addsuffix .c,$(AES)):
+ ln -sf simd-aes.c $@
+
$(addsuffix .c,$(GF)):
ln -sf simd-gf.c $@
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h
xop.h avx512f.h: simd-fma.c
@@ -0,0 +1,134 @@
+/*
+ * Self-contained test of the AESNI / VAES round primitive insns.  No
+ * reference vectors are needed: with an all-zero round key the individual
+ * round insns are mutually inverse up to (Inv)MixColumns, which AESIMC
+ * (wrapped by imc() below) supplies.
+ */
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(aes_test);
+
+/* AES(): width-specific VAES builtin; imc(): InvMixColumns of a vector. */
+#if VEC_SIZE == 16
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v16qi(a)
+# define imc(x) ((vec_t)__builtin_ia32_aesimc128((vdi_t)(x)))
+#elif VEC_SIZE == 32
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v32qi(a)
+/* There's no 256-bit AESIMC - run the 128-bit insn on each lane. */
+# define imc(x) ({ \
+ vec_t r_; \
+ unsigned char __attribute__((vector_size(16))) t_; \
+ asm ( "vaesimc (%3), %x0\n\t" \
+ "vaesimc 16(%3), %1\n\t" \
+ "vinserti128 $1, %1, %0, %0" \
+ : "=&v" (r_), "=&v" (t_) \
+ : "m" (x), "r" (&(x)) ); \
+ r_; \
+})
+#elif VEC_SIZE == 64
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v64qi(a)
+/* Likewise no 512-bit AESIMC - handle the four 128-bit lanes one by one. */
+# define imc(x) ({ \
+ vec_t r_; \
+ unsigned char __attribute__((vector_size(16))) t_; \
+ asm ( "vaesimc (%3), %x0\n\t" \
+ "vaesimc 1*16(%3), %1\n\t" \
+ "vinserti32x4 $1, %1, %0, %0\n\t" \
+ "vaesimc 2*16(%3), %1\n\t" \
+ "vinserti32x4 $2, %1, %0, %0\n\t" \
+ "vaesimc 3*16(%3), %1\n\t" \
+ "vinserti32x4 $3, %1, %0, %0" \
+ : "=&v" (r_), "=&v" (t_) \
+ : "m" (x), "r" (&(x)) ); \
+ r_; \
+})
+#endif
+
+/*
+ * eq(): whole-vector equality, via a mask-register compare (AVX512BW) or
+ * via ptest / pmovmskb.
+ * aes(): the enc/dec round insn - the EVEX/VEX builtins where the flavor
+ * under test provides them, the legacy 128-bit ones otherwise.
+ */
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+#else
+# if defined(__AVX2__) && VEC_SIZE == 32
+# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+# else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# define aes(op, x, y) ((vec_t)__builtin_ia32_aes ## op ## 128((vdi_t)(x), (vdi_t)(y)))
+# endif
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
+int aes_test(void)
+{
+ unsigned int i;
+ vec_t src, zero = {};
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ src[i] = i;
+
+ do {
+ vec_t x, y;
+
+ /*
+ * With an all-zero round key:
+ * aesenclast(x, 0) = ShiftRows(SubBytes(x))
+ * aesdec(y, 0) = InvMixColumns(InvSubBytes(InvShiftRows(y)))
+ * hence aesdec(aesenclast(x, 0), 0) == InvMixColumns(x) == imc(x).
+ */
+ touch(src);
+ x = imc(src);
+ touch(src);
+
+ touch(zero);
+ y = aes(enclast, src, zero);
+ touch(zero);
+ y = aes(dec, y, zero);
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ /*
+ * Conversely aesenc(aesdeclast(x, 0), 0) == MixColumns(x), which
+ * imc() then maps back to x.
+ */
+ touch(zero);
+ x = aes(declast, src, zero);
+ touch(zero);
+ y = aes(enc, x, zero);
+ touch(y);
+ x = imc(y);
+
+ if ( !eq(x, src) ) return __LINE__;
+
+#if VEC_SIZE == 16
+ /*
+ * With an RCON of zero, dwords 1/3 of AESKEYGENASSIST's result are
+ * RotWord() of dwords 0/2.  The byte shuffle below reproduces the
+ * insn's output from itself exactly when that relation holds.
+ */
+ touch(src);
+ x = (vec_t)__builtin_ia32_aeskeygenassist128((vdi_t)src, 0);
+ touch(src);
+ y = (vec_t)__builtin_ia32_pshufb128((vqi_t)x,
+ (vqi_t){ 7, 4, 5, 6,
+ 1, 2, 3, 0,
+ 15, 12, 13, 14,
+ 9, 10, 11, 8 });
+ if ( !eq(x, y) ) return __LINE__;
+#endif
+
+ /* Step all lanes, so that all byte values get used as inputs. */
+ src += ELEM_COUNT;
+ i += ELEM_COUNT;
+ } while ( i <= 256 );
+
+ return 0;
+}
@@ -340,6 +340,10 @@ REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
# endif
+OVR(aesdec);
+OVR(aesdeclast);
+OVR(aesenc);
+OVR(aesenclast);
OVR(cvtpd2dqx);
OVR(cvtpd2dqy);
OVR(cvtpd2psx);
@@ -12,12 +12,15 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "sse.h"
#include "sse2.h"
#include "sse2-gf.h"
+#include "ssse3-aes.h"
#include "sse4.h"
#include "avx.h"
+#include "avx-aes.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
+#include "avx2-vaes.h"
#include "avx2-gf.h"
#include "xop.h"
#include "avx512f-opmask.h"
@@ -27,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512f-sg.h"
#include "avx512vl-sg.h"
#include "avx512bw.h"
+#include "avx512bw-vaes.h"
#include "avx512bw-gf.h"
#include "avx512dq.h"
#include "avx512er.h"
@@ -91,6 +95,16 @@ static bool simd_check_xop(void)
return cpu_has_xop;
}
+static bool simd_check_ssse3_aes(void)
+{
+ return cpu_has_aesni && cpu_has_ssse3;
+}
+
+static bool simd_check_avx_aes(void)
+{
+ return cpu_has_aesni && cpu_has_avx;
+}
+
static bool simd_check_avx512f(void)
{
return cpu_has_avx512f;
@@ -141,6 +155,22 @@ static bool simd_check_avx512vbmi_vl(voi
return cpu_has_avx512_vbmi && cpu_has_avx512vl;
}
+static bool simd_check_avx2_vaes(void)
+{
+ return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_vaes(void)
+{
+ return cpu_has_aesni && cpu_has_vaes && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_vaes_vl(void)
+{
+ return cpu_has_aesni && cpu_has_vaes &&
+ cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
static bool simd_check_sse2_gf(void)
{
return cpu_has_gfni && cpu_has_sse2;
@@ -319,6 +349,8 @@ static const struct {
SIMD(XOP i16x16, xop, 32i2),
SIMD(XOP i32x8, xop, 32i4),
SIMD(XOP i64x4, xop, 32i8),
+ SIMD(AES (legacy), ssse3_aes, 16),
+ SIMD(AES (VEX/x16), avx_aes, 16),
SIMD(OPMASK/w, avx512f_opmask, 2),
SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
@@ -418,6 +450,10 @@ static const struct {
AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2),
AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2),
AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2),
+ SIMD(VAES (VEX/x32), avx2_vaes, 32),
+ SIMD(VAES (EVEX/x64), avx512bw_vaes, 64),
+ AVX512VL(VL+VAES (x16), avx512bw_vaes, 16),
+ AVX512VL(VL+VAES (x32), avx512bw_vaes, 32),
SIMD(GFNI (legacy), sse2_gf, 16),
SIMD(GFNI (VEX/x16), avx2_gf, 16),
SIMD(GFNI (VEX/x32), avx2_gf, 32),
@@ -125,10 +125,12 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_sse cp.basic.sse
#define cpu_has_sse2 cp.basic.sse2
#define cpu_has_sse3 cp.basic.sse3
+#define cpu_has_ssse3 cp.basic.ssse3
#define cpu_has_fma (cp.basic.fma && xcr0_mask(6))
#define cpu_has_sse4_1 cp.basic.sse4_1
#define cpu_has_sse4_2 cp.basic.sse4_2
#define cpu_has_popcnt cp.basic.popcnt
+#define cpu_has_aesni cp.basic.aesni
#define cpu_has_avx (cp.basic.avx && xcr0_mask(6))
#define cpu_has_f16c (cp.basic.f16c && xcr0_mask(6))