From d25e44ea6169ef362ab0269140cb5aa492aa197b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olivier=20Ch=C3=A9ron?= Date: Sun, 19 May 2019 11:18:40 +0200 Subject: [PATCH] Add GHASH implementation with PCLMULQDQ --- cbits/aes/gf.c | 4 +- cbits/aes/gf.h | 4 +- cbits/aes/x86ni.c | 93 ++++++++++++++++++++++++++++++++++++++++-- cbits/aes/x86ni.h | 5 ++- cbits/cryptonite_aes.c | 23 +++++++++-- 5 files changed, 116 insertions(+), 13 deletions(-) diff --git a/cbits/aes/gf.c b/cbits/aes/gf.c index 7aeccf6..7dcc12c 100644 --- a/cbits/aes/gf.c +++ b/cbits/aes/gf.c @@ -39,7 +39,7 @@ * to speed up the multiplication. * TODO: optimise with tables */ -void cryptonite_gf_mul(block128 *a, block128 *b) +void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b) { uint64_t a0, a1, v0, v1; int i, j; @@ -62,7 +62,7 @@ void cryptonite_gf_mul(block128 *a, block128 *b) } /* inplace GFMUL for xts mode */ -void cryptonite_gf_mulx(block128 *a) +void cryptonite_aes_generic_gf_mulx(block128 *a) { const uint64_t gf_mask = cpu_to_le64(0x8000000000000000ULL); uint64_t r = ((a->q[1] & gf_mask) ? 
cpu_to_le64(0x87) : 0); diff --git a/cbits/aes/gf.h b/cbits/aes/gf.h index 329d290..21b542c 100644 --- a/cbits/aes/gf.h +++ b/cbits/aes/gf.h @@ -32,7 +32,7 @@ #include "aes/block128.h" -void cryptonite_gf_mul(block128 *a, block128 *b); -void cryptonite_gf_mulx(block128 *a); +void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b); +void cryptonite_aes_generic_gf_mulx(block128 *a); #endif diff --git a/cbits/aes/x86ni.c b/cbits/aes/x86ni.c index 2b8eeb6..556bde1 100644 --- a/cbits/aes/x86ni.c +++ b/cbits/aes/x86ni.c @@ -158,18 +158,103 @@ static __m128i gfmulx(__m128i v) return v; } -static __m128i ghash_add(__m128i tag, __m128i h, __m128i m) +static __m128i gfmul_generic(__m128i tag, __m128i h) { aes_block _t, _h; - tag = _mm_xor_si128(tag, m); - _mm_store_si128((__m128i *) &_t, tag); _mm_store_si128((__m128i *) &_h, h); - cryptonite_gf_mul(&_t, &_h); + cryptonite_aes_generic_gf_mul(&_t, &_h); tag = _mm_load_si128((__m128i *) &_t); return tag; } +#ifdef WITH_PCLMUL + +__m128i (*gfmul_branch_ptr)(__m128i a, __m128i b) = gfmul_generic; +#define gfmul(a,b) ((*gfmul_branch_ptr)(a,b)) + +/* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf + * + * Adapted from figure 5, with additional byte swapping so that interface + * is similar to cryptonite_aes_generic_gf_mul. 
+ */ +static __m128i gfmul_pclmuldq(__m128i a, __m128i b) +{ + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); + + a = _mm_shuffle_epi8(a, bswap_mask); + b = _mm_shuffle_epi8(b, bswap_mask); + + tmp3 = _mm_clmulepi64_si128(a, b, 0x00); + tmp4 = _mm_clmulepi64_si128(a, b, 0x10); + tmp5 = _mm_clmulepi64_si128(a, b, 0x01); + tmp6 = _mm_clmulepi64_si128(a, b, 0x11); + + tmp4 = _mm_xor_si128(tmp4, tmp5); + tmp5 = _mm_slli_si128(tmp4, 8); + tmp4 = _mm_srli_si128(tmp4, 8); + tmp3 = _mm_xor_si128(tmp3, tmp5); + tmp6 = _mm_xor_si128(tmp6, tmp4); + + tmp7 = _mm_srli_epi32(tmp3, 31); + tmp8 = _mm_srli_epi32(tmp6, 31); + tmp3 = _mm_slli_epi32(tmp3, 1); + tmp6 = _mm_slli_epi32(tmp6, 1); + + tmp9 = _mm_srli_si128(tmp7, 12); + tmp8 = _mm_slli_si128(tmp8, 4); + tmp7 = _mm_slli_si128(tmp7, 4); + tmp3 = _mm_or_si128(tmp3, tmp7); + tmp6 = _mm_or_si128(tmp6, tmp8); + tmp6 = _mm_or_si128(tmp6, tmp9); + + tmp7 = _mm_slli_epi32(tmp3, 31); + tmp8 = _mm_slli_epi32(tmp3, 30); + tmp9 = _mm_slli_epi32(tmp3, 25); + + tmp7 = _mm_xor_si128(tmp7, tmp8); + tmp7 = _mm_xor_si128(tmp7, tmp9); + tmp8 = _mm_srli_si128(tmp7, 4); + tmp7 = _mm_slli_si128(tmp7, 12); + tmp3 = _mm_xor_si128(tmp3, tmp7); + + tmp2 = _mm_srli_epi32(tmp3, 1); + tmp4 = _mm_srli_epi32(tmp3, 2); + tmp5 = _mm_srli_epi32(tmp3, 7); + tmp2 = _mm_xor_si128(tmp2, tmp4); + tmp2 = _mm_xor_si128(tmp2, tmp5); + tmp2 = _mm_xor_si128(tmp2, tmp8); + tmp3 = _mm_xor_si128(tmp3, tmp2); + tmp6 = _mm_xor_si128(tmp6, tmp3); + + return _mm_shuffle_epi8(tmp6, bswap_mask); +} + +void cryptonite_aesni_gf_mul(block128 *a, block128 *b) +{ + __m128i _a, _b, _c; + _a = _mm_loadu_si128((__m128i *) a); + _b = _mm_loadu_si128((__m128i *) b); + _c = gfmul_pclmuldq(_a, _b); + _mm_storeu_si128((__m128i *) a, _c); +} + +void cryptonite_aesni_init_pclmul() +{ + gfmul_branch_ptr = gfmul_pclmuldq; +} + +#else +#define gfmul(a,b) (gfmul_generic(a,b)) +#endif + +static inline 
__m128i ghash_add(__m128i tag, __m128i h, __m128i m) +{ + tag = _mm_xor_si128(tag, m); + return gfmul(tag, h); +} + #define PRELOAD_ENC_KEYS128(k) \ __m128i K0 = _mm_loadu_si128(((__m128i *) k)+0); \ __m128i K1 = _mm_loadu_si128(((__m128i *) k)+1); \ diff --git a/cbits/aes/x86ni.h b/cbits/aes/x86ni.h index f71ee0f..b9a568a 100644 --- a/cbits/aes/x86ni.h +++ b/cbits/aes/x86ni.h @@ -72,7 +72,10 @@ void cryptonite_aesni_encrypt_xts256(aes_block *out, aes_key *key1, aes_key *key void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length); void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length); -void gf_mul_x86ni(block128 *res, block128 *a_, block128 *b_); +#ifdef WITH_PCLMUL +void cryptonite_aesni_init_pclmul(); +void cryptonite_aesni_gf_mul(block128 *a, block128 *b); +#endif #endif diff --git a/cbits/cryptonite_aes.c b/cbits/cryptonite_aes.c index 0b017fc..b5ce004 100644 --- a/cbits/cryptonite_aes.c +++ b/cbits/cryptonite_aes.c @@ -81,6 +81,8 @@ enum { /* ccm */ ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256, DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256, + /* ghash */ + GHASH_GF_MUL, }; void *cryptonite_aes_branch_table[] = { @@ -141,6 +143,8 @@ void *cryptonite_aes_branch_table[] = { [DECRYPT_CCM_128] = cryptonite_aes_generic_ccm_decrypt, [DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt, [DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt, + /* GHASH */ + [GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul, }; typedef void (*init_f)(aes_key *, uint8_t *, uint8_t); @@ -152,6 +156,7 @@ typedef void (*gcm_crypt_f)(uint8_t *output, aes_gcm *gcm, aes_key *key, uint8_t typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length); typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length); typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input); 
+typedef void (*gf_mul_f)(aes_block *a, aes_block *b); #ifdef WITH_AESNI #define GET_INIT(strength) \ @@ -186,6 +191,8 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input); (((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i)) #define cryptonite_aes_decrypt_block(o,k,i) \ (((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i)) +#define cryptonite_gf_mul(a,b) \ + (((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,b)) #else #define GET_INIT(strenght) cryptonite_aes_generic_init #define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb @@ -203,6 +210,7 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input); #define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt #define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i) #define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i) +#define cryptonite_gf_mul(a,b) cryptonite_aes_generic_gf_mul(a,b) #endif #if defined(ARCH_X86) && defined(WITH_AESNI) @@ -241,6 +249,13 @@ static void initialize_table_ni(int aesni, int pclmul) cryptonite_aes_branch_table[ENCRYPT_OCB_128] = cryptonite_aesni_ocb_encrypt128; cryptonite_aes_branch_table[ENCRYPT_OCB_256] = cryptonite_aesni_ocb_encrypt256; */ +#ifdef WITH_PCLMUL + if (!pclmul) + return; + /* GHASH */ + cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul; + cryptonite_aesni_init_pclmul(); +#endif } #endif @@ -761,9 +776,9 @@ void cryptonite_aes_generic_encrypt_xts(aes_block *output, aes_key *k1, aes_key /* TO OPTIMISE: this is really inefficient way to do that */ while (spoint-- > 0) - cryptonite_gf_mulx(&tweak); + cryptonite_aes_generic_gf_mulx(&tweak); - for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) { + for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) { block128_vxor(&block, input, &tweak); 
cryptonite_aes_encrypt_block(&block, k1, &block); block128_vxor(output, &block, &tweak); @@ -781,9 +796,9 @@ void cryptonite_aes_generic_decrypt_xts(aes_block *output, aes_key *k1, aes_key /* TO OPTIMISE: this is really inefficient way to do that */ while (spoint-- > 0) - cryptonite_gf_mulx(&tweak); + cryptonite_aes_generic_gf_mulx(&tweak); - for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) { + for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) { block128_vxor(&block, input, &tweak); cryptonite_aes_decrypt_block(&block, k1, &block); block128_vxor(output, &block, &tweak);