From 9a28ad590ca6137bf5e19ba477e3f379527bbd73 Mon Sep 17 00:00:00 2001 From: Foudge Date: Sun, 28 Jan 2018 12:58:19 +0100 Subject: [PATCH] up to 20% perf increase with Cryptonight with non-AES CPU This time, the performance increase is obtained with both MSVC and GCC. On non-AES CPUs, there was a useless load/store of an SSE2 register. The previous MSVC "hack" is replaced by portable code that is also more complete (a load is saved). On my C2Q6600, with 3 threads, I get +16% with MSVC2015 and +20% with GCC 7.3, compared to the official 2.4.4 version. --- src/crypto/CryptoNight_arm.h | 30 +++++++++-------- src/crypto/CryptoNight_x86.h | 62 ++++++++++++++++++------------------ src/crypto/soft_aes.h | 17 +++------- 3 files changed, 52 insertions(+), 57 deletions(-) diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 15be6c3dc..17bba7aff 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -194,14 +194,14 @@ template static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) { if (SOFT_AES) { - *x0 = soft_aesenc(*x0, key); - *x1 = soft_aesenc(*x1, key); - *x2 = soft_aesenc(*x2, key); - *x3 = soft_aesenc(*x3, key); - *x4 = soft_aesenc(*x4, key); - *x5 = soft_aesenc(*x5, key); - *x6 = soft_aesenc(*x6, key); - *x7 = soft_aesenc(*x7, key); + *x0 = soft_aesenc((uint32_t*)x0, key); + *x1 = soft_aesenc((uint32_t*)x1, key); + *x2 = soft_aesenc((uint32_t*)x2, key); + *x3 = soft_aesenc((uint32_t*)x3, key); + *x4 = soft_aesenc((uint32_t*)x4, key); + *x5 = soft_aesenc((uint32_t*)x5, key); + *x6 = soft_aesenc((uint32_t*)x6, key); + *x7 = soft_aesenc((uint32_t*)x7, key); } # ifndef XMRIG_ARMv7 else { @@ -361,12 +361,13 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void * uint64_t idx0 = h0[0] ^ h0[4]; for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); + __m128i cx; if (SOFT_AES) { - cx = 
soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); } else { + cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); # ifndef XMRIG_ARMv7 cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); # endif @@ -425,14 +426,15 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size, uint64_t idx1 = h1[0] ^ h1[4]; for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); + __m128i cx0, cx1; if (SOFT_AES) { - cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); } else { + cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); # ifndef XMRIG_ARMv7 cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 362a1a9f4..786d28f1b 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -193,14 +193,14 @@ template static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) { if (SOFT_AES) { - *x0 = soft_aesenc(*x0, key); - *x1 = soft_aesenc(*x1, key); - *x2 = soft_aesenc(*x2, key); - *x3 = soft_aesenc(*x3, key); - *x4 = soft_aesenc(*x4, key); - *x5 = soft_aesenc(*x5, key); - *x6 = soft_aesenc(*x6, key); - *x7 = soft_aesenc(*x7, key); + *x0 = soft_aesenc((uint32_t*)x0, key); + *x1 = soft_aesenc((uint32_t*)x1, key); + *x2 = soft_aesenc((uint32_t*)x2, key); + *x3 = 
soft_aesenc((uint32_t*)x3, key); + *x4 = soft_aesenc((uint32_t*)x4, key); + *x5 = soft_aesenc((uint32_t*)x5, key); + *x6 = soft_aesenc((uint32_t*)x6, key); + *x7 = soft_aesenc((uint32_t*)x7, key); } else { *x0 = _mm_aesenc_si128(*x0, key); @@ -324,19 +324,18 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void * uint64_t idx0 = h0[0] ^ h0[4]; for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx; - cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); + __m128i cx; - if (SOFT_AES) { - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - } - else { - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); - } - - _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - idx0 = EXTRACT64(cx); - bx0 = cx; + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + } + else { + cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + } + _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); + idx0 = EXTRACT64(cx); + bx0 = cx; uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; @@ -386,18 +385,19 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size, uint64_t idx0 = h0[0] ^ h0[4]; uint64_t idx1 = h1[0] ^ h1[4]; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0, cx1; - if (SOFT_AES) { - cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); - } - else { - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - } + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + } + else { + cx0 = 
_mm_load_si128((__m128i *) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + } _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); diff --git a/src/crypto/soft_aes.h b/src/crypto/soft_aes.h index 148f39c1c..b7698ac4f 100644 --- a/src/crypto/soft_aes.h +++ b/src/crypto/soft_aes.h @@ -89,19 +89,12 @@ alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); -static inline __m128i soft_aesenc(__m128i in, __m128i key) +static inline __m128i soft_aesenc(const uint32_t* in, __m128i key) { -#if defined(_MSC_VER) - const uint32_t x0 = in.m128i_u32[0]; - const uint32_t x1 = in.m128i_u32[1]; - const uint32_t x2 = in.m128i_u32[2]; - const uint32_t x3 = in.m128i_u32[3]; -#else - const uint32_t x0 = _mm_cvtsi128_si32(in); - const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); - const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); - const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); -#endif + const uint32_t x0 = in[0]; + const uint32_t x1 = in[1]; + const uint32_t x2 = in[2]; + const uint32_t x3 = in[3]; __m128i out = _mm_set_epi32( (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),