Up to 20% performance increase for Cryptonight on non-AES CPUs

This time, the performance increase is obtained with both MSVC and GCC. On non-AES CPUs, there was a useless load/store of an SSE2 register. The previous MSVC "hack" is replaced by portable code, which is also more complete (a load is saved).

On my C2Q6600, with 3 threads, I get +16% with MSVC2015 and +20% with GCC 7.3, compared to the official 2.4.4 version.
This commit is contained in:
Foudge 2018-01-28 12:58:19 +01:00
parent 15fe6ce23f
commit 9a28ad590c
3 changed files with 52 additions and 57 deletions

View file

@ -194,14 +194,14 @@ template<bool SOFT_AES>
static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{ {
if (SOFT_AES) { if (SOFT_AES) {
*x0 = soft_aesenc(*x0, key); *x0 = soft_aesenc((uint32_t*)x0, key);
*x1 = soft_aesenc(*x1, key); *x1 = soft_aesenc((uint32_t*)x1, key);
*x2 = soft_aesenc(*x2, key); *x2 = soft_aesenc((uint32_t*)x2, key);
*x3 = soft_aesenc(*x3, key); *x3 = soft_aesenc((uint32_t*)x3, key);
*x4 = soft_aesenc(*x4, key); *x4 = soft_aesenc((uint32_t*)x4, key);
*x5 = soft_aesenc(*x5, key); *x5 = soft_aesenc((uint32_t*)x5, key);
*x6 = soft_aesenc(*x6, key); *x6 = soft_aesenc((uint32_t*)x6, key);
*x7 = soft_aesenc(*x7, key); *x7 = soft_aesenc((uint32_t*)x7, key);
} }
# ifndef XMRIG_ARMv7 # ifndef XMRIG_ARMv7
else { else {
@ -361,12 +361,13 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void *
uint64_t idx0 = h0[0] ^ h0[4]; uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); __m128i cx;
if (SOFT_AES) { if (SOFT_AES) {
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
} }
else { else {
cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
# ifndef XMRIG_ARMv7 # ifndef XMRIG_ARMv7
cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
# endif # endif
@ -425,14 +426,15 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size,
uint64_t idx1 = h1[0] ^ h1[4]; uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); __m128i cx0, cx1;
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
if (SOFT_AES) { if (SOFT_AES) {
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
} }
else { else {
cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
# ifndef XMRIG_ARMv7 # ifndef XMRIG_ARMv7
cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);

View file

@ -193,14 +193,14 @@ template<bool SOFT_AES>
static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
{ {
if (SOFT_AES) { if (SOFT_AES) {
*x0 = soft_aesenc(*x0, key); *x0 = soft_aesenc((uint32_t*)x0, key);
*x1 = soft_aesenc(*x1, key); *x1 = soft_aesenc((uint32_t*)x1, key);
*x2 = soft_aesenc(*x2, key); *x2 = soft_aesenc((uint32_t*)x2, key);
*x3 = soft_aesenc(*x3, key); *x3 = soft_aesenc((uint32_t*)x3, key);
*x4 = soft_aesenc(*x4, key); *x4 = soft_aesenc((uint32_t*)x4, key);
*x5 = soft_aesenc(*x5, key); *x5 = soft_aesenc((uint32_t*)x5, key);
*x6 = soft_aesenc(*x6, key); *x6 = soft_aesenc((uint32_t*)x6, key);
*x7 = soft_aesenc(*x7, key); *x7 = soft_aesenc((uint32_t*)x7, key);
} }
else { else {
*x0 = _mm_aesenc_si128(*x0, key); *x0 = _mm_aesenc_si128(*x0, key);
@ -324,19 +324,18 @@ inline void cryptonight_hash(const void *__restrict__ input, size_t size, void *
uint64_t idx0 = h0[0] ^ h0[4]; uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
__m128i cx; __m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
if (SOFT_AES) { if (SOFT_AES) {
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
} }
else { else {
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
} cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
}
_mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx); idx0 = EXTRACT64(cx);
bx0 = cx; bx0 = cx;
uint64_t hi, lo, cl, ch; uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & MASK])[0]; cl = ((uint64_t*) &l0[idx0 & MASK])[0];
@ -386,18 +385,19 @@ inline void cryptonight_double_hash(const void *__restrict__ input, size_t size,
uint64_t idx0 = h0[0] ^ h0[4]; uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4]; uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); __m128i cx0, cx1;
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
if (SOFT_AES) { if (SOFT_AES) {
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
} }
else { else {
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
} cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
}
_mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
_mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));

View file

@ -89,19 +89,12 @@
alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0);
static inline __m128i soft_aesenc(__m128i in, __m128i key) static inline __m128i soft_aesenc(const uint32_t* in, __m128i key)
{ {
#if defined(_MSC_VER) const uint32_t x0 = in[0];
const uint32_t x0 = in.m128i_u32[0]; const uint32_t x1 = in[1];
const uint32_t x1 = in.m128i_u32[1]; const uint32_t x2 = in[2];
const uint32_t x2 = in.m128i_u32[2]; const uint32_t x3 = in[3];
const uint32_t x3 = in.m128i_u32[3];
#else
const uint32_t x0 = _mm_cvtsi128_si32(in);
const uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
const uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
const uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
#endif
__m128i out = _mm_set_epi32( __m128i out = _mm_set_epi32(
(saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]), (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),