From e67eb477960effcc0b030fa4656d64f186b557ca Mon Sep 17 00:00:00 2001
From: SChernykh
Date: Tue, 23 Nov 2021 21:32:44 +0100
Subject: [PATCH] Faster quad hash for GhostRider algos (Ryzen CPUs)

---
 src/crypto/cn/CryptoNight_x86.h | 190 ++++++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)

diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h
index f718bd7a5..7ca22cd3b 100644
--- a/src/crypto/cn/CryptoNight_x86.h
+++ b/src/crypto/cn/CryptoNight_x86.h
@@ -1299,6 +1299,188 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
 }
 
 
+static inline void cryptonight_monero_tweak_gr(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i cx)
+{ // Stores bx0 ^ cx into mem_out[0..1]; the high qword is additionally XOR-ed with a tweak1_table entry. l, idx and ax0 are not referenced in the body.
+    __m128i tmp = _mm_xor_si128(bx0, cx);
+    mem_out[0] = _mm_cvtsi128_si64(tmp); // low 64 bits of the XOR are written unchanged
+
+    tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); // move the upper 64 bits of tmp into the lower half
+    uint64_t vh = _mm_cvtsi128_si64(tmp); // vh = high qword of bx0 ^ cx
+
+    mem_out[1] = vh ^ tweak1_table[static_cast(vh) >> 24]; // table index comes from vh shifted right by 24. NOTE(review): the static_cast target type is missing in this copy - confirm against upstream
+}
+
+
+template // NOTE(review): the template parameter list is absent in this copy; the body reads SOFT_AES and the caller tests ALGO, so presumably those are the parameters - confirm against upstream
+void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
+{ // Hashes four inputs in one pass: message k is read at input + size * k, uses ctx[k], and its result is written at output + 32 * k.
+    constexpr CnAlgo props;
+    constexpr size_t MASK = props.mask(); // applied to every scratchpad index below
+    constexpr Algorithm::Id BASE = props.base();
+
+    if (BASE == Algorithm::CN_1 && size < 43) { // CN_1-based algorithms bail out on inputs shorter than 43 bytes
+        memset(output, 0, 64); // NOTE(review): only 64 bytes are cleared although results are later written at offsets up to output + 96 - confirm the intended length
+        return;
+    }
+
+    keccak(input + size * 0, size, ctx[0]->state); // run keccak over message k into ctx[k]->state
+    keccak(input + size * 1, size, ctx[1]->state);
+    keccak(input + size * 2, size, ctx[2]->state);
+    keccak(input + size * 3, size, ctx[3]->state);
+
+    uint8_t* l0 = ctx[0]->memory; // per-lane memory buffers, indexed with MASK in the loop
+    uint8_t* l1 = ctx[1]->memory;
+    uint8_t* l2 = ctx[2]->memory;
+    uint8_t* l3 = ctx[3]->memory;
+
+    uint64_t* h0 = reinterpret_cast(ctx[0]->state); // 64-bit views of the per-lane state
+    uint64_t* h1 = reinterpret_cast(ctx[1]->state);
+    uint64_t* h2 = reinterpret_cast(ctx[2]->state);
+    uint64_t* h3 = reinterpret_cast(ctx[3]->state);
+
+    VARIANT1_INIT(0); // macro defined outside this hunk; presumably introduces tweak1_2_0..tweak1_2_3 used in the loop - verify
+    VARIANT1_INIT(1);
+    VARIANT1_INIT(2);
+    VARIANT1_INIT(3);
+
+    if (props.half_mem()) {
+        ctx[0]->first_half = true;
+        ctx[1]->first_half = true;
+        ctx[2]->first_half = true;
+        ctx[3]->first_half = true;
+    }
+
+    cn_explode_scratchpad(ctx[0]); // defined outside this hunk; called once per lane before the main loop
+    cn_explode_scratchpad(ctx[1]);
+    cn_explode_scratchpad(ctx[2]);
+    cn_explode_scratchpad(ctx[3]);
+
+    uint64_t al0 = h0[0] ^ h0[4]; // low half of a: state word 0 XOR word 4
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t al2 = h2[0] ^ h2[4];
+    uint64_t al3 = h3[0] ^ h3[4];
+
+    uint64_t ah0 = h0[1] ^ h0[5]; // high half of a: state word 1 XOR word 5
+    uint64_t ah1 = h1[1] ^ h1[5];
+    uint64_t ah2 = h2[1] ^ h2[5];
+    uint64_t ah3 = h3[1] ^ h3[5];
+
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); // b: words 2..3 XOR words 6..7 (_mm_set_epi64x takes the high half first)
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx20 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+    __m128i bx30 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+
+    uint64_t idx0 = al0; // the first lookup address is the low half of a
+    uint64_t idx1 = al1;
+    uint64_t idx2 = al2;
+    uint64_t idx3 = al3;
+
+    for (size_t i = 0; i < props.iterations(); i++) { // main loop: each round processes all four lanes, step by step
+        __m128i cx0, cx1, cx2, cx3;
+        if (!SOFT_AES) { // hardware path: load the 16-byte entries first (aligned loads)
+            cx0 = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK]));
+            cx1 = _mm_load_si128(reinterpret_cast(&l1[idx1 & MASK]));
+            cx2 = _mm_load_si128(reinterpret_cast(&l2[idx2 & MASK]));
+            cx3 = _mm_load_si128(reinterpret_cast(&l3[idx3 & MASK]));
+        }
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0); // pack the two halves of a into one 128-bit value
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+        const __m128i ax2 = _mm_set_epi64x(ah2, al2);
+        const __m128i ax3 = _mm_set_epi64x(ah3, al3);
+
+        if (SOFT_AES) { // software path: soft_aesenc is handed the entry address, a and the saes_table
+            cx0 = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast(saes_table));
+            cx1 = soft_aesenc(&l1[idx1 & MASK], ax1, reinterpret_cast(saes_table));
+            cx2 = soft_aesenc(&l2[idx2 & MASK], ax2, reinterpret_cast(saes_table));
+            cx3 = soft_aesenc(&l3[idx3 & MASK], ax3, reinterpret_cast(saes_table));
+        }
+        else { // hardware path: one AES encryption round on the loaded entry with a as the round key
+            cx0 = _mm_aesenc_si128(cx0, ax0);
+            cx1 = _mm_aesenc_si128(cx1, ax1);
+            cx2 = _mm_aesenc_si128(cx2, ax2);
+            cx3 = _mm_aesenc_si128(cx3, ax3);
+        }
+
+        cryptonight_monero_tweak_gr((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, cx0); // write tweaked b ^ c back to the entry that was just read
+        cryptonight_monero_tweak_gr((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, cx1);
+        cryptonight_monero_tweak_gr((uint64_t*)&l2[idx2 & MASK], l2, idx2 & MASK, ax2, bx20, cx2);
+        cryptonight_monero_tweak_gr((uint64_t*)&l3[idx3 & MASK], l3, idx3 & MASK, ax3, bx30, cx3);
+
+        idx0 = _mm_cvtsi128_si64(cx0); // next address = low 64 bits of c
+        idx1 = _mm_cvtsi128_si64(cx1);
+        idx2 = _mm_cvtsi128_si64(cx2);
+        idx3 = _mm_cvtsi128_si64(cx3);
+
+        uint64_t hi, lo, cl, ch; // scratch values reused by all four lanes below
+
+        cl = ((uint64_t*)&l0[idx0 & MASK])[0]; // lane 0: cl and ch are the two qwords of the entry at the new address
+        ch = ((uint64_t*)&l0[idx0 & MASK])[1];
+        lo = __umul128(idx0, cl, &hi); // 64x64 multiply; low half is returned, high half goes to hi
+        al0 += hi; // a += product, high half into al and low half into ah
+        ah0 += lo;
+        ((uint64_t*)&l0[idx0 & MASK])[0] = al0; // store a back to the entry
+        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; // the high qword is stored XOR-ed with tweak1_2_0
+        al0 ^= cl; // a ^= previous entry contents
+        ah0 ^= ch;
+        idx0 = al0; // address for the next round
+
+        cl = ((uint64_t*)&l1[idx1 & MASK])[0]; // lane 1: same sequence as lane 0
+        ch = ((uint64_t*)&l1[idx1 & MASK])[1];
+        lo = __umul128(idx1, cl, &hi);
+        al1 += hi;
+        ah1 += lo;
+        ((uint64_t*)&l1[idx1 & MASK])[0] = al1;
+        ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1;
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        cl = ((uint64_t*)&l2[idx2 & MASK])[0]; // lane 2: same sequence as lane 0
+        ch = ((uint64_t*)&l2[idx2 & MASK])[1];
+        lo = __umul128(idx2, cl, &hi);
+        al2 += hi;
+        ah2 += lo;
+        ((uint64_t*)&l2[idx2 & MASK])[0] = al2;
+        ((uint64_t*)&l2[idx2 & MASK])[1] = ah2 ^ tweak1_2_2;
+        al2 ^= cl;
+        ah2 ^= ch;
+        idx2 = al2;
+
+        cl = ((uint64_t*)&l3[idx3 & MASK])[0]; // lane 3: same sequence as lane 0
+        ch = ((uint64_t*)&l3[idx3 & MASK])[1];
+        lo = __umul128(idx3, cl, &hi);
+        al3 += hi;
+        ah3 += lo;
+        ((uint64_t*)&l3[idx3 & MASK])[0] = al3;
+        ((uint64_t*)&l3[idx3 & MASK])[1] = ah3 ^ tweak1_2_3;
+        al3 ^= cl;
+        ah3 ^= ch;
+        idx3 = al3;
+
+        bx00 = cx0; // this round c becomes b for the next round
+        bx10 = cx1;
+        bx20 = cx2;
+        bx30 = cx3;
+    }
+
+    cn_implode_scratchpad(ctx[0]); // defined outside this hunk; called once per lane after the main loop
+    cn_implode_scratchpad(ctx[1]);
+    cn_implode_scratchpad(ctx[2]);
+    cn_implode_scratchpad(ctx[3]);
+
+    keccakf(h0, 24); // keccakf over each state; 24 is presumably the round count - verify
+    keccakf(h1, 24);
+    keccakf(h2, 24);
+    keccakf(h3, 24);
+
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); // the final function is selected by the low two bits of state[0]; 200 is passed as the state length
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+    extra_hashes[ctx[2]->state[0] & 3](ctx[2]->state, 200, output + 64);
+    extra_hashes[ctx[3]->state[0] & 3](ctx[3]->state, 200, output + 96);
+}
+
+
 #define CN_STEP1(a, b0, b1, c, l, ptr, idx, conc_var) \
     ptr = reinterpret_cast<__m128i*>(&l[idx & MASK]); \
     c = _mm_load_si128(ptr); \
@@ -1492,6 +1674,14 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
 template
 inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
 {
+    const auto arch = Cpu::info()->arch(); // CPU architecture is queried at run time on every call
+    if ((arch >= ICpuInfo::ARCH_ZEN) && (arch <= ICpuInfo::ARCH_ZEN3)) { // only for architectures in the ARCH_ZEN..ARCH_ZEN3 range of the enum
+        if ((ALGO == Algorithm::CN_GR_0) || (ALGO == Algorithm::CN_GR_1) || (ALGO == Algorithm::CN_GR_2) || (ALGO == Algorithm::CN_GR_3) || (ALGO == Algorithm::CN_GR_4) || (ALGO == Algorithm::CN_GR_5)) { // and only for the six CN_GR_* algorithms
+            cryptonight_quad_hash_zen(input, size, output, ctx, height); // NOTE(review): the template arguments of this call are absent in this copy - confirm against upstream
+            return; // skip the generic quad-hash body that follows
+        }
+    }
+
     constexpr CnAlgo props;
     constexpr size_t MASK = props.mask();
     constexpr Algorithm::Id BASE = props.base();