From 764767d3172bf23017a15554e0b9f2d1ae612e00 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 17 Feb 2019 18:17:14 +0100 Subject: [PATCH] Support for Cryptonight variant 4 (Monero) --- src/Mem.cpp | 6 +- src/common/crypto/Algorithm.cpp | 2 + src/common/net/Job.cpp | 2 +- src/common/net/Pool.cpp | 1 + src/common/xmrig.h | 1 + src/crypto/CryptoNight.h | 11 +- src/crypto/CryptoNight_arm.h | 40 +- src/crypto/CryptoNight_constants.h | 6 + src/crypto/CryptoNight_monero.h | 20 +- src/crypto/CryptoNight_test.h | 17 +- src/crypto/CryptoNight_x86.h | 112 +++- src/crypto/CryptonightR_gen.cpp | 32 +- src/crypto/asm/CryptonightR_template.S | 25 +- src/crypto/asm/CryptonightR_template.asm | 25 +- src/crypto/asm/CryptonightR_template.h | 13 + src/crypto/asm/CryptonightR_template.inc | 117 +++-- src/crypto/asm/CryptonightR_template_win.inc | 117 +++-- src/crypto/asm/CryptonightWOW_template.inc | 486 ++++++++++++++++++ .../asm/CryptonightWOW_template_win.inc | 486 ++++++++++++++++++ src/crypto/asm/win64/CryptonightR_template.S | 24 +- .../asm/win64/CryptonightR_template.asm | 24 +- .../asm/win64/CryptonightR_template.inc | 117 +++-- .../asm/win64/CryptonightR_template_win.inc | 117 +++-- .../asm/win64/CryptonightWOW_template.inc | 486 ++++++++++++++++++ .../asm/win64/CryptonightWOW_template_win.inc | 486 ++++++++++++++++++ src/crypto/variant4_random_math.h | 68 ++- src/workers/CpuThread.cpp | 18 + src/workers/MultiWorker.cpp | 4 + 28 files changed, 2610 insertions(+), 253 deletions(-) create mode 100644 src/crypto/asm/CryptonightWOW_template.inc create mode 100644 src/crypto/asm/CryptonightWOW_template_win.inc create mode 100644 src/crypto/asm/win64/CryptonightWOW_template.inc create mode 100644 src/crypto/asm/win64/CryptonightWOW_template_win.inc diff --git a/src/Mem.cpp b/src/Mem.cpp index 1aa2f018..4fa794d6 100644 --- a/src/Mem.cpp +++ b/src/Mem.cpp @@ -54,8 +54,10 @@ MemInfo Mem::create(cryptonight_ctx **ctx, xmrig::Algo algorithm, size_t count) uint8_t* p = reinterpret_cast(allocateExecutableMemory(0x4000)); c->generated_code = reinterpret_cast(p); c->generated_code_double = reinterpret_cast(p + 0x2000); - c->generated_code_height = (uint64_t)(-1); - c->generated_code_double_height = (uint64_t)(-1); + + c->generated_code_data.variant = xmrig::VARIANT_MAX; + c->generated_code_data.height = (uint64_t)(-1); + c->generated_code_double_data = c->generated_code_data; ctx[i] = c; } diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index 0d360a40..8437555d 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -65,6 +65,7 @@ static AlgoData const algorithms[] = { { "cryptonight/half", "cn/half", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF }, { "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF }, { "cryptonight/wow", "cn/wow", xmrig::CRYPTONIGHT, xmrig::VARIANT_WOW }, + { "cryptonight/4", "cn/4", xmrig::CRYPTONIGHT, xmrig::VARIANT_4 }, # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, @@ -130,6 +131,7 @@ static const char *variants[] = { "trtl", "gpu", "wow", + "4", }; diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index ee76f732..2f6d9f8e 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -246,7 +246,7 @@ xmrig::Variant Job::variant() const switch (m_algorithm.algo()) { case CRYPTONIGHT: - return (m_blob[0] >= 8) ? VARIANT_2 : VARIANT_1; + return (m_blob[0] >= 10) ? VARIANT_4 : ((m_blob[0] >= 8) ? VARIANT_2 : VARIANT_1); case CRYPTONIGHT_LITE: return VARIANT_1; diff --git a/src/common/net/Pool.cpp b/src/common/net/Pool.cpp index 585e4596..0df7b845 100644 --- a/src/common/net/Pool.cpp +++ b/src/common/net/Pool.cpp @@ -412,6 +412,7 @@ void Pool::rebuild() m_algorithms.push_back(m_algorithm); # ifndef XMRIG_PROXY_PROJECT + addVariant(xmrig::VARIANT_4); addVariant(xmrig::VARIANT_WOW); addVariant(xmrig::VARIANT_2); addVariant(xmrig::VARIANT_1); diff --git a/src/common/xmrig.h b/src/common/xmrig.h index c861d11c..c6a5f568 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -75,6 +75,7 @@ enum Variant { VARIANT_TRTL = 10, // CryptoNight Turtle (TRTL) VARIANT_GPU = 11, // CryptoNight-GPU (Ryo) VARIANT_WOW = 12, // CryptoNightR (Wownero) + VARIANT_4 = 13, // CryptoNightR (Monero's variant 4) VARIANT_MAX }; diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 8a48eafd..b92945e4 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -39,13 +39,20 @@ struct cryptonight_ctx; typedef void(*cn_mainloop_fun_ms_abi)(cryptonight_ctx*) ABI_ATTRIBUTE; typedef void(*cn_mainloop_double_fun_ms_abi)(cryptonight_ctx*, cryptonight_ctx*) ABI_ATTRIBUTE; +struct cryptonight_r_data { + int variant; + uint64_t height; + + bool match(const int v, const uint64_t h) const { return (v == variant) && (h == height); } +}; + struct cryptonight_ctx { alignas(16) uint8_t state[224]; alignas(16) uint8_t *memory; cn_mainloop_fun_ms_abi generated_code; cn_mainloop_double_fun_ms_abi generated_code_double; - uint64_t generated_code_height; - uint64_t generated_code_double_height; + cryptonight_r_data generated_code_data; + cryptonight_r_data generated_code_double_data; }; diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index f04c27f1..e7232eb1 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -431,12 +431,12 @@ static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key) template -static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx) +static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx) { uint64_t* mem_out = (uint64_t*)&l[idx]; if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1); + VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { __m128i tmp = _mm_xor_si128(bx0, cx); @@ -515,8 +515,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l0[idx0 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + if (VARIANT == xmrig::VARIANT_4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(0, cl, cx); } @@ -525,7 +529,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx0, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx); + } else { + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + } } al0 += hi; @@ -686,8 +694,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l0[idx0 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + if (VARIANT == xmrig::VARIANT_4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(0, cl, cx0); } @@ -696,7 +708,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx0, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0); + } else { + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + } } al0 += hi; @@ -736,8 +752,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l1[idx1 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + if (VARIANT == xmrig::VARIANT_4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(1, cl, cx1); } @@ -746,7 +766,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx1, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1); + } else { + VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + } } al1 += hi; diff --git a/src/crypto/CryptoNight_constants.h b/src/crypto/CryptoNight_constants.h index 2bb24de7..4ea1adb3 100644 --- a/src/crypto/CryptoNight_constants.h +++ b/src/crypto/CryptoNight_constants.h @@ -127,6 +127,7 @@ template<> inline constexpr uint32_t cn_select_iter() template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HALF_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HALF_ITER; } @@ -197,8 +198,13 @@ template<> inline constexpr Variant cn_base_variant() { return VA template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_GPU; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } +template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } +template inline constexpr bool cn_is_cryptonight_r() { return false; } +template<> inline constexpr bool cn_is_cryptonight_r() { return true; } +template<> inline constexpr bool cn_is_cryptonight_r() { return true; } + } /* namespace xmrig */ diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h index 9c26ae5f..26c1fff0 100644 --- a/src/crypto/CryptoNight_monero.h +++ b/src/crypto/CryptoNight_monero.h @@ -83,7 +83,7 @@ sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \ } while (0) -# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ do { \ const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ @@ -91,6 +91,9 @@ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + if (VARIANT == xmrig::VARIANT_4) { \ + _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \ + } \ } while (0) # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ @@ -125,7 +128,7 @@ sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ } while (0) -# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ do { \ const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \ const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ @@ -133,6 +136,9 @@ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ + if (VARIANT == xmrig::VARIANT_4) { \ + _c = veorq_u64(veorq_u64(_c, chunk3), veorq_u64(chunk1, chunk2)); \ + } \ } while (0) # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ @@ -152,26 +158,28 @@ #define SWAP64LE(x) x #define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length)) +#include "common/xmrig.h" #include "variant4_random_math.h" #define VARIANT4_RANDOM_MATH_INIT(part) \ - uint32_t r##part[8]; \ + uint32_t r##part[9]; \ struct V4_Instruction code##part[256]; \ - if (VARIANT == xmrig::VARIANT_WOW) { \ + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \ r##part[0] = (uint32_t)(h##part[12]); \ r##part[1] = (uint32_t)(h##part[12] >> 32); \ r##part[2] = (uint32_t)(h##part[13]); \ r##part[3] = (uint32_t)(h##part[13] >> 32); \ } \ - v4_random_math_init(code##part, height); + v4_random_math_init(code##part, height); #define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \ - if (VARIANT == xmrig::VARIANT_WOW) { \ + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \ cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \ r##part[4] = static_cast(al); \ r##part[5] = static_cast(ah); \ r##part[6] = static_cast(_mm_cvtsi128_si32(bx0)); \ r##part[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ + r##part[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ v4_random_math(code##part, r##part); \ } diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index d3da28c1..237fe31b 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -58,8 +58,7 @@ const static uint8_t test_input[380] = { 0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04 }; -const static char* test_input_WOW = R"===( -9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 +const static char* test_input_WOW = R"===(9d47bf4c41b7e8e727e681715acb47fa1677cdba9ca7bcb05ad8cc8abd5daa66 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 0d4a495cb844a3ca8ba4edb8e6bcf829ef1c06d9cdea2b62ca46c2a21b8b0a79 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 a1d6d848b5c5915fccd2f64cf216c6b1a02cf7c77bc80d8d4e51b419e88ff0dd 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 @@ -68,8 +67,18 @@ af3a8544a0221a148c2ac90484b19861e3afca33fe17021efb8ad6496b567915 657420646f6c6f7 2b13000535f3db5f9b9b84a65c4351f386cd2cdedebb8c3ad2eab086e6a3fee5 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 fc0e1dad8e895749dc90eb690bc1ba059a1cd772afaaf65a106bf9e5e6b80503 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 b60b0afe144deff7d903ed2d5545e77ebe66a3c51fee7016eeb8fee9eb630c0f 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 -64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269 -)==="; +64774b27e7d5fec862fc4c0c13ac6bf09123b6f05bb0e4b75c97f379a2b3a679 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269)==="; + +const static char* test_input_R = R"===(f759588ad57e758467295443a9bd71490abff8e9dad1b95b6bf2f5d0d78387bc 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 +5bb833deca2bdd7252a9ccd7b4ce0b6a4854515794b56c207262f7a5b9bdb566 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 +1ee6728da60fbd8d7d55b2b1ade487a3cf52a2c3ac6f520db12c27d8921f6cab 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 +6969fe2ddfb758438d48049f302fc2108a4fcc93e37669170e6db4b0b9b4c4cb 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 +7f3048b4e90d0cbe7a57c0394f37338a01fae3adfdc0e5126d863a895eb04e02 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264 +1d290443a4b542af04a82f6b2494a6ee7f20f2754c58e0849032483a56e8e2ef 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265 +c43cc6567436a86afbd6aa9eaa7c276e9806830334b614b2bee23cc76634f6fd 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 +87be2479c0c4e8edfdfaa5603e93f4265b3f8224c1c5946feb424819d18990a4 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 +dd9d6a6d8e47465cceac0877ef889b93e7eba979557e3935d7f86dce11b070f3 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 +75c6f2ae49a20521de97285b431e717125847fb8935ed84a61e7f8d36a2c3d8e 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269)==="; // "cn/0" const static uint8_t test_output_v0[160] = { diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index b6969571..4c5d4ac0 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -457,10 +457,10 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) template -static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx) +static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx) { if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1); + VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { __m128i tmp = _mm_xor_si128(bx0, cx); @@ -543,8 +543,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l0[idx0 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); + if (VARIANT == xmrig::VARIANT_4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(0, cl, cx); } @@ -553,7 +557,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx0, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx); + } else { + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + } } al0 += hi; @@ -658,21 +666,46 @@ extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_ryzen_asm; extern xmrig::CpuThread::cn_mainloop_fun cn_trtl_mainloop_bulldozer_asm; extern xmrig::CpuThread::cn_mainloop_double_fun cn_trtl_double_mainloop_sandybridge_asm; +void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); -void v4_64_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); +void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); -void v4_64_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM); + +template +void cn_r_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + v4_compile_code(code, code_size, machine_code, ASM); +} + +template +void cn_r_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + v4_compile_code_double(code, code_size, machine_code, ASM); +} + +template<> +void cn_r_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + wow_compile_code(code, code_size, machine_code, ASM); +} + +template<> +void cn_r_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + wow_compile_code_double(code, code_size, machine_code, ASM); +} template inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { constexpr size_t MEM = xmrig::cn_select_memory(); - if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_height)) { + if (xmrig::cn_is_cryptonight_r() && !ctx[0]->generated_code_data.match(VARIANT, height)) { V4_Instruction code[256]; - const int code_size = v4_random_math_init(code, height); - v4_compile_code(code, code_size, reinterpret_cast(ctx[0]->generated_code), ASM); - ctx[0]->generated_code_height = height; + const int code_size = v4_random_math_init(code, height); + cn_r_compile_code(code, code_size, reinterpret_cast(ctx[0]->generated_code), ASM); + ctx[0]->generated_code_data.variant = VARIANT; + ctx[0]->generated_code_data.height = height; } xmrig::keccak(input, size, ctx[0]->state); @@ -711,7 +744,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ cn_trtl_mainloop_bulldozer_asm(ctx[0]); } } - else if (VARIANT == xmrig::VARIANT_WOW) { + else if (xmrig::cn_is_cryptonight_r()) { ctx[0]->generated_code(ctx[0]); } @@ -726,11 +759,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ { constexpr size_t MEM = xmrig::cn_select_memory(); - if ((VARIANT == xmrig::VARIANT_WOW) && (height != ctx[0]->generated_code_double_height)) { + if (xmrig::cn_is_cryptonight_r() && !ctx[0]->generated_code_double_data.match(VARIANT, height)) { V4_Instruction code[256]; - const int code_size = v4_random_math_init(code, height); - v4_compile_code_double(code, code_size, reinterpret_cast(ctx[0]->generated_code_double), ASM); - ctx[0]->generated_code_double_height = height; + const int code_size = v4_random_math_init(code, height); + cn_r_compile_code_double(code, code_size, reinterpret_cast(ctx[0]->generated_code_double), ASM); + ctx[0]->generated_code_double_data.variant = VARIANT; + ctx[0]->generated_code_double_data.height = height; } xmrig::keccak(input, size, ctx[0]->state); @@ -748,7 +782,7 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ else if (VARIANT == xmrig::VARIANT_TRTL) { cn_trtl_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); } - else if (VARIANT == xmrig::VARIANT_WOW) { + else if (xmrig::cn_is_cryptonight_r()) { ctx[0]->generated_code_double(ctx[0], ctx[1]); } @@ -847,8 +881,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l0[idx0 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); + if (VARIANT == xmrig::VARIANT_4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(0, cl, cx0); } @@ -857,7 +895,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx0, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0); + } else { + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + } } al0 += hi; @@ -895,8 +937,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ch = ((uint64_t*) &l1[idx1 & MASK])[1]; if (BASE == xmrig::VARIANT_2) { - if (VARIANT == xmrig::VARIANT_WOW) { + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); + if (VARIANT == xmrig::VARIANT_4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } } else { VARIANT2_INTEGER_MATH(1, cl, cx1); } @@ -905,7 +951,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx1, cl, &hi); if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + if (VARIANT == xmrig::VARIANT_4) { + VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1); + } else { + VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + } } al1 += hi; @@ -989,18 +1039,30 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si #define CN_STEP4(part, a, b0, b1, c, l, mc, ptr, idx) \ + uint64_t al##part, ah##part; \ if (BASE == xmrig::VARIANT_2) { \ - if (VARIANT == xmrig::VARIANT_WOW) { \ - const uint64_t al = _mm_cvtsi128_si64(a); \ - const uint64_t ah = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)); \ - VARIANT4_RANDOM_MATH(part, al, ah, cl##part, b0, b1); \ + if ((VARIANT == xmrig::VARIANT_WOW) || (VARIANT == xmrig::VARIANT_4)) { \ + al##part = _mm_cvtsi128_si64(a); \ + ah##part = _mm_cvtsi128_si64(_mm_srli_si128(a, 8)); \ + VARIANT4_RANDOM_MATH(part, al##part, ah##part, cl##part, b0, b1); \ + if (VARIANT == xmrig::VARIANT_4) { \ + al##part ^= r##part[2] | ((uint64_t)(r##part[3]) << 32); \ + ah##part ^= r##part[0] | ((uint64_t)(r##part[1]) << 32); \ + } \ } else { \ VARIANT2_INTEGER_MATH(part, cl##part, c); \ } \ } \ lo = __umul128(idx, cl##part, &hi); \ if (BASE == xmrig::VARIANT_2) { \ - VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \ + if (VARIANT == xmrig::VARIANT_4) { \ + VARIANT2_SHUFFLE(l, idx & MASK, a, b0, b1, c); \ + } else { \ + VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \ + } \ + } \ + if (VARIANT == xmrig::VARIANT_4) { \ + a = _mm_set_epi64x(ah##part, al##part); \ } \ a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ \ diff --git a/src/crypto/CryptonightR_gen.cpp b/src/crypto/CryptonightR_gen.cpp index 38225955..55f94662 100644 --- a/src/crypto/CryptonightR_gen.cpp +++ b/src/crypto/CryptonightR_gen.cpp @@ -58,7 +58,7 @@ static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int const uint32_t a = inst.dst_index; const uint32_t b = inst.src_index; - const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (src_index << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); + const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); switch (inst.opcode) { case ROR: @@ -99,6 +99,20 @@ static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int } } +void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + uint8_t* p0 = reinterpret_cast(machine_code); + uint8_t* p = p0; + + add_code(p, CryptonightWOW_template_part1, CryptonightWOW_template_part2); + add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(p, CryptonightWOW_template_part2, CryptonightWOW_template_part3); + *(int*)(p - 4) = static_cast((((const uint8_t*)CryptonightWOW_template_mainloop) - ((const uint8_t*)CryptonightWOW_template_part1)) - (p - p0)); + add_code(p, CryptonightWOW_template_part3, CryptonightWOW_template_end); + + Mem::flushInstructionCache(machine_code, p - p0); +} + void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) { uint8_t* p0 = reinterpret_cast(machine_code); @@ -113,6 +127,22 @@ void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_co Mem::flushInstructionCache(machine_code, p - p0); } +void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) +{ + uint8_t* p0 = reinterpret_cast(machine_code); + uint8_t* p = p0; + + add_code(p, CryptonightWOW_template_double_part1, CryptonightWOW_template_double_part2); + add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(p, CryptonightWOW_template_double_part2, CryptonightWOW_template_double_part3); + add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM); + add_code(p, CryptonightWOW_template_double_part3, CryptonightWOW_template_double_part4); + *(int*)(p - 4) = static_cast((((const uint8_t*)CryptonightWOW_template_double_mainloop) - ((const uint8_t*)CryptonightWOW_template_double_part1)) - (p - p0)); + add_code(p, CryptonightWOW_template_double_part4, CryptonightWOW_template_double_end); + + Mem::flushInstructionCache(machine_code, p - p0); +} + void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, xmrig::Assembly ASM) { uint8_t* p0 = reinterpret_cast(machine_code); diff --git a/src/crypto/asm/CryptonightR_template.S b/src/crypto/asm/CryptonightR_template.S index e8478beb..5f3046cb 100644 --- a/src/crypto/asm/CryptonightR_template.S +++ b/src/crypto/asm/CryptonightR_template.S @@ -529,6 +529,7 @@ PUBLIC FN_PREFIX(CryptonightR_instruction_mov254) PUBLIC FN_PREFIX(CryptonightR_instruction_mov255) PUBLIC FN_PREFIX(CryptonightR_instruction_mov256) +#include "CryptonightWOW_template.inc" #include "CryptonightR_template.inc" FN_PREFIX(CryptonightR_instruction0): @@ -538,16 +539,16 @@ FN_PREFIX(CryptonightR_instruction1): FN_PREFIX(CryptonightR_instruction2): imul rbx, rbx FN_PREFIX(CryptonightR_instruction3): - add rbx, rbx + add rbx, r9 add rbx, 2147483647 FN_PREFIX(CryptonightR_instruction4): - sub rbx, rbx + sub rbx, r9 FN_PREFIX(CryptonightR_instruction5): ror ebx, cl FN_PREFIX(CryptonightR_instruction6): rol ebx, cl FN_PREFIX(CryptonightR_instruction7): - xor rbx, rbx + xor rbx, r9 FN_PREFIX(CryptonightR_instruction8): imul rsi, rbx FN_PREFIX(CryptonightR_instruction9): @@ -623,16 +624,16 @@ FN_PREFIX(CryptonightR_instruction41): FN_PREFIX(CryptonightR_instruction42): imul rsi, rsi FN_PREFIX(CryptonightR_instruction43): - add rsi, rsi + add rsi, r9 add rsi, 2147483647 FN_PREFIX(CryptonightR_instruction44): - sub rsi, rsi + sub rsi, r9 FN_PREFIX(CryptonightR_instruction45): ror esi, cl FN_PREFIX(CryptonightR_instruction46): rol esi, cl FN_PREFIX(CryptonightR_instruction47): - xor rsi, rsi + xor rsi, r9 FN_PREFIX(CryptonightR_instruction48): imul rdi, rsi FN_PREFIX(CryptonightR_instruction49): @@ -708,16 +709,16 @@ FN_PREFIX(CryptonightR_instruction81): FN_PREFIX(CryptonightR_instruction82): imul rdi, rdi FN_PREFIX(CryptonightR_instruction83): - add rdi, rdi + add rdi, r9 add rdi, 2147483647 FN_PREFIX(CryptonightR_instruction84): - sub rdi, rdi + sub rdi, r9 FN_PREFIX(CryptonightR_instruction85): ror edi, cl FN_PREFIX(CryptonightR_instruction86): rol edi, cl FN_PREFIX(CryptonightR_instruction87): - xor rdi, rdi + xor rdi, r9 FN_PREFIX(CryptonightR_instruction88): imul rbp, rdi FN_PREFIX(CryptonightR_instruction89): @@ -793,16 +794,16 @@ FN_PREFIX(CryptonightR_instruction121): FN_PREFIX(CryptonightR_instruction122): imul rbp, rbp FN_PREFIX(CryptonightR_instruction123): - add rbp, rbp + add rbp, r9 add rbp, 2147483647 FN_PREFIX(CryptonightR_instruction124): - sub rbp, rbp + sub rbp, r9 FN_PREFIX(CryptonightR_instruction125): ror ebp, cl FN_PREFIX(CryptonightR_instruction126): rol ebp, cl FN_PREFIX(CryptonightR_instruction127): - xor rbp, rbp + xor rbp, r9 FN_PREFIX(CryptonightR_instruction128): imul rbx, rsp FN_PREFIX(CryptonightR_instruction129): diff --git a/src/crypto/asm/CryptonightR_template.asm b/src/crypto/asm/CryptonightR_template.asm index ec8ad5af..25b72c3c 100644 --- a/src/crypto/asm/CryptonightR_template.asm +++ b/src/crypto/asm/CryptonightR_template.asm @@ -516,6 +516,7 @@ PUBLIC CryptonightR_instruction_mov254 PUBLIC CryptonightR_instruction_mov255 PUBLIC CryptonightR_instruction_mov256 +INCLUDE CryptonightWOW_template_win.inc INCLUDE CryptonightR_template_win.inc CryptonightR_instruction0: @@ -525,16 +526,16 @@ CryptonightR_instruction1: CryptonightR_instruction2: imul rbx, rbx CryptonightR_instruction3: - add rbx, rbx + add rbx, r9 add rbx, 2147483647 CryptonightR_instruction4: - sub rbx, rbx + sub rbx, r9 CryptonightR_instruction5: ror ebx, cl CryptonightR_instruction6: rol ebx, cl CryptonightR_instruction7: - xor rbx, rbx + xor rbx, r9 CryptonightR_instruction8: imul rsi, rbx CryptonightR_instruction9: @@ -610,16 +611,16 @@ CryptonightR_instruction41: CryptonightR_instruction42: imul rsi, rsi CryptonightR_instruction43: - add rsi, rsi + add rsi, r9 add rsi, 2147483647 CryptonightR_instruction44: - sub rsi, rsi + sub rsi, r9 CryptonightR_instruction45: ror esi, cl CryptonightR_instruction46: rol esi, cl CryptonightR_instruction47: - xor rsi, rsi + xor rsi, r9 CryptonightR_instruction48: imul rdi, rsi CryptonightR_instruction49: @@ -695,16 +696,16 @@ CryptonightR_instruction81: CryptonightR_instruction82: imul rdi, rdi CryptonightR_instruction83: - add rdi, rdi + add rdi, r9 add rdi, 2147483647 CryptonightR_instruction84: - sub rdi, rdi + sub rdi, r9 CryptonightR_instruction85: ror edi, cl CryptonightR_instruction86: rol edi, cl CryptonightR_instruction87: - xor rdi, rdi + xor rdi, r9 CryptonightR_instruction88: imul rbp, rdi CryptonightR_instruction89: @@ -780,16 +781,16 @@ CryptonightR_instruction121: CryptonightR_instruction122: imul rbp, rbp CryptonightR_instruction123: - add rbp, rbp + add rbp, r9 add rbp, 2147483647 CryptonightR_instruction124: - sub rbp, rbp + sub rbp, r9 CryptonightR_instruction125: ror ebp, cl CryptonightR_instruction126: rol ebp, cl CryptonightR_instruction127: - xor rbp, rbp + xor rbp, r9 CryptonightR_instruction128: imul rbx, rsp CryptonightR_instruction129: diff --git a/src/crypto/asm/CryptonightR_template.h b/src/crypto/asm/CryptonightR_template.h index 182c6870..c2054705 100644 --- a/src/crypto/asm/CryptonightR_template.h +++ b/src/crypto/asm/CryptonightR_template.h @@ -2,6 +2,18 @@ extern "C" { + void CryptonightWOW_template_part1(); + void CryptonightWOW_template_mainloop(); + void CryptonightWOW_template_part2(); + void CryptonightWOW_template_part3(); + void CryptonightWOW_template_end(); + void CryptonightWOW_template_double_part1(); + void CryptonightWOW_template_double_mainloop(); + void CryptonightWOW_template_double_part2(); + void CryptonightWOW_template_double_part3(); + void CryptonightWOW_template_double_part4(); + void CryptonightWOW_template_double_end(); + void CryptonightR_template_part1(); void CryptonightR_template_mainloop(); void CryptonightR_template_part2(); @@ -13,6 +25,7 @@ extern "C" void CryptonightR_template_double_part3(); void CryptonightR_template_double_part4(); void CryptonightR_template_double_end(); + void CryptonightR_instruction0(); void CryptonightR_instruction1(); void CryptonightR_instruction2(); diff --git a/src/crypto/asm/CryptonightR_template.inc b/src/crypto/asm/CryptonightR_template.inc index 468eb87b..b54486a5 100644 --- a/src/crypto/asm/CryptonightR_template.inc +++ b/src/crypto/asm/CryptonightR_template.inc @@ -10,6 +10,7 @@ PUBLIC FN_PREFIX(CryptonightR_template_double_part3) PUBLIC FN_PREFIX(CryptonightR_template_double_part4) PUBLIC FN_PREFIX(CryptonightR_template_double_end) +ALIGN(64) FN_PREFIX(CryptonightR_template_part1): mov QWORD PTR [rsp+16], rbx mov QWORD PTR [rsp+24], rbp @@ -68,8 +69,6 @@ FN_PREFIX(CryptonightR_template_mainloop): lea rdx, QWORD PTR [r9+r11] aesenc xmm5, xmm4 - movd r10d, xmm5 - and r10d, 2097136 mov r12d, r9d mov eax, r9d @@ -77,16 +76,23 @@ FN_PREFIX(CryptonightR_template_mainloop): xor r12d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm1, XMMWORD PTR [rax+r11] - paddq xmm0, xmm7 + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 paddq xmm2, xmm6 paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm0 - movq r12, xmm5 + movdqu XMMWORD PTR [r12+r11], xmm3 movdqu XMMWORD PTR [rax+r11], xmm2 movdqu XMMWORD PTR [r9+r11], xmm1 + movq r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -101,13 +107,23 @@ FN_PREFIX(CryptonightR_template_mainloop): movd eax, xmm6 movd edx, xmm7 + pextrd r9d, xmm7, 2 FN_PREFIX(CryptonightR_template_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + mov rax, r13 mul r12 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 mov r9d, r10d mov r12d, r10d @@ -115,16 +131,18 @@ FN_PREFIX(CryptonightR_template_part2): xor r12d, 32 xor r10d, 48 movdqa xmm1, XMMWORD PTR [r12+r11] - xor rdx, QWORD PTR [r12+r11] - xor rax, QWORD PTR [r11+r12+8] + movaps xmm3, xmm1 movdqa xmm2, XMMWORD PTR [r9+r11] - pxor xmm3, xmm2 - paddq xmm7, XMMWORD PTR [r10+r11] - paddq xmm1, xmm4 - paddq xmm3, xmm6 - movdqu XMMWORD PTR [r9+r11], xmm7 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [r10+r11], xmm1 + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 add r15, rax @@ -247,18 +265,21 @@ FN_PREFIX(CryptonightR_template_double_mainloop): punpcklqdq xmm3, xmm0 xor ebx, 16 aesenc xmm6, xmm3 - movq rdx, xmm6 movq xmm4, r15 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 xor ebx, 48 paddq xmm0, xmm7 movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 movdqu XMMWORD PTR [rbx+rsi], xmm0 paddq xmm1, xmm3 xor ebx, 16 mov eax, ebx xor rax, 32 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movq rdx, xmm6 movdqu XMMWORD PTR [rbx+rsi], xmm1 paddq xmm0, xmm9 movdqu XMMWORD PTR [rax+rsi], xmm0 @@ -274,15 +295,18 @@ FN_PREFIX(CryptonightR_template_double_mainloop): xor r8d, 16 aesenc xmm5, xmm4 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 xor r8d, 48 paddq xmm0, xmm8 movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 movdqu XMMWORD PTR [r8+rdi], xmm0 paddq xmm1, xmm4 xor r8d, 16 mov eax, r8d xor rax, 32 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 movdqu XMMWORD PTR [r8+rdi], xmm1 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rdi], xmm0 @@ -303,7 +327,8 @@ FN_PREFIX(CryptonightR_template_double_mainloop): movq xmm11, rbp movq xmm12, r15 movq xmm13, rdx - mov [rsp+112], rcx + mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp+16] mov esi, DWORD PTR [rsp+20] @@ -320,9 +345,22 @@ FN_PREFIX(CryptonightR_template_double_mainloop): pextrd r15d, xmm3, 2 movd eax, xmm7 movd edx, xmm9 + pextrd r9d, xmm9, 2 FN_PREFIX(CryptonightR_template_double_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + movq rsp, xmm0 mov DWORD PTR [rsp+16], ebx mov DWORD PTR [rsp+20], esi @@ -334,28 +372,27 @@ FN_PREFIX(CryptonightR_template_double_part2): movq rbp, xmm11 movq r15, xmm12 movq rdx, xmm13 - mov rcx, [rsp+112] + mov rcx, [rsp+104] + mov r9, [rsp+112] mov rbx, r8 mov rax, r8 mul rdx and ebp, 2097136 mov r8, rax - movq xmm1, rdx - movq xmm0, r8 - punpcklqdq xmm1, xmm0 - pxor xmm1, XMMWORD PTR [rcx+rsi] + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 xor esi, 48 paddq xmm1, xmm7 movdqu xmm2, XMMWORD PTR [rsi+rcx] - xor rdx, QWORD PTR [rsi+rcx] + pxor xmm6, xmm2 paddq xmm2, xmm3 - xor r8, QWORD PTR [rsi+rcx+8] movdqu XMMWORD PTR [rsi+rcx], xmm1 xor esi, 16 mov eax, esi mov rsi, rcx movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 movdqu XMMWORD PTR [rax+rcx], xmm2 paddq xmm0, xmm9 add r12, r8 @@ -383,6 +420,7 @@ FN_PREFIX(CryptonightR_template_double_part2): movq xmm12, rbp movq xmm13, r15 mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp] mov esi, DWORD PTR [rsp+4] @@ -401,9 +439,24 @@ FN_PREFIX(CryptonightR_template_double_part2): pextrd r15d, xmm4, 2 movd eax, xmm8 movd edx, xmm10 + pextrd r9d, xmm10, 2 FN_PREFIX(CryptonightR_template_double_part3): + movq r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + movq rsp, xmm0 mov DWORD PTR [rsp], ebx mov DWORD PTR [rsp+4], esi @@ -414,23 +467,20 @@ FN_PREFIX(CryptonightR_template_double_part3): movq rsi, xmm2 movq rdi, xmm11 movq rbp, xmm12 - movq r15, xmm13 mov rcx, [rsp+104] + mov r9, [rsp+112] mov rax, r8 mul rdi - movq xmm1, rdx - movq xmm0, rax - punpcklqdq xmm1, xmm0 mov rdi, rcx mov r8, rax - pxor xmm1, XMMWORD PTR [rbp+rcx] + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 xor ebp, 48 paddq xmm1, xmm8 - xor r8, QWORD PTR [rbp+rcx+8] - xor rdx, QWORD PTR [rbp+rcx] add r13, r8 movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 add r15, rdx movdqu XMMWORD PTR [rbp+rcx], xmm1 paddq xmm2, xmm4 @@ -438,6 +488,7 @@ FN_PREFIX(CryptonightR_template_double_part3): mov eax, ebp xor rax, 32 movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 movdqu XMMWORD PTR [rbp+rcx], xmm2 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rcx], xmm0 diff --git a/src/crypto/asm/CryptonightR_template_win.inc b/src/crypto/asm/CryptonightR_template_win.inc index efa1f3f5..150bb0e3 100644 --- a/src/crypto/asm/CryptonightR_template_win.inc +++ b/src/crypto/asm/CryptonightR_template_win.inc @@ -10,6 +10,7 @@ PUBLIC CryptonightR_template_double_part3 PUBLIC CryptonightR_template_double_part4 PUBLIC CryptonightR_template_double_end +ALIGN(64) CryptonightR_template_part1: mov QWORD PTR [rsp+16], rbx mov QWORD PTR [rsp+24], rbp @@ -68,8 +69,6 @@ CryptonightR_template_mainloop: lea rdx, QWORD PTR [r9+r11] aesenc xmm5, xmm4 - movd r10d, xmm5 - and r10d, 2097136 mov r12d, r9d mov eax, r9d @@ -77,16 +76,23 @@ CryptonightR_template_mainloop: xor r12d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm1, XMMWORD PTR [rax+r11] - paddq xmm0, xmm7 + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 paddq xmm2, xmm6 paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm0 - movq r12, xmm5 + movdqu XMMWORD PTR [r12+r11], xmm3 movdqu XMMWORD PTR [rax+r11], xmm2 movdqu XMMWORD PTR [r9+r11], xmm1 + movq r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -101,13 +107,23 @@ CryptonightR_template_mainloop: movd eax, xmm6 movd edx, xmm7 + pextrd r9d, xmm7, 2 CryptonightR_template_part2: + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + mov rax, r13 mul r12 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 mov r9d, r10d mov r12d, r10d @@ -115,16 +131,18 @@ CryptonightR_template_part2: xor r12d, 32 xor r10d, 48 movdqa xmm1, XMMWORD PTR [r12+r11] - xor rdx, QWORD PTR [r12+r11] - xor rax, QWORD PTR [r11+r12+8] + movaps xmm3, xmm1 movdqa xmm2, XMMWORD PTR [r9+r11] - pxor xmm3, xmm2 - paddq xmm7, XMMWORD PTR [r10+r11] - paddq xmm1, xmm4 - paddq xmm3, xmm6 - movdqu XMMWORD PTR [r9+r11], xmm7 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [r10+r11], xmm1 + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 add r15, rax @@ -247,18 +265,21 @@ CryptonightR_template_double_mainloop: punpcklqdq xmm3, xmm0 xor ebx, 16 aesenc xmm6, xmm3 - movq rdx, xmm6 movq xmm4, r15 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 xor ebx, 48 paddq xmm0, xmm7 movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 movdqu XMMWORD PTR [rbx+rsi], xmm0 paddq xmm1, xmm3 xor ebx, 16 mov eax, ebx xor rax, 32 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movq rdx, xmm6 movdqu XMMWORD PTR [rbx+rsi], xmm1 paddq xmm0, xmm9 movdqu XMMWORD PTR [rax+rsi], xmm0 @@ -274,15 +295,18 @@ CryptonightR_template_double_mainloop: xor r8d, 16 aesenc xmm5, xmm4 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 xor r8d, 48 paddq xmm0, xmm8 movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 movdqu XMMWORD PTR [r8+rdi], xmm0 paddq xmm1, xmm4 xor r8d, 16 mov eax, r8d xor rax, 32 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 movdqu XMMWORD PTR [r8+rdi], xmm1 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rdi], xmm0 @@ -303,7 +327,8 @@ CryptonightR_template_double_mainloop: movq xmm11, rbp movq xmm12, r15 movq xmm13, rdx - mov [rsp+112], rcx + mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp+16] mov esi, DWORD PTR [rsp+20] @@ -320,9 +345,22 @@ CryptonightR_template_double_mainloop: pextrd r15d, xmm3, 2 movd eax, xmm7 movd edx, xmm9 + pextrd r9d, xmm9, 2 CryptonightR_template_double_part2: + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + movq rsp, xmm0 mov DWORD PTR [rsp+16], ebx mov DWORD PTR [rsp+20], esi @@ -334,28 +372,27 @@ CryptonightR_template_double_part2: movq rbp, xmm11 movq r15, xmm12 movq rdx, xmm13 - mov rcx, [rsp+112] + mov rcx, [rsp+104] + mov r9, [rsp+112] mov rbx, r8 mov rax, r8 mul rdx and ebp, 2097136 mov r8, rax - movq xmm1, rdx - movq xmm0, r8 - punpcklqdq xmm1, xmm0 - pxor xmm1, XMMWORD PTR [rcx+rsi] + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 xor esi, 48 paddq xmm1, xmm7 movdqu xmm2, XMMWORD PTR [rsi+rcx] - xor rdx, QWORD PTR [rsi+rcx] + pxor xmm6, xmm2 paddq xmm2, xmm3 - xor r8, QWORD PTR [rsi+rcx+8] movdqu XMMWORD PTR [rsi+rcx], xmm1 xor esi, 16 mov eax, esi mov rsi, rcx movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 movdqu XMMWORD PTR [rax+rcx], xmm2 paddq xmm0, xmm9 add r12, r8 @@ -383,6 +420,7 @@ CryptonightR_template_double_part2: movq xmm12, rbp movq xmm13, r15 mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp] mov esi, DWORD PTR [rsp+4] @@ -401,9 +439,24 @@ CryptonightR_template_double_part2: pextrd r15d, xmm4, 2 movd eax, xmm8 movd edx, xmm10 + pextrd r9d, xmm10, 2 CryptonightR_template_double_part3: + movq r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + movq rsp, xmm0 mov DWORD PTR [rsp], ebx mov DWORD PTR [rsp+4], esi @@ -414,23 +467,20 @@ CryptonightR_template_double_part3: movq rsi, xmm2 movq rdi, xmm11 movq rbp, xmm12 - movq r15, xmm13 mov rcx, [rsp+104] + mov r9, [rsp+112] mov rax, r8 mul rdi - movq xmm1, rdx - movq xmm0, rax - punpcklqdq xmm1, xmm0 mov rdi, rcx mov r8, rax - pxor xmm1, XMMWORD PTR [rbp+rcx] + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 xor ebp, 48 paddq xmm1, xmm8 - xor r8, QWORD PTR [rbp+rcx+8] - xor rdx, QWORD PTR [rbp+rcx] add r13, r8 movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 add r15, rdx movdqu XMMWORD PTR [rbp+rcx], xmm1 paddq xmm2, xmm4 @@ -438,6 +488,7 @@ CryptonightR_template_double_part3: mov eax, ebp xor rax, 32 movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 movdqu XMMWORD PTR [rbp+rcx], xmm2 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rcx], xmm0 diff --git a/src/crypto/asm/CryptonightWOW_template.inc b/src/crypto/asm/CryptonightWOW_template.inc new file mode 100644 index 00000000..7183a659 --- /dev/null +++ b/src/crypto/asm/CryptonightWOW_template.inc @@ -0,0 +1,486 @@ +PUBLIC FN_PREFIX(CryptonightWOW_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_end) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movq xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movq xmm0, r12 + movq xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movq xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movq xmm0, r15 + movq xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movq r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightWOW_template_part2): + mov rax, r13 + mul r12 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightWOW_template_mainloop) + +FN_PREFIX(CryptonightWOW_template_part3): + movq rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightWOW_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movq xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movq xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movq xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movq xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movq xmm0, rcx + mov r11d, 524288 + movq xmm10, rax + punpcklqdq xmm10, xmm0 + + movq xmm14, QWORD PTR [rsp+128] + movq xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movq xmm0, r12 + mov ecx, ebx + movq xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movq rdx, xmm6 + movq xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movq xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movq rdi, xmm5 + movq rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movq xmm0, rsp + movq xmm1, rsi + movq xmm2, rdi + movq xmm11, rbp + movq xmm12, r15 + movq xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightWOW_template_double_part2): + + movq rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movq rsi, xmm1 + movq rdi, xmm2 + movq rbp, xmm11 + movq r15, xmm12 + movq rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movq xmm1, rdx + movq xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movq rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movq xmm0, rsp + movq xmm1, rbx + movq xmm2, rsi + movq xmm11, rdi + movq xmm12, rbp + movq xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movq xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightWOW_template_double_part3): + + movq rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movq rbx, xmm1 + movq rsi, xmm2 + movq rdi, xmm11 + movq rbp, xmm12 + movq r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movq xmm1, rdx + movq xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movq rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightWOW_template_double_mainloop) + +FN_PREFIX(CryptonightWOW_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightWOW_template_double_end): diff --git a/src/crypto/asm/CryptonightWOW_template_win.inc b/src/crypto/asm/CryptonightWOW_template_win.inc new file mode 100644 index 00000000..c5652e27 --- /dev/null +++ b/src/crypto/asm/CryptonightWOW_template_win.inc @@ -0,0 +1,486 @@ +PUBLIC CryptonightWOW_template_part1 +PUBLIC CryptonightWOW_template_mainloop +PUBLIC CryptonightWOW_template_part2 +PUBLIC CryptonightWOW_template_part3 +PUBLIC CryptonightWOW_template_end +PUBLIC CryptonightWOW_template_double_part1 +PUBLIC CryptonightWOW_template_double_mainloop +PUBLIC CryptonightWOW_template_double_part2 +PUBLIC CryptonightWOW_template_double_part3 +PUBLIC CryptonightWOW_template_double_part4 +PUBLIC CryptonightWOW_template_double_end + +ALIGN(64) +CryptonightWOW_template_part1: + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movq xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movq xmm0, r12 + movq xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movq xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +CryptonightWOW_template_mainloop: + movdqa xmm5, XMMWORD PTR [r9+r11] + movq xmm0, r15 + movq xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movq r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +CryptonightWOW_template_part2: + mov rax, r13 + mul r12 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz CryptonightWOW_template_mainloop + +CryptonightWOW_template_part3: + movq rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +CryptonightWOW_template_end: + +ALIGN(64) +CryptonightWOW_template_double_part1: + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movq xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movq xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movq xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movq xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movq xmm0, rcx + mov r11d, 524288 + movq xmm10, rax + punpcklqdq xmm10, xmm0 + + movq xmm14, QWORD PTR [rsp+128] + movq xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +CryptonightWOW_template_double_mainloop: + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movq xmm0, r12 + mov ecx, ebx + movq xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movq rdx, xmm6 + movq xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movq xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movq rdi, xmm5 + movq rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movq xmm0, rsp + movq xmm1, rsi + movq xmm2, rdi + movq xmm11, rbp + movq xmm12, r15 + movq xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +CryptonightWOW_template_double_part2: + + movq rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movq rsi, xmm1 + movq rdi, xmm2 + movq rbp, xmm11 + movq r15, xmm12 + movq rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movq xmm1, rdx + movq xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movq rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movq xmm0, rsp + movq xmm1, rbx + movq xmm2, rsi + movq xmm11, rdi + movq xmm12, rbp + movq xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movq xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +CryptonightWOW_template_double_part3: + + movq rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movq rbx, xmm1 + movq rsi, xmm2 + movq rdi, xmm11 + movq rbp, xmm12 + movq r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movq xmm1, rdx + movq xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movq rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz CryptonightWOW_template_double_mainloop + +CryptonightWOW_template_double_part4: + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +CryptonightWOW_template_double_end: diff --git a/src/crypto/asm/win64/CryptonightR_template.S b/src/crypto/asm/win64/CryptonightR_template.S index e8478beb..4390bd45 100644 --- a/src/crypto/asm/win64/CryptonightR_template.S +++ b/src/crypto/asm/win64/CryptonightR_template.S @@ -538,16 +538,16 @@ FN_PREFIX(CryptonightR_instruction1): FN_PREFIX(CryptonightR_instruction2): imul rbx, rbx FN_PREFIX(CryptonightR_instruction3): - add rbx, rbx + add rbx, r9 add rbx, 2147483647 FN_PREFIX(CryptonightR_instruction4): - sub rbx, rbx + sub rbx, r9 FN_PREFIX(CryptonightR_instruction5): ror ebx, cl FN_PREFIX(CryptonightR_instruction6): rol ebx, cl FN_PREFIX(CryptonightR_instruction7): - xor rbx, rbx + xor rbx, r9 FN_PREFIX(CryptonightR_instruction8): imul rsi, rbx FN_PREFIX(CryptonightR_instruction9): @@ -623,16 +623,16 @@ FN_PREFIX(CryptonightR_instruction41): FN_PREFIX(CryptonightR_instruction42): imul rsi, rsi FN_PREFIX(CryptonightR_instruction43): - add rsi, rsi + add rsi, r9 add rsi, 2147483647 FN_PREFIX(CryptonightR_instruction44): - sub rsi, rsi + sub rsi, r9 FN_PREFIX(CryptonightR_instruction45): ror esi, cl FN_PREFIX(CryptonightR_instruction46): rol esi, cl FN_PREFIX(CryptonightR_instruction47): - xor rsi, rsi + xor rsi, r9 FN_PREFIX(CryptonightR_instruction48): imul rdi, rsi FN_PREFIX(CryptonightR_instruction49): @@ -708,16 +708,16 @@ FN_PREFIX(CryptonightR_instruction81): FN_PREFIX(CryptonightR_instruction82): imul rdi, rdi FN_PREFIX(CryptonightR_instruction83): - add rdi, rdi + add rdi, r9 add rdi, 2147483647 FN_PREFIX(CryptonightR_instruction84): - sub rdi, rdi + sub rdi, r9 FN_PREFIX(CryptonightR_instruction85): ror edi, cl FN_PREFIX(CryptonightR_instruction86): rol edi, cl FN_PREFIX(CryptonightR_instruction87): - xor rdi, rdi + xor rdi, r9 FN_PREFIX(CryptonightR_instruction88): imul rbp, rdi FN_PREFIX(CryptonightR_instruction89): @@ -793,16 +793,16 @@ FN_PREFIX(CryptonightR_instruction121): FN_PREFIX(CryptonightR_instruction122): imul rbp, rbp FN_PREFIX(CryptonightR_instruction123): - add rbp, rbp + add rbp, r9 add rbp, 2147483647 FN_PREFIX(CryptonightR_instruction124): - sub rbp, rbp + sub rbp, r9 FN_PREFIX(CryptonightR_instruction125): ror ebp, cl FN_PREFIX(CryptonightR_instruction126): rol ebp, cl FN_PREFIX(CryptonightR_instruction127): - xor rbp, rbp + xor rbp, r9 FN_PREFIX(CryptonightR_instruction128): imul rbx, rsp FN_PREFIX(CryptonightR_instruction129): diff --git a/src/crypto/asm/win64/CryptonightR_template.asm b/src/crypto/asm/win64/CryptonightR_template.asm index ec8ad5af..a16cf328 100644 --- a/src/crypto/asm/win64/CryptonightR_template.asm +++ b/src/crypto/asm/win64/CryptonightR_template.asm @@ -525,16 +525,16 @@ CryptonightR_instruction1: CryptonightR_instruction2: imul rbx, rbx CryptonightR_instruction3: - add rbx, rbx + add rbx, r9 add rbx, 2147483647 CryptonightR_instruction4: - sub rbx, rbx + sub rbx, r9 CryptonightR_instruction5: ror ebx, cl CryptonightR_instruction6: rol ebx, cl CryptonightR_instruction7: - xor rbx, rbx + xor rbx, r9 CryptonightR_instruction8: imul rsi, rbx CryptonightR_instruction9: @@ -610,16 +610,16 @@ CryptonightR_instruction41: CryptonightR_instruction42: imul rsi, rsi CryptonightR_instruction43: - add rsi, rsi + add rsi, r9 add rsi, 2147483647 CryptonightR_instruction44: - sub rsi, rsi + sub rsi, r9 CryptonightR_instruction45: ror esi, cl CryptonightR_instruction46: rol esi, cl CryptonightR_instruction47: - xor rsi, rsi + xor rsi, r9 CryptonightR_instruction48: imul rdi, rsi CryptonightR_instruction49: @@ -695,16 +695,16 @@ CryptonightR_instruction81: CryptonightR_instruction82: imul rdi, rdi CryptonightR_instruction83: - add rdi, rdi + add rdi, r9 add rdi, 2147483647 CryptonightR_instruction84: - sub rdi, rdi + sub rdi, r9 CryptonightR_instruction85: ror edi, cl CryptonightR_instruction86: rol edi, cl CryptonightR_instruction87: - xor rdi, rdi + xor rdi, r9 CryptonightR_instruction88: imul rbp, rdi CryptonightR_instruction89: @@ -780,16 +780,16 @@ CryptonightR_instruction121: CryptonightR_instruction122: imul rbp, rbp CryptonightR_instruction123: - add rbp, rbp + add rbp, r9 add rbp, 2147483647 CryptonightR_instruction124: - sub rbp, rbp + sub rbp, r9 CryptonightR_instruction125: ror ebp, cl CryptonightR_instruction126: rol ebp, cl CryptonightR_instruction127: - xor rbp, rbp + xor rbp, r9 CryptonightR_instruction128: imul rbx, rsp CryptonightR_instruction129: diff --git a/src/crypto/asm/win64/CryptonightR_template.inc b/src/crypto/asm/win64/CryptonightR_template.inc index f6e6ef45..1dae434a 100644 --- a/src/crypto/asm/win64/CryptonightR_template.inc +++ b/src/crypto/asm/win64/CryptonightR_template.inc @@ -10,6 +10,7 @@ PUBLIC FN_PREFIX(CryptonightR_template_double_part3) PUBLIC FN_PREFIX(CryptonightR_template_double_part4) PUBLIC FN_PREFIX(CryptonightR_template_double_end) +ALIGN(64) FN_PREFIX(CryptonightR_template_part1): mov QWORD PTR [rsp+16], rbx mov QWORD PTR [rsp+24], rbp @@ -68,8 +69,6 @@ FN_PREFIX(CryptonightR_template_mainloop): lea rdx, QWORD PTR [r9+r11] aesenc xmm5, xmm4 - movd r10d, xmm5 - and r10d, 2097136 mov r12d, r9d mov eax, r9d @@ -77,16 +76,23 @@ FN_PREFIX(CryptonightR_template_mainloop): xor r12d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm1, XMMWORD PTR [rax+r11] - paddq xmm0, xmm7 + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 paddq xmm2, xmm6 paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm0 - movd r12, xmm5 + movdqu XMMWORD PTR [r12+r11], xmm3 movdqu XMMWORD PTR [rax+r11], xmm2 movdqu XMMWORD PTR [r9+r11], xmm1 + movd r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -101,13 +107,23 @@ FN_PREFIX(CryptonightR_template_mainloop): movd eax, xmm6 movd edx, xmm7 + pextrd r9d, xmm7, 2 FN_PREFIX(CryptonightR_template_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + mov rax, r13 mul r12 - movd xmm0, rax - movd xmm3, rdx - punpcklqdq xmm3, xmm0 mov r9d, r10d mov r12d, r10d @@ -115,16 +131,18 @@ FN_PREFIX(CryptonightR_template_part2): xor r12d, 32 xor r10d, 48 movdqa xmm1, XMMWORD PTR [r12+r11] - xor rdx, QWORD PTR [r12+r11] - xor rax, QWORD PTR [r11+r12+8] + movaps xmm3, xmm1 movdqa xmm2, XMMWORD PTR [r9+r11] - pxor xmm3, xmm2 - paddq xmm7, XMMWORD PTR [r10+r11] - paddq xmm1, xmm4 - paddq xmm3, xmm6 - movdqu XMMWORD PTR [r9+r11], xmm7 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [r10+r11], xmm1 + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 add r15, rax @@ -247,18 +265,21 @@ FN_PREFIX(CryptonightR_template_double_mainloop): punpcklqdq xmm3, xmm0 xor ebx, 16 aesenc xmm6, xmm3 - movd rdx, xmm6 movd xmm4, r15 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 xor ebx, 48 paddq xmm0, xmm7 movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 movdqu XMMWORD PTR [rbx+rsi], xmm0 paddq xmm1, xmm3 xor ebx, 16 mov eax, ebx xor rax, 32 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movd rdx, xmm6 movdqu XMMWORD PTR [rbx+rsi], xmm1 paddq xmm0, xmm9 movdqu XMMWORD PTR [rax+rsi], xmm0 @@ -274,15 +295,18 @@ FN_PREFIX(CryptonightR_template_double_mainloop): xor r8d, 16 aesenc xmm5, xmm4 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 xor r8d, 48 paddq xmm0, xmm8 movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 movdqu XMMWORD PTR [r8+rdi], xmm0 paddq xmm1, xmm4 xor r8d, 16 mov eax, r8d xor rax, 32 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 movdqu XMMWORD PTR [r8+rdi], xmm1 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rdi], xmm0 @@ -303,7 +327,8 @@ FN_PREFIX(CryptonightR_template_double_mainloop): movd xmm11, rbp movd xmm12, r15 movd xmm13, rdx - mov [rsp+112], rcx + mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp+16] mov esi, DWORD PTR [rsp+20] @@ -320,9 +345,22 @@ FN_PREFIX(CryptonightR_template_double_mainloop): pextrd r15d, xmm3, 2 movd eax, xmm7 movd edx, xmm9 + pextrd r9d, xmm9, 2 FN_PREFIX(CryptonightR_template_double_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + movd rsp, xmm0 mov DWORD PTR [rsp+16], ebx mov DWORD PTR [rsp+20], esi @@ -334,28 +372,27 @@ FN_PREFIX(CryptonightR_template_double_part2): movd rbp, xmm11 movd r15, xmm12 movd rdx, xmm13 - mov rcx, [rsp+112] + mov rcx, [rsp+104] + mov r9, [rsp+112] mov rbx, r8 mov rax, r8 mul rdx and ebp, 2097136 mov r8, rax - movd xmm1, rdx - movd xmm0, r8 - punpcklqdq xmm1, xmm0 - pxor xmm1, XMMWORD PTR [rcx+rsi] + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 xor esi, 48 paddq xmm1, xmm7 movdqu xmm2, XMMWORD PTR [rsi+rcx] - xor rdx, QWORD PTR [rsi+rcx] + pxor xmm6, xmm2 paddq xmm2, xmm3 - xor r8, QWORD PTR [rsi+rcx+8] movdqu XMMWORD PTR [rsi+rcx], xmm1 xor esi, 16 mov eax, esi mov rsi, rcx movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 movdqu XMMWORD PTR [rax+rcx], xmm2 paddq xmm0, xmm9 add r12, r8 @@ -383,6 +420,7 @@ FN_PREFIX(CryptonightR_template_double_part2): movd xmm12, rbp movd xmm13, r15 mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp] mov esi, DWORD PTR [rsp+4] @@ -401,9 +439,24 @@ FN_PREFIX(CryptonightR_template_double_part2): pextrd r15d, xmm4, 2 movd eax, xmm8 movd edx, xmm10 + pextrd r9d, xmm10, 2 FN_PREFIX(CryptonightR_template_double_part3): + movd r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + movd rsp, xmm0 mov DWORD PTR [rsp], ebx mov DWORD PTR [rsp+4], esi @@ -414,23 +467,20 @@ FN_PREFIX(CryptonightR_template_double_part3): movd rsi, xmm2 movd rdi, xmm11 movd rbp, xmm12 - movd r15, xmm13 mov rcx, [rsp+104] + mov r9, [rsp+112] mov rax, r8 mul rdi - movd xmm1, rdx - movd xmm0, rax - punpcklqdq xmm1, xmm0 mov rdi, rcx mov r8, rax - pxor xmm1, XMMWORD PTR [rbp+rcx] + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 xor ebp, 48 paddq xmm1, xmm8 - xor r8, QWORD PTR [rbp+rcx+8] - xor rdx, QWORD PTR [rbp+rcx] add r13, r8 movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 add r15, rdx movdqu XMMWORD PTR [rbp+rcx], xmm1 paddq xmm2, xmm4 @@ -438,6 +488,7 @@ FN_PREFIX(CryptonightR_template_double_part3): mov eax, ebp xor rax, 32 movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 movdqu XMMWORD PTR [rbp+rcx], xmm2 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rcx], xmm0 diff --git a/src/crypto/asm/win64/CryptonightR_template_win.inc b/src/crypto/asm/win64/CryptonightR_template_win.inc index b0217e04..2f2d71a2 100644 --- a/src/crypto/asm/win64/CryptonightR_template_win.inc +++ b/src/crypto/asm/win64/CryptonightR_template_win.inc @@ -10,6 +10,7 @@ PUBLIC CryptonightR_template_double_part3 PUBLIC CryptonightR_template_double_part4 PUBLIC CryptonightR_template_double_end +ALIGN(64) CryptonightR_template_part1: mov QWORD PTR [rsp+16], rbx mov QWORD PTR [rsp+24], rbp @@ -68,8 +69,6 @@ CryptonightR_template_mainloop: lea rdx, QWORD PTR [r9+r11] aesenc xmm5, xmm4 - movd r10d, xmm5 - and r10d, 2097136 mov r12d, r9d mov eax, r9d @@ -77,16 +76,23 @@ CryptonightR_template_mainloop: xor r12d, 16 xor eax, 32 movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 movdqu xmm2, XMMWORD PTR [r12+r11] movdqu xmm1, XMMWORD PTR [rax+r11] - paddq xmm0, xmm7 + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 paddq xmm2, xmm6 paddq xmm1, xmm4 - movdqu XMMWORD PTR [r12+r11], xmm0 - movd r12, xmm5 + movdqu XMMWORD PTR [r12+r11], xmm3 movdqu XMMWORD PTR [rax+r11], xmm2 movdqu XMMWORD PTR [r9+r11], xmm1 + movd r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + movdqa xmm0, xmm5 pxor xmm0, xmm6 movdqu XMMWORD PTR [rdx], xmm0 @@ -101,13 +107,23 @@ CryptonightR_template_mainloop: movd eax, xmm6 movd edx, xmm7 + pextrd r9d, xmm7, 2 CryptonightR_template_part2: + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + mov rax, r13 mul r12 - movd xmm0, rax - movd xmm3, rdx - punpcklqdq xmm3, xmm0 mov r9d, r10d mov r12d, r10d @@ -115,16 +131,18 @@ CryptonightR_template_part2: xor r12d, 32 xor r10d, 48 movdqa xmm1, XMMWORD PTR [r12+r11] - xor rdx, QWORD PTR [r12+r11] - xor rax, QWORD PTR [r11+r12+8] + movaps xmm3, xmm1 movdqa xmm2, XMMWORD PTR [r9+r11] - pxor xmm3, xmm2 - paddq xmm7, XMMWORD PTR [r10+r11] - paddq xmm1, xmm4 - paddq xmm3, xmm6 - movdqu XMMWORD PTR [r9+r11], xmm7 - movdqu XMMWORD PTR [r12+r11], xmm3 - movdqu XMMWORD PTR [r10+r11], xmm1 + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 movdqa xmm7, xmm6 add r15, rax @@ -247,18 +265,21 @@ CryptonightR_template_double_mainloop: punpcklqdq xmm3, xmm0 xor ebx, 16 aesenc xmm6, xmm3 - movd rdx, xmm6 movd xmm4, r15 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 xor ebx, 48 paddq xmm0, xmm7 movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 movdqu XMMWORD PTR [rbx+rsi], xmm0 paddq xmm1, xmm3 xor ebx, 16 mov eax, ebx xor rax, 32 movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movd rdx, xmm6 movdqu XMMWORD PTR [rbx+rsi], xmm1 paddq xmm0, xmm9 movdqu XMMWORD PTR [rax+rsi], xmm0 @@ -274,15 +295,18 @@ CryptonightR_template_double_mainloop: xor r8d, 16 aesenc xmm5, xmm4 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 xor r8d, 48 paddq xmm0, xmm8 movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 movdqu XMMWORD PTR [r8+rdi], xmm0 paddq xmm1, xmm4 xor r8d, 16 mov eax, r8d xor rax, 32 movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 movdqu XMMWORD PTR [r8+rdi], xmm1 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rdi], xmm0 @@ -303,7 +327,8 @@ CryptonightR_template_double_mainloop: movd xmm11, rbp movd xmm12, r15 movd xmm13, rdx - mov [rsp+112], rcx + mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp+16] mov esi, DWORD PTR [rsp+20] @@ -320,9 +345,22 @@ CryptonightR_template_double_mainloop: pextrd r15d, xmm3, 2 movd eax, xmm7 movd edx, xmm9 + pextrd r9d, xmm9, 2 CryptonightR_template_double_part2: + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + movd rsp, xmm0 mov DWORD PTR [rsp+16], ebx mov DWORD PTR [rsp+20], esi @@ -334,28 +372,27 @@ CryptonightR_template_double_part2: movd rbp, xmm11 movd r15, xmm12 movd rdx, xmm13 - mov rcx, [rsp+112] + mov rcx, [rsp+104] + mov r9, [rsp+112] mov rbx, r8 mov rax, r8 mul rdx and ebp, 2097136 mov r8, rax - movd xmm1, rdx - movd xmm0, r8 - punpcklqdq xmm1, xmm0 - pxor xmm1, XMMWORD PTR [rcx+rsi] + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 xor esi, 48 paddq xmm1, xmm7 movdqu xmm2, XMMWORD PTR [rsi+rcx] - xor rdx, QWORD PTR [rsi+rcx] + pxor xmm6, xmm2 paddq xmm2, xmm3 - xor r8, QWORD PTR [rsi+rcx+8] movdqu XMMWORD PTR [rsi+rcx], xmm1 xor esi, 16 mov eax, esi mov rsi, rcx movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 movdqu XMMWORD PTR [rax+rcx], xmm2 paddq xmm0, xmm9 add r12, r8 @@ -383,6 +420,7 @@ CryptonightR_template_double_part2: movd xmm12, rbp movd xmm13, r15 mov [rsp+104], rcx + mov [rsp+112], r9 mov ebx, DWORD PTR [rsp] mov esi, DWORD PTR [rsp+4] @@ -401,9 +439,24 @@ CryptonightR_template_double_part2: pextrd r15d, xmm4, 2 movd eax, xmm8 movd edx, xmm10 + pextrd r9d, xmm10, 2 CryptonightR_template_double_part3: + movd r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + movd rsp, xmm0 mov DWORD PTR [rsp], ebx mov DWORD PTR [rsp+4], esi @@ -414,23 +467,20 @@ CryptonightR_template_double_part3: movd rsi, xmm2 movd rdi, xmm11 movd rbp, xmm12 - movd r15, xmm13 mov rcx, [rsp+104] + mov r9, [rsp+112] mov rax, r8 mul rdi - movd xmm1, rdx - movd xmm0, rax - punpcklqdq xmm1, xmm0 mov rdi, rcx mov r8, rax - pxor xmm1, XMMWORD PTR [rbp+rcx] + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 xor ebp, 48 paddq xmm1, xmm8 - xor r8, QWORD PTR [rbp+rcx+8] - xor rdx, QWORD PTR [rbp+rcx] add r13, r8 movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 add r15, rdx movdqu XMMWORD PTR [rbp+rcx], xmm1 paddq xmm2, xmm4 @@ -438,6 +488,7 @@ CryptonightR_template_double_part3: mov eax, ebp xor rax, 32 movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 movdqu XMMWORD PTR [rbp+rcx], xmm2 paddq xmm0, xmm10 movdqu XMMWORD PTR [rax+rcx], xmm0 diff --git a/src/crypto/asm/win64/CryptonightWOW_template.inc b/src/crypto/asm/win64/CryptonightWOW_template.inc new file mode 100644 index 00000000..47fbc94f --- /dev/null +++ b/src/crypto/asm/win64/CryptonightWOW_template.inc @@ -0,0 +1,486 @@ +PUBLIC FN_PREFIX(CryptonightWOW_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_end) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movd r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightWOW_template_part2): + mov rax, r13 + mul r12 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightWOW_template_mainloop) + +FN_PREFIX(CryptonightWOW_template_part3): + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightWOW_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd rdx, xmm6 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightWOW_template_double_part2): + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movd xmm1, rdx + movd xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightWOW_template_double_part3): + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + movd r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movd xmm1, rdx + movd xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightWOW_template_double_mainloop) + +FN_PREFIX(CryptonightWOW_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightWOW_template_double_end): diff --git a/src/crypto/asm/win64/CryptonightWOW_template_win.inc b/src/crypto/asm/win64/CryptonightWOW_template_win.inc new file mode 100644 index 00000000..9db2cf39 --- /dev/null +++ b/src/crypto/asm/win64/CryptonightWOW_template_win.inc @@ -0,0 +1,486 @@ +PUBLIC CryptonightWOW_template_part1 +PUBLIC CryptonightWOW_template_mainloop +PUBLIC CryptonightWOW_template_part2 +PUBLIC CryptonightWOW_template_part3 +PUBLIC CryptonightWOW_template_end +PUBLIC CryptonightWOW_template_double_part1 +PUBLIC CryptonightWOW_template_double_mainloop +PUBLIC CryptonightWOW_template_double_part2 +PUBLIC CryptonightWOW_template_double_part3 +PUBLIC CryptonightWOW_template_double_part4 +PUBLIC CryptonightWOW_template_double_end + +ALIGN(64) +CryptonightWOW_template_part1: + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +CryptonightWOW_template_mainloop: + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movd r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +CryptonightWOW_template_part2: + mov rax, r13 + mul r12 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz CryptonightWOW_template_mainloop + +CryptonightWOW_template_part3: + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +CryptonightWOW_template_end: + +ALIGN(64) +CryptonightWOW_template_double_part1: + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +CryptonightWOW_template_double_mainloop: + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd rdx, xmm6 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +CryptonightWOW_template_double_part2: + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movd xmm1, rdx + movd xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +CryptonightWOW_template_double_part3: + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + movd r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movd xmm1, rdx + movd xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz CryptonightWOW_template_double_mainloop + +CryptonightWOW_template_double_part4: + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +CryptonightWOW_template_double_end: diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h index 793e9e03..1f3ea0ac 100644 --- a/src/crypto/variant4_random_math.h +++ b/src/crypto/variant4_random_math.h @@ -12,8 +12,11 @@ enum V4_Settings TOTAL_LATENCY = 15 * 3, // Always generate at least 60 instructions - NUM_INSTRUCTIONS = 60, - + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + // Available ALUs for MUL // Modern CPUs typically have only 1 ALU which can do multiplications ALU_COUNT_MUL = 1, @@ -38,10 +41,9 @@ enum V4_InstructionList // V4_InstructionDefinition is used to generate code from random data // Every random sequence of bytes is a valid code // -// There are 8 registers in total: +// There are 9 registers in total: // - 4 variable registers -// - 4 constant registers initialized from loop variables -// +// - 5 constant registers initialized from loop variables // This is why dst_index is 2 bits enum V4_InstructionDefinition { @@ -144,16 +146,16 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: // - // 60 28495 - // 61 106077 - // 62 2455855 - // 63 5114930 - // 64 1020868 - // 65 1109026 - // 66 151756 - // 67 8429 - // 68 4477 - // 69 87 + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 // Unroll 70 instructions here V4_EXEC_10(0); // instructions 0-9 @@ -179,6 +181,8 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed } // Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +template static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) { // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle @@ -200,6 +204,10 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh memset(data, 0, sizeof(data)); uint64_t tmp = SWAP64LE(height); memcpy(data, &tmp, sizeof(uint64_t)); + if (VARIANT == xmrig::VARIANT_4) + { + data[20] = -38; + } // Set data_index past the last byte in data // to trigger full data update with blake hash @@ -207,18 +215,22 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh size_t data_index = sizeof(data); int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; do { - int latency[8]; - int asic_latency[8]; + int latency[9]; + int asic_latency[9]; // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution // byte 0: current value of the destination register // byte 1: instruction opcode // byte 2: current value of the source register // - // Registers R4-R7 are constant and are treated as having the same value because when we do + // Registers R4-R8 are constant and are treated as having the same value because when we do // the same operation twice with two constant source registers, it can be optimized into a single operation - uint32_t inst_data[8] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; bool is_rotation[V4_INSTRUCTION_COUNT]; @@ -237,6 +249,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh code_size = 0; int total_iterations = 0; + r8_used = (VARIANT == xmrig::VARIANT_WOW); // Generate random code to achieve minimal required latency for our abstract CPU // Try to get this latency for all 4 registers @@ -281,7 +294,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) { // a is always < 4, so we don't need to check bounds here - b = a + 4; + b = (VARIANT == xmrig::VARIANT_WOW) ? (a + 4) : 8; src_index = b; } @@ -362,6 +375,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh code[code_size].src_index = src_index; code[code_size].C = 0; + if (src_index == 8) + { + r8_used = true; + } + if (opcode == ADD) { // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too @@ -376,7 +394,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } ++code_size; - if (code_size >= NUM_INSTRUCTIONS) + if (code_size >= NUM_INSTRUCTIONS_MIN) { break; } @@ -391,7 +409,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC // Get this latency for at least 1 of the 4 registers const int prev_code_size = code_size; - while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) { int min_idx = 0; int max_idx = 0; @@ -413,9 +431,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh ++code_size; } - // There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely - } while (code_size < NUM_INSTRUCTIONS); + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here // Add final instruction to stop the interpreter code[code_size].opcode = RET; code[code_size].dst_index = 0; diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 4b539bb3..3d5ec59e 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -175,6 +175,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a add_asm_func(asm_func_map); add_asm_func(asm_func_map); add_asm_func(asm_func_map); + add_asm_func(asm_func_map); add_asm_func(asm_func_map); @@ -310,6 +311,17 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a cryptonight_quad_hash, cryptonight_penta_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + # ifndef XMRIG_NO_AEON cryptonight_single_hash, cryptonight_double_hash, @@ -344,6 +356,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -358,6 +371,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # endif # ifndef XMRIG_NO_SUMO @@ -406,6 +420,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -420,6 +435,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # endif # ifndef XMRIG_NO_CN_PICO @@ -447,6 +463,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -461,6 +478,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_TRTL nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 # endif }; diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index 2a6153d5..946fbda6 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -60,6 +60,10 @@ bool MultiWorker::selfTest() LOG_WARN("CryptonightR (Wownero) self-test failed"); return false; } + if (!verify2(VARIANT_4, test_input_R)) { + LOG_WARN("CryptonightR self-test failed"); + return false; + } const bool rc = verify(VARIANT_0, test_output_v0) && verify(VARIANT_1, test_output_v1) &&