diff --git a/src/backend/cpu/CpuWorker.h b/src/backend/cpu/CpuWorker.h index a15378ed9..ade256ec5 100644 --- a/src/backend/cpu/CpuWorker.h +++ b/src/backend/cpu/CpuWorker.h @@ -55,7 +55,7 @@ public: size_t threads() const override { # ifdef XMRIG_ALGO_GHOSTRIDER - return m_ghHelper ? 2 : 1; + return ((m_algorithm.family() == Algorithm::GHOSTRIDER) && m_ghHelper) ? 2 : 1; # else return 1; # endif diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index 42a82f6a6..81d9395e5 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -30,6 +30,12 @@ #endif +#include "crypto/cn/CryptoNight_monero.h" +#ifdef XMRIG_VAES +# include "crypto/cn/CryptoNight_x86_vaes.h" +#endif + + #include "backend/cpu/platform/BasicCpuInfo.h" #include "3rdparty/rapidjson/document.h" #include "crypto/common/Assembly.h" @@ -294,6 +300,9 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : } } # endif + + cn_sse41_enabled = has(FLAG_SSE41); + cn_vaes_enabled = has(FLAG_VAES); } diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 0df12bcac..5e0918f57 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -66,6 +66,10 @@ } while (0) +bool cn_sse41_enabled = false; +bool cn_vaes_enabled = false; + + namespace xmrig { @@ -97,6 +101,27 @@ cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm = nullptr; cn_mainloop_fun cn_upx2_mainloop_asm = nullptr; cn_mainloop_fun cn_upx2_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr0_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_single_mainloop_asm = nullptr; + +cn_mainloop_fun cn_gr0_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_double_mainloop_asm = nullptr; + +cn_mainloop_fun cn_gr0_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_quad_mainloop_asm = nullptr; + template static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask = CnAlgo().mask()) @@ -136,7 +161,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma static void patchAsmVariants() { - const int allocation_size = 131072; + constexpr size_t allocation_size = 0x20000; auto base = static_cast(VirtualMemory::allocateExecutableMemory(allocation_size, false)); cn_half_mainloop_ivybridge_asm = reinterpret_cast (base + 0x0000); @@ -173,6 +198,29 @@ static void patchAsmVariants() cn_upx2_double_mainloop_asm = reinterpret_cast (base + 0x15000); # endif +# ifdef XMRIG_ALGO_GHOSTRIDER + cn_gr0_single_mainloop_asm = reinterpret_cast (base + 0x16000); + cn_gr1_single_mainloop_asm = reinterpret_cast (base + 0x16800); + cn_gr2_single_mainloop_asm = reinterpret_cast (base + 0x17000); + cn_gr3_single_mainloop_asm = reinterpret_cast (base + 0x17800); + cn_gr4_single_mainloop_asm = reinterpret_cast (base + 0x18000); + cn_gr5_single_mainloop_asm = reinterpret_cast (base + 0x18800); + + cn_gr0_double_mainloop_asm = reinterpret_cast (base + 0x19000); + cn_gr1_double_mainloop_asm = reinterpret_cast (base + 0x19800); + cn_gr2_double_mainloop_asm = reinterpret_cast (base + 0x1A000); + cn_gr3_double_mainloop_asm = reinterpret_cast (base + 0x1A800); + cn_gr4_double_mainloop_asm = reinterpret_cast (base + 0x1B000); + cn_gr5_double_mainloop_asm = reinterpret_cast (base + 0x1B800); + + cn_gr0_quad_mainloop_asm = reinterpret_cast (base + 0x1C000); + cn_gr1_quad_mainloop_asm = reinterpret_cast (base + 0x1C800); + cn_gr2_quad_mainloop_asm = reinterpret_cast (base + 0x1D000); + cn_gr3_quad_mainloop_asm = reinterpret_cast (base + 0x1D800); + cn_gr4_quad_mainloop_asm = reinterpret_cast (base + 0x1E000); + cn_gr5_quad_mainloop_asm = reinterpret_cast (base + 0x1E800); +# endif + { constexpr uint32_t ITER = CnAlgo().iterations(); @@ -230,7 +278,30 @@ static void patchAsmVariants() patchCode(cn_upx2_mainloop_asm, cnv2_rwz_mainloop_asm, ITER, MASK); patchCode(cn_upx2_double_mainloop_asm, cnv2_rwz_double_mainloop_asm, ITER, MASK); } -#endif +# endif + +# ifdef XMRIG_ALGO_GHOSTRIDER + patchCode(cn_gr0_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + + patchCode(cn_gr0_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + + patchCode(cn_gr0_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); +# endif VirtualMemory::protectRX(base, allocation_size); VirtualMemory::flushInstructionCache(base, allocation_size); diff --git a/src/crypto/cn/CryptoNight.h b/src/crypto/cn/CryptoNight.h index fc8d67870..897890d28 100644 --- a/src/crypto/cn/CryptoNight.h +++ b/src/crypto/cn/CryptoNight.h @@ -52,8 +52,10 @@ struct cryptonight_r_data { struct cryptonight_ctx { alignas(16) uint8_t state[224]; alignas(16) uint8_t *memory; + const uint32_t* tweak1_table; + uint64_t tweak1_2; - uint8_t unused[40]; + uint8_t unused[24]; const uint32_t *saes_table; cn_mainloop_fun_ms_abi generated_code; diff --git a/src/crypto/cn/CryptoNight_monero.h b/src/crypto/cn/CryptoNight_monero.h index f34c963c9..a9975e784 100644 --- a/src/crypto/cn/CryptoNight_monero.h +++ b/src/crypto/cn/CryptoNight_monero.h @@ -204,4 +204,7 @@ v4_random_math(code##part, r##part); \ } +extern bool cn_sse41_enabled; +extern bool cn_vaes_enabled; + #endif /* XMRIG_CRYPTONIGHT_MONERO_H */ diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h index 2fe64edba..304477106 100644 --- a/src/crypto/cn/CryptoNight_x86.h +++ b/src/crypto/cn/CryptoNight_x86.h @@ -295,8 +295,8 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx) constexpr CnAlgo props; # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes(ctx); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes(ctx, props.memory(), props.half_mem()); return; } # endif @@ -409,8 +409,8 @@ static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx) constexpr CnAlgo props; # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes(ctx); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes(ctx, props.memory(), props.half_mem()); return; } # endif @@ -634,9 +634,31 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); } +template +void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height); + template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_single_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; + } + } + constexpr CnAlgo props; constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); @@ -822,13 +844,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si #ifdef XMRIG_FEATURE_ASM +extern "C" void cnv1_single_mainloop_asm(cryptonight_ctx * *ctx); +extern "C" void cnv1_double_mainloop_asm(cryptonight_ctx **ctx); +extern "C" void cnv1_quad_mainloop_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx); -extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx); +extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx **ctx); namespace xmrig { @@ -865,6 +890,28 @@ extern cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm; extern cn_mainloop_fun cn_upx2_mainloop_asm; extern cn_mainloop_fun cn_upx2_double_mainloop_asm; +extern cn_mainloop_fun cn_gr0_single_mainloop_asm; +extern cn_mainloop_fun cn_gr1_single_mainloop_asm; +extern cn_mainloop_fun cn_gr2_single_mainloop_asm; +extern cn_mainloop_fun cn_gr3_single_mainloop_asm; +extern cn_mainloop_fun cn_gr4_single_mainloop_asm; +extern cn_mainloop_fun cn_gr5_single_mainloop_asm; + +extern cn_mainloop_fun cn_gr0_double_mainloop_asm; +extern cn_mainloop_fun cn_gr1_double_mainloop_asm; +extern cn_mainloop_fun cn_gr2_double_mainloop_asm; +extern cn_mainloop_fun cn_gr3_double_mainloop_asm; +extern cn_mainloop_fun cn_gr4_double_mainloop_asm; +extern cn_mainloop_fun cn_gr5_double_mainloop_asm; + +extern cn_mainloop_fun cn_gr0_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr1_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr2_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr3_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr4_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr5_quad_mainloop_asm; + + } // namespace xmrig @@ -1017,8 +1064,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ } # ifdef XMRIG_VAES - if (!props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1065,8 +1112,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ } # ifdef XMRIG_VAES - if (!props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1090,9 +1137,126 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ namespace xmrig { +template +static NOINLINE void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) +{ + constexpr CnAlgo props; + constexpr Algorithm::Id BASE = props.base(); + + if (BASE == Algorithm::CN_1 && size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state); + + if (props.half_mem()) { + ctx[0]->first_half = true; + } + cn_explode_scratchpad(ctx[0]); + + VARIANT1_INIT(0); + ctx[0]->tweak1_2 = tweak1_2_0; + ctx[0]->tweak1_table = tweak1_table; + if (ALGO == Algorithm::CN_GR_0) cn_gr0_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_single_mainloop_asm(ctx); + + cn_implode_scratchpad(ctx[0]); + keccakf(reinterpret_cast(ctx[0]->state), 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +template +static NOINLINE void cryptonight_double_hash_gr_sse41(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) +{ + constexpr CnAlgo props; + constexpr Algorithm::Id BASE = props.base(); + + if (BASE == Algorithm::CN_1 && size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state); + keccak(input + size, size, ctx[1]->state); + + if (props.half_mem()) { + ctx[0]->first_half = true; + ctx[1]->first_half = true; + } + +# ifdef XMRIG_VAES + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + } + else +# endif + { + cn_explode_scratchpad(ctx[0]); + cn_explode_scratchpad(ctx[1]); + } + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + ctx[0]->tweak1_2 = tweak1_2_0; + ctx[1]->tweak1_2 = tweak1_2_1; + + ctx[0]->tweak1_table = tweak1_table; + + if (ALGO == Algorithm::CN_GR_0) cn_gr0_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_double_mainloop_asm(ctx); + +# ifdef XMRIG_VAES + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + } + else +# endif + { + cn_implode_scratchpad(ctx[0]); + cn_implode_scratchpad(ctx[1]); + } + + keccakf(reinterpret_cast(ctx[0]->state), 24); + keccakf(reinterpret_cast(ctx[1]->state), 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + template inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_double_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; + } + } + constexpr CnAlgo props; constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); @@ -1130,8 +1294,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1334,8 +1498,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1352,27 +1516,14 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } -static inline void cryptonight_monero_tweak_gr(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i cx) -{ - __m128i tmp = _mm_xor_si128(bx0, cx); - mem_out[0] = _mm_cvtsi128_si64(tmp); - - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - uint64_t vh = _mm_cvtsi128_si64(tmp); - - mem_out[1] = vh ^ tweak1_table[static_cast(vh) >> 24]; -} - - -template -void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) +template +static NOINLINE void cryptonight_quad_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) { constexpr CnAlgo props; - constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); if (BASE == Algorithm::CN_1 && size < 43) { - memset(output, 0, 64); + memset(output, 0, 32 * 4); return; } @@ -1381,21 +1532,6 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u keccak(input + size * 2, size, ctx[2]->state); keccak(input + size * 3, size, ctx[3]->state); - uint8_t* l0 = ctx[0]->memory; - uint8_t* l1 = ctx[1]->memory; - uint8_t* l2 = ctx[2]->memory; - uint8_t* l3 = ctx[3]->memory; - - uint64_t* h0 = reinterpret_cast(ctx[0]->state); - uint64_t* h1 = reinterpret_cast(ctx[1]->state); - uint64_t* h2 = reinterpret_cast(ctx[2]->state); - uint64_t* h3 = reinterpret_cast(ctx[3]->state); - - VARIANT1_INIT(0); - VARIANT1_INIT(1); - VARIANT1_INIT(2); - VARIANT1_INIT(3); - if (props.half_mem()) { ctx[0]->first_half = true; ctx[1]->first_half = true; @@ -1404,156 +1540,51 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_explode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif { - cn_explode_scratchpad(ctx[0]); - cn_explode_scratchpad(ctx[1]); - cn_explode_scratchpad(ctx[2]); - cn_explode_scratchpad(ctx[3]); + cn_explode_scratchpad(ctx[0]); + cn_explode_scratchpad(ctx[1]); + cn_explode_scratchpad(ctx[2]); + cn_explode_scratchpad(ctx[3]); } - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t al1 = h1[0] ^ h1[4]; - uint64_t al2 = h2[0] ^ h2[4]; - uint64_t al3 = h3[0] ^ h3[4]; + VARIANT1_INIT(0); ctx[0]->tweak1_2 = tweak1_2_0; + VARIANT1_INIT(1); ctx[1]->tweak1_2 = tweak1_2_1; + VARIANT1_INIT(2); ctx[2]->tweak1_2 = tweak1_2_2; + VARIANT1_INIT(3); ctx[3]->tweak1_2 = tweak1_2_3; - uint64_t ah0 = h0[1] ^ h0[5]; - uint64_t ah1 = h1[1] ^ h1[5]; - uint64_t ah2 = h2[1] ^ h2[5]; - uint64_t ah3 = h3[1] ^ h3[5]; + ctx[0]->tweak1_table = tweak1_table; - __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx20 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx30 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - - uint64_t idx0 = al0; - uint64_t idx1 = al1; - uint64_t idx2 = al2; - uint64_t idx3 = al3; - - __m128i cx0, cx1, cx2, cx3; - - if (!SOFT_AES) { - cx0 = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); - cx1 = _mm_load_si128(reinterpret_cast(&l1[idx1 & MASK])); - cx2 = _mm_load_si128(reinterpret_cast(&l2[idx2 & MASK])); - cx3 = _mm_load_si128(reinterpret_cast(&l3[idx3 & MASK])); - } - - for (size_t i = 0; i < props.iterations(); i++) { - const __m128i ax0 = _mm_set_epi64x(ah0, al0); - const __m128i ax1 = _mm_set_epi64x(ah1, al1); - const __m128i ax2 = _mm_set_epi64x(ah2, al2); - const __m128i ax3 = _mm_set_epi64x(ah3, al3); - - if (SOFT_AES) { - cx0 = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast(saes_table)); - cx1 = soft_aesenc(&l1[idx1 & MASK], ax1, reinterpret_cast(saes_table)); - cx2 = soft_aesenc(&l2[idx2 & MASK], ax2, reinterpret_cast(saes_table)); - cx3 = soft_aesenc(&l3[idx3 & MASK], ax3, reinterpret_cast(saes_table)); - } - else { - cx0 = _mm_aesenc_si128(cx0, ax0); - cx1 = _mm_aesenc_si128(cx1, ax1); - cx2 = _mm_aesenc_si128(cx2, ax2); - cx3 = _mm_aesenc_si128(cx3, ax3); - if (MASK > 131072) { - _mm_prefetch((const char*)(&l0[_mm_cvtsi128_si32(cx0) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l1[_mm_cvtsi128_si32(cx1) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l2[_mm_cvtsi128_si32(cx2) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l3[_mm_cvtsi128_si32(cx3) & MASK]), _MM_HINT_T0); - } - } - - cryptonight_monero_tweak_gr((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, cx0); - cryptonight_monero_tweak_gr((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, cx1); - cryptonight_monero_tweak_gr((uint64_t*)&l2[idx2 & MASK], l2, idx2 & MASK, ax2, bx20, cx2); - cryptonight_monero_tweak_gr((uint64_t*)&l3[idx3 & MASK], l3, idx3 & MASK, ax3, bx30, cx3); - - idx0 = _mm_cvtsi128_si64(cx0); - idx1 = _mm_cvtsi128_si64(cx1); - idx2 = _mm_cvtsi128_si64(cx2); - idx3 = _mm_cvtsi128_si64(cx3); - - uint64_t hi, lo, cl, ch; - - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - al0 += hi; - ah0 += lo; - ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; - al0 ^= cl; - ah0 ^= ch; - idx0 = al0; - bx00 = cx0; - if (!SOFT_AES) cx0 = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); - - cl = ((uint64_t*)&l1[idx1 & MASK])[0]; - ch = ((uint64_t*)&l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - al1 += hi; - ah1 += lo; - ((uint64_t*)&l1[idx1 & MASK])[0] = al1; - ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1; - al1 ^= cl; - ah1 ^= ch; - idx1 = al1; - bx10 = cx1; - if (!SOFT_AES) cx1 = _mm_load_si128(reinterpret_cast(&l1[idx1 & MASK])); - - cl = ((uint64_t*)&l2[idx2 & MASK])[0]; - ch = ((uint64_t*)&l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - al2 += hi; - ah2 += lo; - ((uint64_t*)&l2[idx2 & MASK])[0] = al2; - ((uint64_t*)&l2[idx2 & MASK])[1] = ah2 ^ tweak1_2_2; - al2 ^= cl; - ah2 ^= ch; - idx2 = al2; - bx20 = cx2; - if (!SOFT_AES) cx2 = _mm_load_si128(reinterpret_cast(&l2[idx2 & MASK])); - - cl = ((uint64_t*)&l3[idx3 & MASK])[0]; - ch = ((uint64_t*)&l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - al3 += hi; - ah3 += lo; - ((uint64_t*)&l3[idx3 & MASK])[0] = al3; - ((uint64_t*)&l3[idx3 & MASK])[1] = ah3 ^ tweak1_2_3; - al3 ^= cl; - ah3 ^= ch; - idx3 = al3; - bx30 = cx3; - if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast(&l3[idx3 & MASK])); - } + if (ALGO == Algorithm::CN_GR_0) cn_gr0_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_quad_mainloop_asm(ctx); # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_implode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif { - cn_implode_scratchpad(ctx[0]); - cn_implode_scratchpad(ctx[1]); - cn_implode_scratchpad(ctx[2]); - cn_implode_scratchpad(ctx[3]); + cn_implode_scratchpad(ctx[0]); + cn_implode_scratchpad(ctx[1]); + cn_implode_scratchpad(ctx[2]); + cn_implode_scratchpad(ctx[3]); } - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); + keccakf(reinterpret_cast(ctx[0]->state), 24); + keccakf(reinterpret_cast(ctx[1]->state), 24); + keccakf(reinterpret_cast(ctx[2]->state), 24); + keccakf(reinterpret_cast(ctx[3]->state), 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); @@ -1755,11 +1786,22 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si template inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { - const auto arch = Cpu::info()->arch(); - if ((arch >= ICpuInfo::ARCH_ZEN) && (arch <= ICpuInfo::ARCH_ZEN3)) { - if ((ALGO == Algorithm::CN_GR_0) || (ALGO == Algorithm::CN_GR_1) || (ALGO == Algorithm::CN_GR_2) || (ALGO == Algorithm::CN_GR_3) || (ALGO == Algorithm::CN_GR_4) || (ALGO == Algorithm::CN_GR_5)) { - cryptonight_quad_hash_zen(input, size, output, ctx, height); - return; + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_quad_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; } } @@ -1788,9 +1830,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_explode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif @@ -1851,9 +1893,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_implode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif diff --git a/src/crypto/cn/CryptoNight_x86_vaes.cpp b/src/crypto/cn/CryptoNight_x86_vaes.cpp index 45b26d3a9..41da111bc 100644 --- a/src/crypto/cn/CryptoNight_x86_vaes.cpp +++ b/src/crypto/cn/CryptoNight_x86_vaes.cpp @@ -162,12 +162,9 @@ static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256 namespace xmrig { -template -NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) +NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1); __m256i xin01, xin23, xin45, xin67; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -177,7 +174,7 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); - if (props.half_mem() && !ctx->first_half) { + if (half_mem && !ctx->first_half) { const __m256i* p = reinterpret_cast(ctx->save_state); xin01 = _mm256_loadu_si256(p + 0); xin23 = _mm256_loadu_si256(p + 1); @@ -226,7 +223,7 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) prefetch_ptr = output; } - if (props.half_mem() && ctx->first_half) { + if (half_mem && ctx->first_half) { __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state); _mm256_storeu_si256(p + 0, xin01); _mm256_storeu_si256(p + 1, xin23); @@ -238,12 +235,9 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) } -template -NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2) +NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1); __m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -257,7 +251,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); { - const bool b = props.half_mem() && !ctx1->first_half && !ctx2->first_half; + const bool b = half_mem && !ctx1->first_half && !ctx2->first_half; const __m128i* p1 = b ? reinterpret_cast(ctx1->save_state) : (input1 + 4); const __m128i* p2 = b ? reinterpret_cast(ctx2->save_state) : (input2 + 4); xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0); @@ -315,7 +309,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig prefetch_ptr2 = output2; } - if (props.half_mem() && ctx1->first_half && ctx2->first_half) { + if (half_mem && ctx1->first_half && ctx2->first_half) { __m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state); __m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state); _mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0); @@ -332,12 +326,9 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig } -template -NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) +NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1); __m256i xout01, xout23, xout45, xout67; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -353,11 +344,11 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) xout67 = _mm256_loadu_si256(output + 5); const __m256i* input_begin = input; - for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) { - if (props.half_mem() && (part == 1)) { + for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) { + if (half_mem && (part == 1)) { input = input_begin; ctx->first_half = false; - cn_explode_scratchpad_vaes(ctx); + cn_explode_scratchpad_vaes(ctx, memory, half_mem); } for (size_t i = 0; i < N;) { @@ -399,12 +390,9 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) } -template -NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2) +NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1); __m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -428,13 +416,13 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig const __m128i* input_begin1 = input1; const __m128i* input_begin2 = input2; - for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) { - if (props.half_mem() && (part == 1)) { + for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) { + if (half_mem && (part == 1)) { input1 = input_begin1; input2 = input_begin2; ctx1->first_half = false; ctx2->first_half = false; - cn_explode_scratchpad_vaes_double(ctx1, ctx2); + cn_explode_scratchpad_vaes_double(ctx1, ctx2, memory, half_mem); } for (size_t i = 0; i < N;) { @@ -487,44 +475,4 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig } -template -void VAES_Instance() -{ - cn_explode_scratchpad_vaes(nullptr); - cn_explode_scratchpad_vaes_double(nullptr, nullptr); - cn_implode_scratchpad_vaes(nullptr); - cn_implode_scratchpad_vaes_double(nullptr, nullptr); -} - - -void (*vaes_instances[])() = { - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, -}; - - } // xmrig diff --git a/src/crypto/cn/CryptoNight_x86_vaes.h b/src/crypto/cn/CryptoNight_x86_vaes.h index 475780b85..1c824ecfb 100644 --- a/src/crypto/cn/CryptoNight_x86_vaes.h +++ b/src/crypto/cn/CryptoNight_x86_vaes.h @@ -36,10 +36,10 @@ struct cryptonight_ctx; namespace xmrig { -template void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx); -template void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2); -template void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx); -template void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2); +void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem); +void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem); +void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem); +void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem); } // xmrig diff --git a/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc new file mode 100644 index 000000000..896c1a2cb --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc @@ -0,0 +1,132 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 32 + mov rdx, QWORD PTR [rcx] + mov r8, QWORD PTR [rcx+8] + mov r12d, 524288 + movaps XMMWORD PTR [rsp+16], xmm6 + mov rbx, QWORD PTR [rdx+32] + xor rbx, QWORD PTR [rdx] + mov rsi, QWORD PTR [rdx+40] + mov r10, rbx + xor rsi, QWORD PTR [rdx+8] + and r10d, 2097136 + mov rdi, QWORD PTR [r8+32] + xor rdi, QWORD PTR [r8] + movq xmm3, rbx + mov rbp, QWORD PTR [r8+40] + mov r9, rdi + xor rbp, QWORD PTR [r8+8] + movq xmm0, rsi + mov rcx, QWORD PTR [rdx+56] + and r9d, 2097136 + xor rcx, QWORD PTR [rdx+24] + movq xmm4, rdi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov r14, QWORD PTR [rdx+224] + mov r13, QWORD PTR [rdx+232] + mov r15, QWORD PTR [r8+224] + punpcklqdq xmm3, xmm0 + movq xmm0, rbp + movq xmm5, rax + punpcklqdq xmm4, xmm0 + mov rax, QWORD PTR [r8+48] + movq xmm0, rcx + xor rax, QWORD PTR [r8+16] + mov rcx, QWORD PTR [r8+56] + xor rcx, QWORD PTR [r8+24] + movdqu xmm1, XMMWORD PTR [r14+r10] + movq xmm6, rax + punpcklqdq xmm5, xmm0 + mov rax, QWORD PTR [rdx+240] + movq xmm0, rcx + movdqu xmm2, XMMWORD PTR [r15+r9] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r8+240] + mov QWORD PTR [rsp+8], rax + punpcklqdq xmm6, xmm0 + + ALIGN(64) +main_loop_cnv1_double: + aesenc xmm1, xmm3 + aesenc xmm2, xmm4 + movdqa xmm0, xmm1 + movq r11, xmm2 + pxor xmm0, xmm5 + movdqa xmm5, xmm1 + movq QWORD PTR [r14+r10], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + movdqa xmm0, xmm2 + shr rax, 24 + pxor xmm0, xmm6 + movdqa xmm6, xmm2 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + mov QWORD PTR [r14+r10+8], rax + movq QWORD PTR [r15+r9], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + shr rax, 24 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + movq rcx, xmm1 + mov QWORD PTR [r15+r9+8], rax + mov r9, rcx + and r9d, 2097136 + mov r10, QWORD PTR [r14+r9] + mov r8, QWORD PTR [r14+r9+8] + mov rax, r10 + mul rcx + add rsi, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [r14+r9], rbx + xor rax, rsi + mov QWORD PTR [r14+r9+8], rax + xor rsi, r8 + xor rbx, r10 + mov r8, r11 + and r8d, 2097136 + mov r10, rbx + and r10d, 2097136 + movq xmm3, rbx + pinsrq xmm3, rsi, 1 + mov r9, QWORD PTR [r15+r8] + mov rcx, QWORD PTR [r15+r8+8] + mov rax, r9 + movdqu xmm1, XMMWORD PTR [r14+r10] + mul r11 + add rbp, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r15+r8], rdi + xor rax, rbp + xor rdi, r9 + mov QWORD PTR [r15+r8+8], rax + mov r9, rdi + xor rbp, rcx + and r9d, 2097136 + movq xmm4, rdi + pinsrq xmm4, rbp, 1 + movdqu xmm2, XMMWORD PTR [r15+r9] + sub r12, 1 + jne main_loop_cnv1_double + + mov rbx, QWORD PTR [rsp+80] + mov rbp, QWORD PTR [rsp+88] + mov rsi, QWORD PTR [rsp+96] + movaps xmm6, XMMWORD PTR [rsp+16] + add rsp, 32 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc new file mode 100644 index 000000000..2b61ced08 --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc @@ -0,0 +1,263 @@ + mov rax, rsp + mov QWORD PTR [rax+8], rbx + mov QWORD PTR [rax+16], rbp + mov QWORD PTR [rax+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 144 + mov r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [rcx+16] + mov r11, QWORD PTR [rcx+24] + mov rbp, QWORD PTR [r8+224] + mov r13, QWORD PTR [r8+232] + mov r14, QWORD PTR [r9+224] + mov r15, QWORD PTR [r10+224] + mov r12, QWORD PTR [r11+224] + mov rcx, QWORD PTR [r8+40] + xor rcx, QWORD PTR [r8+8] + mov rbx, QWORD PTR [r8+32] + xor rbx, QWORD PTR [r8] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + movq xmm0, rcx + mov rcx, QWORD PTR [r9+40] + xor rcx, QWORD PTR [r9+8] + movq xmm1, rbx + movaps XMMWORD PTR [rax-56], xmm6 + movaps XMMWORD PTR [rax-72], xmm7 + movaps XMMWORD PTR [rax-88], xmm8 + movaps XMMWORD PTR [rax-104], xmm9 + movaps XMMWORD PTR [rax-120], xmm10 + movaps XMMWORD PTR [rsp+48], xmm11 + movaps XMMWORD PTR [rsp+32], xmm12 + and ebx, 2097136 + mov rsi, QWORD PTR [r10+32] + movq xmm2, rdi + mov rax, QWORD PTR [r8+240] + and edi, 2097136 + xor rsi, QWORD PTR [r10] + mov rdx, QWORD PTR [r8+56] + xor rdx, QWORD PTR [r8+24] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r9+240] + movq xmm3, rsi + mov QWORD PTR [rsp+8], rax + and esi, 2097136 + mov rax, QWORD PTR [r10+240] + punpcklqdq xmm1, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r10+40] + xor rcx, QWORD PTR [r10+8] + mov QWORD PTR [rsp+16], rax + mov rax, QWORD PTR [r11+240] + punpcklqdq xmm2, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+24], rax + mov rcx, QWORD PTR [r11+40] + xor rcx, QWORD PTR [r11+8] + mov rax, QWORD PTR [r11+32] + xor rax, QWORD PTR [r11] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r8+48] + xor rcx, QWORD PTR [r8+16] + movq xmm4, rax + and eax, 2097136 + punpcklqdq xmm4, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r9+56] + xor rdx, QWORD PTR [r9+24] + movq xmm5, rcx + mov rcx, QWORD PTR [r9+48] + xor rcx, QWORD PTR [r9+16] + punpcklqdq xmm5, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r10+56] + xor rdx, QWORD PTR [r10+24] + movq xmm6, rcx + mov rcx, QWORD PTR [r10+48] + xor rcx, QWORD PTR [r10+16] + punpcklqdq xmm6, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r11+56] + movq xmm7, rcx + punpcklqdq xmm7, xmm0 + xor rdx, QWORD PTR [r11+24] + mov rcx, QWORD PTR [r11+48] + xor rcx, QWORD PTR [r11+16] + mov r11d, 524288 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + movdqu xmm10, XMMWORD PTR [r14+rdi] + movq xmm0, rdx + movdqu xmm11, XMMWORD PTR [r15+rsi] + movdqu xmm12, XMMWORD PTR [r12+rax] + movq xmm8, rcx + punpcklqdq xmm8, xmm0 + + ALIGN(64) +main_loop_cnv1_quad: + aesenc xmm9, xmm1 + aesenc xmm10, xmm2 + aesenc xmm11, xmm3 + aesenc xmm12, xmm4 + movd ecx, xmm9 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+rbp] + movd ecx, xmm10 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r14] + movd ecx, xmm11 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r15] + movd ecx, xmm12 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r12] + movdqa xmm0, xmm9 + pxor xmm0, xmm5 + movdqa xmm5, xmm9 + movq QWORD PTR [rbp+rbx], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm10 + shr rcx, 24 + pxor xmm0, xmm6 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [rbp+rbx+8], rcx + movq rbx, xmm1 + movq QWORD PTR [r14+rdi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm11 + shr rcx, 24 + pxor xmm0, xmm7 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r14+rdi+8], rcx + movq rdi, xmm2 + movq QWORD PTR [r15+rsi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm12 + shr rcx, 24 + pxor xmm0, xmm8 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r15+rsi+8], rcx + movq QWORD PTR [r12+rax], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + shr rcx, 24 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r12+rax+8], rcx + movq rcx, xmm9 + mov r8, rcx + and r8d, 2097136 + mov r9, QWORD PTR [rbp+r8] + mov r10, QWORD PTR [rbp+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm1, 1 + add rcx, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [rbp+r8], rbx + xor rax, rcx + mov QWORD PTR [rbp+r8+8], rax + xor rcx, r10 + xor rbx, r9 + movq xmm1, rbx + and ebx, 2097136 + pinsrq xmm1, rcx, 1 + movq rcx, xmm10 + mov r8, rcx + and r8d, 2097136 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + mov r9, QWORD PTR [r14+r8] + mov r10, QWORD PTR [r14+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm2, 1 + add rcx, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r14+r8], rdi + xor rax, rcx + xor rdi, r9 + mov QWORD PTR [r14+r8+8], rax + xor rcx, r10 + movq xmm2, rdi + and edi, 2097136 + pinsrq xmm2, rcx, 1 + movq rcx, xmm11 + movq rsi, xmm3 + mov r8, rcx + and r8d, 2097136 + movdqa xmm6, xmm10 + movdqa xmm7, xmm11 + movdqa xmm8, xmm12 + movdqu xmm10, XMMWORD PTR [r14+rdi] + mov r9, QWORD PTR [r15+r8] + mov r10, QWORD PTR [r15+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm3, 1 + add rcx, rax + add rsi, rdx + mov rax, QWORD PTR [rsp+16] + xor rax, rcx + mov QWORD PTR [r15+r8], rsi + mov QWORD PTR [r15+r8+8], rax + xor rcx, r10 + xor rsi, r9 + movq xmm3, rsi + and esi, 2097136 + pinsrq xmm3, rcx, 1 + movq rcx, xmm12 + mov r8, rcx + and r8d, 2097136 + movdqu xmm11, XMMWORD PTR [r15+rsi] + mov r9, QWORD PTR [r12+r8] + mov r10, QWORD PTR [r12+r8+8] + mov rax, r9 + mul rcx + mov rcx, rax + movq rax, xmm4 + add rax, rdx + mov QWORD PTR [r12+r8], rax + xor rax, r9 + pextrq rdx, xmm4, 1 + add rdx, rcx + mov rcx, QWORD PTR [rsp+24] + xor rcx, rdx + xor rdx, r10 + movq xmm4, rax + mov QWORD PTR [r12+r8+8], rcx + and eax, 2097136 + pinsrq xmm4, rdx, 1 + movdqu xmm12, XMMWORD PTR [r12+rax] + sub r11, 1 + jne main_loop_cnv1_quad + + movaps xmm7, XMMWORD PTR [rsp+112] + lea r11, QWORD PTR [rsp+144] + mov rbx, QWORD PTR [r11+48] + mov rbp, QWORD PTR [r11+56] + mov rsi, QWORD PTR [r11+64] + movaps xmm6, XMMWORD PTR [r11-16] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm9, XMMWORD PTR [r11-64] + movaps xmm10, XMMWORD PTR [r11-80] + movaps xmm11, XMMWORD PTR [r11-96] + movaps xmm12, XMMWORD PTR [r11-112] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc new file mode 100644 index 000000000..62558c3c3 --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc @@ -0,0 +1,66 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r13 + push r14 + push r15 + mov rdx, QWORD PTR [rcx] + mov esi, 524288 + mov r11, QWORD PTR [rdx+32] + xor r11, QWORD PTR [rdx] + mov rdi, QWORD PTR [rdx+224] + mov rbx, QWORD PTR [rdx+40] + xor rbx, QWORD PTR [rdx+8] + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov rbp, QWORD PTR [rdx+240] + mov r14, QWORD PTR [rdx+232] + movq xmm2, rax + pinsrq xmm2, rcx, 1 + + ALIGN(64) +main_loop_cnv1_single: + mov r8, r11 + and r8d, 2097136 + movdqu xmm1, XMMWORD PTR [rdi+r8] + movq xmm0, r11 + pinsrq xmm0, rbx, 1 + aesenc xmm1, xmm0 + movq r15, xmm1 + mov r9, r15 + and r9d, 2097136 + movdqa xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm2, xmm1 + movq QWORD PTR [rdi+r8], xmm0 + pextrq rdx, xmm0, 1 + mov eax, edx + shr rax, 24 + mov ecx, DWORD PTR [r14+rax*4] + xor rcx, rdx + mov QWORD PTR [rdi+r8+8], rcx + mov r10, QWORD PTR [rdi+r9] + mov r8, QWORD PTR [rdi+r9+8] + mov rax, r10 + mul r15 + add rbx, rax + add r11, rdx + mov QWORD PTR [rdi+r9], r11 + mov rax, rbx + xor rbx, r8 + xor r11, r10 + xor rax, rbp + mov QWORD PTR [rdi+r9+8], rax + sub rsi, 1 + jne main_loop_cnv1_single + + pop r15 + pop r14 + pop r13 + mov rbx, QWORD PTR [rsp+8] + mov rbp, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + mov rdi, QWORD PTR [rsp+32] diff --git a/src/crypto/cn/asm/cn_main_loop.S b/src/crypto/cn/asm/cn_main_loop.S index 0dfd3ee27..527e20e12 100644 --- a/src/crypto/cn/asm/cn_main_loop.S +++ b/src/crypto/cn/asm/cn_main_loop.S @@ -11,6 +11,9 @@ # define FN_PREFIX(fn) fn .section .text #endif +.global FN_PREFIX(cnv1_single_mainloop_asm) +.global FN_PREFIX(cnv1_double_mainloop_asm) +.global FN_PREFIX(cnv1_quad_mainloop_asm) .global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) @@ -19,6 +22,33 @@ .global FN_PREFIX(cnv2_rwz_double_mainloop_asm) .global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm) +ALIGN(64) +FN_PREFIX(cnv1_single_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_single_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv1_double_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_double_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv1_quad_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_quad_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 diff --git a/src/crypto/cn/asm/cn_main_loop.asm b/src/crypto/cn/asm/cn_main_loop.asm index 97ae5299b..0979580ab 100644 --- a/src/crypto/cn/asm/cn_main_loop.asm +++ b/src/crypto/cn/asm/cn_main_loop.asm @@ -1,4 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_single_mainloop_asm +PUBLIC cnv1_double_mainloop_asm +PUBLIC cnv1_quad_mainloop_asm PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm @@ -6,6 +9,27 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_rwz_mainloop_asm PUBLIC cnv2_rwz_double_mainloop_asm +ALIGN(64) +cnv1_single_mainloop_asm PROC + INCLUDE cn1/cnv1_single_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_single_mainloop_asm ENDP + +ALIGN(64) +cnv1_double_mainloop_asm PROC + INCLUDE cn1/cnv1_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_double_mainloop_asm ENDP + +ALIGN(64) +cnv1_quad_mainloop_asm PROC + INCLUDE cn1/cnv1_quad_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_quad_mainloop_asm ENDP + ALIGN(64) cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc new file mode 100644 index 000000000..ac0cdb21a --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc @@ -0,0 +1,132 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 32 + mov rdx, QWORD PTR [rcx] + mov r8, QWORD PTR [rcx+8] + mov r12d, 524288 + movaps XMMWORD PTR [rsp+16], xmm6 + mov rbx, QWORD PTR [rdx+32] + xor rbx, QWORD PTR [rdx] + mov rsi, QWORD PTR [rdx+40] + mov r10, rbx + xor rsi, QWORD PTR [rdx+8] + and r10d, 2097136 + mov rdi, QWORD PTR [r8+32] + xor rdi, QWORD PTR [r8] + movd xmm3, rbx + mov rbp, QWORD PTR [r8+40] + mov r9, rdi + xor rbp, QWORD PTR [r8+8] + movd xmm0, rsi + mov rcx, QWORD PTR [rdx+56] + and r9d, 2097136 + xor rcx, QWORD PTR [rdx+24] + movd xmm4, rdi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov r14, QWORD PTR [rdx+224] + mov r13, QWORD PTR [rdx+232] + mov r15, QWORD PTR [r8+224] + punpcklqdq xmm3, xmm0 + movd xmm0, rbp + movd xmm5, rax + punpcklqdq xmm4, xmm0 + mov rax, QWORD PTR [r8+48] + movd xmm0, rcx + xor rax, QWORD PTR [r8+16] + mov rcx, QWORD PTR [r8+56] + xor rcx, QWORD PTR [r8+24] + movdqu xmm1, XMMWORD PTR [r14+r10] + movd xmm6, rax + punpcklqdq xmm5, xmm0 + mov rax, QWORD PTR [rdx+240] + movd xmm0, rcx + movdqu xmm2, XMMWORD PTR [r15+r9] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r8+240] + mov QWORD PTR [rsp+8], rax + punpcklqdq xmm6, xmm0 + + ALIGN(64) +main_loop_cnv1_double: + aesenc xmm1, xmm3 + aesenc xmm2, xmm4 + movdqa xmm0, xmm1 + movd r11, xmm2 + pxor xmm0, xmm5 + movdqa xmm5, xmm1 + movd QWORD PTR [r14+r10], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + movdqa xmm0, xmm2 + shr rax, 24 + pxor xmm0, xmm6 + movdqa xmm6, xmm2 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + mov QWORD PTR [r14+r10+8], rax + movd QWORD PTR [r15+r9], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + shr rax, 24 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + movd rcx, xmm1 + mov QWORD PTR [r15+r9+8], rax + mov r9, rcx + and r9d, 2097136 + mov r10, QWORD PTR [r14+r9] + mov r8, QWORD PTR [r14+r9+8] + mov rax, r10 + mul rcx + add rsi, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [r14+r9], rbx + xor rax, rsi + mov QWORD PTR [r14+r9+8], rax + xor rsi, r8 + xor rbx, r10 + mov r8, r11 + and r8d, 2097136 + mov r10, rbx + and r10d, 2097136 + movd xmm3, rbx + pinsrq xmm3, rsi, 1 + mov r9, QWORD PTR [r15+r8] + mov rcx, QWORD PTR [r15+r8+8] + mov rax, r9 + movdqu xmm1, XMMWORD PTR [r14+r10] + mul r11 + add rbp, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r15+r8], rdi + xor rax, rbp + xor rdi, r9 + mov QWORD PTR [r15+r8+8], rax + mov r9, rdi + xor rbp, rcx + and r9d, 2097136 + movd xmm4, rdi + pinsrq xmm4, rbp, 1 + movdqu xmm2, XMMWORD PTR [r15+r9] + sub r12, 1 + jne main_loop_cnv1_double + + mov rbx, QWORD PTR [rsp+80] + mov rbp, QWORD PTR [rsp+88] + mov rsi, QWORD PTR [rsp+96] + movaps xmm6, XMMWORD PTR [rsp+16] + add rsp, 32 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc new file mode 100644 index 000000000..78d145a14 --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc @@ -0,0 +1,263 @@ + mov rax, rsp + mov QWORD PTR [rax+8], rbx + mov QWORD PTR [rax+16], rbp + mov QWORD PTR [rax+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 144 + mov r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [rcx+16] + mov r11, QWORD PTR [rcx+24] + mov rbp, QWORD PTR [r8+224] + mov r13, QWORD PTR [r8+232] + mov r14, QWORD PTR [r9+224] + mov r15, QWORD PTR [r10+224] + mov r12, QWORD PTR [r11+224] + mov rcx, QWORD PTR [r8+40] + xor rcx, QWORD PTR [r8+8] + mov rbx, QWORD PTR [r8+32] + xor rbx, QWORD PTR [r8] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + movd xmm0, rcx + mov rcx, QWORD PTR [r9+40] + xor rcx, QWORD PTR [r9+8] + movd xmm1, rbx + movaps XMMWORD PTR [rax-56], xmm6 + movaps XMMWORD PTR [rax-72], xmm7 + movaps XMMWORD PTR [rax-88], xmm8 + movaps XMMWORD PTR [rax-104], xmm9 + movaps XMMWORD PTR [rax-120], xmm10 + movaps XMMWORD PTR [rsp+48], xmm11 + movaps XMMWORD PTR [rsp+32], xmm12 + and ebx, 2097136 + mov rsi, QWORD PTR [r10+32] + movd xmm2, rdi + mov rax, QWORD PTR [r8+240] + and edi, 2097136 + xor rsi, QWORD PTR [r10] + mov rdx, QWORD PTR [r8+56] + xor rdx, QWORD PTR [r8+24] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r9+240] + movd xmm3, rsi + mov QWORD PTR [rsp+8], rax + and esi, 2097136 + mov rax, QWORD PTR [r10+240] + punpcklqdq xmm1, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r10+40] + xor rcx, QWORD PTR [r10+8] + mov QWORD PTR [rsp+16], rax + mov rax, QWORD PTR [r11+240] + punpcklqdq xmm2, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+24], rax + mov rcx, QWORD PTR [r11+40] + xor rcx, QWORD PTR [r11+8] + mov rax, QWORD PTR [r11+32] + xor rax, QWORD PTR [r11] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r8+48] + xor rcx, QWORD PTR [r8+16] + movd xmm4, rax + and eax, 2097136 + punpcklqdq xmm4, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r9+56] + xor rdx, QWORD PTR [r9+24] + movd xmm5, rcx + mov rcx, QWORD PTR [r9+48] + xor rcx, QWORD PTR [r9+16] + punpcklqdq xmm5, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r10+56] + xor rdx, QWORD PTR [r10+24] + movd xmm6, rcx + mov rcx, QWORD PTR [r10+48] + xor rcx, QWORD PTR [r10+16] + punpcklqdq xmm6, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r11+56] + movd xmm7, rcx + punpcklqdq xmm7, xmm0 + xor rdx, QWORD PTR [r11+24] + mov rcx, QWORD PTR [r11+48] + xor rcx, QWORD PTR [r11+16] + mov r11d, 524288 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + movdqu xmm10, XMMWORD PTR [r14+rdi] + movd xmm0, rdx + movdqu xmm11, XMMWORD PTR [r15+rsi] + movdqu xmm12, XMMWORD PTR [r12+rax] + movd xmm8, rcx + punpcklqdq xmm8, xmm0 + + ALIGN(64) +main_loop_cnv1_quad: + aesenc xmm9, xmm1 + aesenc xmm10, xmm2 + aesenc xmm11, xmm3 + aesenc xmm12, xmm4 + movd ecx, xmm9 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+rbp] + movd ecx, xmm10 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r14] + movd ecx, xmm11 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r15] + movd ecx, xmm12 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r12] + movdqa xmm0, xmm9 + pxor xmm0, xmm5 + movdqa xmm5, xmm9 + movd QWORD PTR [rbp+rbx], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm10 + shr rcx, 24 + pxor xmm0, xmm6 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [rbp+rbx+8], rcx + movd rbx, xmm1 + movd QWORD PTR [r14+rdi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm11 + shr rcx, 24 + pxor xmm0, xmm7 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r14+rdi+8], rcx + movd rdi, xmm2 + movd QWORD PTR [r15+rsi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm12 + shr rcx, 24 + pxor xmm0, xmm8 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r15+rsi+8], rcx + movd QWORD PTR [r12+rax], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + shr rcx, 24 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r12+rax+8], rcx + movd rcx, xmm9 + mov r8, rcx + and r8d, 2097136 + mov r9, QWORD PTR [rbp+r8] + mov r10, QWORD PTR [rbp+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm1, 1 + add rcx, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [rbp+r8], rbx + xor rax, rcx + mov QWORD PTR [rbp+r8+8], rax + xor rcx, r10 + xor rbx, r9 + movd xmm1, rbx + and ebx, 2097136 + pinsrq xmm1, rcx, 1 + movd rcx, xmm10 + mov r8, rcx + and r8d, 2097136 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + mov r9, QWORD PTR [r14+r8] + mov r10, QWORD PTR [r14+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm2, 1 + add rcx, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r14+r8], rdi + xor rax, rcx + xor rdi, r9 + mov QWORD PTR [r14+r8+8], rax + xor rcx, r10 + movd xmm2, rdi + and edi, 2097136 + pinsrq xmm2, rcx, 1 + movd rcx, xmm11 + movd rsi, xmm3 + mov r8, rcx + and r8d, 2097136 + movdqa xmm6, xmm10 + movdqa xmm7, xmm11 + movdqa xmm8, xmm12 + movdqu xmm10, XMMWORD PTR [r14+rdi] + mov r9, QWORD PTR [r15+r8] + mov r10, QWORD PTR [r15+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm3, 1 + add rcx, rax + add rsi, rdx + mov rax, QWORD PTR [rsp+16] + xor rax, rcx + mov QWORD PTR [r15+r8], rsi + mov QWORD PTR [r15+r8+8], rax + xor rcx, r10 + xor rsi, r9 + movd xmm3, rsi + and esi, 2097136 + pinsrq xmm3, rcx, 1 + movd rcx, xmm12 + mov r8, rcx + and r8d, 2097136 + movdqu xmm11, XMMWORD PTR [r15+rsi] + mov r9, QWORD PTR [r12+r8] + mov r10, QWORD PTR [r12+r8+8] + mov rax, r9 + mul rcx + mov rcx, rax + movd rax, xmm4 + add rax, rdx + mov QWORD PTR [r12+r8], rax + xor rax, r9 + pextrq rdx, xmm4, 1 + add rdx, rcx + mov rcx, QWORD PTR [rsp+24] + xor rcx, rdx + xor rdx, r10 + movd xmm4, rax + mov QWORD PTR [r12+r8+8], rcx + and eax, 2097136 + pinsrq xmm4, rdx, 1 + movdqu xmm12, XMMWORD PTR [r12+rax] + sub r11, 1 + jne main_loop_cnv1_quad + + movaps xmm7, XMMWORD PTR [rsp+112] + lea r11, QWORD PTR [rsp+144] + mov rbx, QWORD PTR [r11+48] + mov rbp, QWORD PTR [r11+56] + mov rsi, QWORD PTR [r11+64] + movaps xmm6, XMMWORD PTR [r11-16] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm9, XMMWORD PTR [r11-64] + movaps xmm10, XMMWORD PTR [r11-80] + movaps xmm11, XMMWORD PTR [r11-96] + movaps xmm12, XMMWORD PTR [r11-112] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc new file mode 100644 index 000000000..37413f23d --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc @@ -0,0 +1,66 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r13 + push r14 + push r15 + mov rdx, QWORD PTR [rcx] + mov esi, 524288 + mov r11, QWORD PTR [rdx+32] + xor r11, QWORD PTR [rdx] + mov rdi, QWORD PTR [rdx+224] + mov rbx, QWORD PTR [rdx+40] + xor rbx, QWORD PTR [rdx+8] + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov rbp, QWORD PTR [rdx+240] + mov r14, QWORD PTR [rdx+232] + movd xmm2, rax + pinsrq xmm2, rcx, 1 + + ALIGN(64) +main_loop_cnv1_single: + mov r8, r11 + and r8d, 2097136 + movdqu xmm1, XMMWORD PTR [rdi+r8] + movd xmm0, r11 + pinsrq xmm0, rbx, 1 + aesenc xmm1, xmm0 + movd r15, xmm1 + mov r9, r15 + and r9d, 2097136 + movdqa xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm2, xmm1 + movd QWORD PTR [rdi+r8], xmm0 + pextrq rdx, xmm0, 1 + mov eax, edx + shr rax, 24 + mov ecx, DWORD PTR [r14+rax*4] + xor rcx, rdx + mov QWORD PTR [rdi+r8+8], rcx + mov r10, QWORD PTR [rdi+r9] + mov r8, QWORD PTR [rdi+r9+8] + mov rax, r10 + mul r15 + add rbx, rax + add r11, rdx + mov QWORD PTR [rdi+r9], r11 + mov rax, rbx + xor rbx, r8 + xor r11, r10 + xor rax, rbp + mov QWORD PTR [rdi+r9+8], rax + sub rsi, 1 + jne main_loop_cnv1_single + + pop r15 + pop r14 + pop r13 + mov rbx, QWORD PTR [rsp+8] + mov rbp, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + mov rdi, QWORD PTR [rsp+32] diff --git a/src/crypto/cn/asm/win64/cn_main_loop.S b/src/crypto/cn/asm/win64/cn_main_loop.S index 9361469a4..9a227f0af 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.S +++ b/src/crypto/cn/asm/win64/cn_main_loop.S @@ -1,6 +1,9 @@ #define ALIGN(x) .align 64 .intel_syntax noprefix .section .text +.global cnv1_single_mainloop_asm +.global cnv1_double_mainloop_asm +.global cnv1_quad_mainloop_asm .global cnv2_mainloop_ivybridge_asm .global cnv2_mainloop_ryzen_asm .global cnv2_mainloop_bulldozer_asm @@ -9,6 +12,24 @@ .global cnv2_rwz_double_mainloop_asm .global cnv2_upx_double_mainloop_zen3_asm +ALIGN(64) +cnv1_single_mainloop_asm: + #include "../cn1/cnv1_single_main_loop.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv1_double_mainloop_asm: + #include "../cn1/cnv1_double_main_loop.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv1_quad_mainloop_asm: + #include "../cn1/cnv1_quad_main_loop.inc" + ret 0 + mov eax, 3735929054 + ALIGN(64) cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" diff --git a/src/crypto/cn/asm/win64/cn_main_loop.asm b/src/crypto/cn/asm/win64/cn_main_loop.asm index 7f83e6827..0979580ab 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.asm +++ b/src/crypto/cn/asm/win64/cn_main_loop.asm @@ -1,4 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_single_mainloop_asm +PUBLIC cnv1_double_mainloop_asm +PUBLIC cnv1_quad_mainloop_asm PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm @@ -6,28 +9,49 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_rwz_mainloop_asm PUBLIC cnv2_rwz_double_mainloop_asm -ALIGN 64 +ALIGN(64) +cnv1_single_mainloop_asm PROC + INCLUDE cn1/cnv1_single_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_single_mainloop_asm ENDP + +ALIGN(64) +cnv1_double_mainloop_asm PROC + INCLUDE cn1/cnv1_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_double_mainloop_asm ENDP + +ALIGN(64) +cnv1_quad_mainloop_asm PROC + INCLUDE cn1/cnv1_quad_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_quad_mainloop_asm ENDP + +ALIGN(64) cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 mov eax, 3735929054 cnv2_mainloop_ivybridge_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 mov eax, 3735929054 cnv2_mainloop_ryzen_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 mov eax, 3735929054 cnv2_mainloop_bulldozer_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 diff --git a/src/crypto/ghostrider/CMakeLists.txt b/src/crypto/ghostrider/CMakeLists.txt index f76fe649e..a80396b80 100644 --- a/src/crypto/ghostrider/CMakeLists.txt +++ b/src/crypto/ghostrider/CMakeLists.txt @@ -42,14 +42,40 @@ set(SOURCES ghostrider.cpp ) -if (CMAKE_C_COMPILER_ID MATCHES GNU) - # gcc 11.2.0 crashes with -ftree-vrp - set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Ofast -fno-tree-vrp") - - # gcc 11.2.0 creates incorrect code with -O3 - set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-O2") - - set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Ofast -Wno-unused-const-variable") +if (CMAKE_C_COMPILER_ID MATCHES MSVC) + set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") +elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Os -fno-tree-vrp") + set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Os -Wno-unused-const-variable") + set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "-Os") endif() include_directories(.)