diff --git a/scripts/benchmark_10M.cmd b/scripts/benchmark_10M.cmd index b67a82ecb..dbbcc78c2 100644 --- a/scripts/benchmark_10M.cmd +++ b/scripts/benchmark_10M.cmd @@ -1,3 +1,5 @@ @echo off +:: Switch to this script's own drive and directory so xmrig.exe is found wherever the script is launched from +cd /d "%~dp0" xmrig.exe --bench=10M --submit pause diff --git a/scripts/benchmark_1M.cmd b/scripts/benchmark_1M.cmd index 0a0d95dbf..5d2166d0c 100644 --- a/scripts/benchmark_1M.cmd +++ b/scripts/benchmark_1M.cmd @@ -1,3 +1,5 @@ @echo off +:: Switch to this script's own drive and directory so xmrig.exe is found wherever the script is launched from +cd /d "%~dp0" xmrig.exe --bench=1M --submit pause diff --git a/scripts/pool_mine_example.cmd b/scripts/pool_mine_example.cmd index 27749ef61..6e35c913f 100644 --- a/scripts/pool_mine_example.cmd +++ b/scripts/pool_mine_example.cmd @@ -15,5 +15,7 @@ :: Choose pools outside of top 5 to help Monero network be more decentralized! :: Smaller pools also often have smaller fees/payout limits. +:: Switch to this script's own drive and directory so xmrig.exe is found wherever the script is launched from +cd /d "%~dp0" xmrig.exe -o pool.hashvault.pro:3333 -u 48edfHu7V9Z84YzzMa6fUueoELZ9ZRXq9VetWzYGzKt52XU5xvqgzYnDK9URnRoJMk1j8nLwEVsaSWJ4fhdUyZijBGUicoD -p x pause diff --git a/scripts/solo_mine_example.cmd b/scripts/solo_mine_example.cmd index 151ecc5d2..c925b36d9 100644 --- a/scripts/solo_mine_example.cmd +++ b/scripts/solo_mine_example.cmd @@ -11,5 +11,7 @@ :: Mining solo is the best way to help Monero network be more decentralized! :: But you will only get a payout when you find a block which can take more than a year for a single low-end PC. +:: Switch to this script's own drive and directory so xmrig.exe is found wherever the script is launched from +cd /d "%~dp0" xmrig.exe -o node.xmr.to:18081 -a rx/0 -u 48edfHu7V9Z84YzzMa6fUueoELZ9ZRXq9VetWzYGzKt52XU5xvqgzYnDK9URnRoJMk1j8nLwEVsaSWJ4fhdUyZijBGUicoD --daemon pause diff --git a/src/backend/common/Workers.cpp b/src/backend/common/Workers.cpp index 9282a53c1..a70affe66 100644 --- a/src/backend/common/Workers.cpp +++ b/src/backend/common/Workers.cpp @@ -214,13 +214,6 @@ void xmrig::Workers::start(const std::vector &data, bool sleep) for (auto worker : m_workers) { worker->start(Workers::onReady); - - // This sleep is important for optimal caching! 
- // Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads - // Sub-optimal caching can result in up to 0.5% hashrate penalty - if (sleep) { - std::this_thread::sleep_for(std::chrono::milliseconds(20)); - } } } diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index 7c08c103f..b772a92c4 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -40,6 +40,14 @@ public: VENDOR_AMD }; + enum Arch : uint32_t { + ARCH_UNKNOWN, + ARCH_ZEN, + ARCH_ZEN_PLUS, + ARCH_ZEN2, + ARCH_ZEN3 + }; + enum MsrMod : uint32_t { MSR_MOD_NONE, MSR_MOD_RYZEN_17H, @@ -53,6 +61,7 @@ public: enum Flag : uint32_t { FLAG_AES, + FLAG_AVX, FLAG_AVX2, FLAG_AVX512F, FLAG_BMI2, @@ -80,9 +89,11 @@ public: virtual Assembly::Id assembly() const = 0; virtual bool has(Flag feature) const = 0; virtual bool hasAES() const = 0; + virtual bool hasAVX() const = 0; virtual bool hasAVX2() const = 0; virtual bool hasBMI2() const = 0; virtual bool hasOneGbPages() const = 0; + virtual bool hasXOP() const = 0; virtual bool hasCatL3() const = 0; virtual bool isVM() const = 0; virtual const char *backend() const = 0; @@ -97,6 +108,7 @@ public: virtual size_t packages() const = 0; virtual size_t threads() const = 0; virtual Vendor vendor() const = 0; + virtual Arch arch() const = 0; virtual bool jccErratum() const = 0; }; diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index 177168148..2a4dc829d 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -52,8 +52,8 @@ namespace xmrig { -constexpr size_t kCpuFlagsSize = 13; -static const std::array flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" }; +constexpr size_t kCpuFlagsSize = 14; +static const std::array flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", 
"osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" }; static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch"); @@ -134,11 +134,12 @@ static inline uint64_t xgetbv() #endif } -static inline bool has_xcr_avx2() { return (xgetbv() & 0x06) == 0x06; } +static inline bool has_xcr_avx() { return (xgetbv() & 0x06) == 0x06; } static inline bool has_xcr_avx512() { return (xgetbv() & 0xE6) == 0xE6; } static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 27); } static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); } -static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx2(); } +static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); } +static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); } static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); } static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); } @@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : cpu_brand_string(m_brand); m_flags.set(FLAG_AES, has_aes_ni()); + m_flags.set(FLAG_AVX, has_avx()); m_flags.set(FLAG_AVX2, has_avx2()); m_flags.set(FLAG_AVX512F, has_avx512f()); m_flags.set(FLAG_BMI2, has_bmi2()); @@ -215,9 +217,27 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : switch (m_family) { case 0x17: m_msrMod = MSR_MOD_RYZEN_17H; + switch (m_model) { + case 1: + case 17: + case 32: + m_arch = ARCH_ZEN; + break; + case 8: + case 24: + m_arch = ARCH_ZEN_PLUS; + break; + case 49: + case 96: + case 113: + case 144: + m_arch = ARCH_ZEN2; + break; + } break; case 0x19: + m_arch = ARCH_ZEN3; m_msrMod = 
MSR_MOD_RYZEN_19H; break; diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 5504d07b1..edf119a2e 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -48,9 +48,11 @@ protected: inline Assembly::Id assembly() const override { return m_assembly; } inline bool has(Flag flag) const override { return m_flags.test(flag); } inline bool hasAES() const override { return has(FLAG_AES); } + inline bool hasAVX() const override { return has(FLAG_AVX); } inline bool hasAVX2() const override { return has(FLAG_AVX2); } inline bool hasBMI2() const override { return has(FLAG_BMI2); } inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); } + inline bool hasXOP() const override { return has(FLAG_XOP); } inline bool hasCatL3() const override { return has(FLAG_CAT_L3); } inline bool isVM() const override { return has(FLAG_VM); } inline const char *brand() const override { return m_brand; } @@ -62,12 +64,14 @@ protected: inline size_t packages() const override { return 1; } inline size_t threads() const override { return m_threads; } inline Vendor vendor() const override { return m_vendor; } + inline Arch arch() const override { return m_arch; } inline bool jccErratum() const override { return m_jccErratum; } protected: char m_brand[64 + 6]{}; size_t m_threads; Vendor m_vendor = VENDOR_UNKNOWN; + Arch m_arch = ARCH_UNKNOWN; bool m_jccErratum = false; private: diff --git a/src/config.json b/src/config.json index 68fb439f0..aad273268 100644 --- a/src/config.json +++ b/src/config.json @@ -16,6 +16,7 @@ "title": true, "randomx": { "init": -1, + "init-avx2": -1, "mode": "auto", "1gb-pages": false, "rdmsr": true, diff --git a/src/core/config/Config_default.h b/src/core/config/Config_default.h index 6c8106ca7..94cb88d67 100644 --- a/src/core/config/Config_default.h +++ b/src/core/config/Config_default.h @@ -50,6 +50,7 @@ R"===( "colors": true, "randomx": { "init": -1, + 
"init-avx2": -1, "mode": "auto", "1gb-pages": false, "rdmsr": true, diff --git a/src/crypto/randomx/asm/program_sshash_avx2_constants.inc b/src/crypto/randomx/asm/program_sshash_avx2_constants.inc new file mode 100644 index 000000000..e2e5e0b12 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_constants.inc @@ -0,0 +1,28 @@ +r0_avx2_increments: + db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0 +mul_hi_avx2_data: + db 0,0,0,0,1,0,0,0 +r0_avx2_mul: + ;#/ 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_avx2_add: + ;#/ 9298411001130361340 + db 252, 161, 245, 89, 138, 151, 10, 129 +r2_avx2_add: + ;#/ 12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_avx2_add: + ;#/ 9306329213124626780 + db 92, 73, 34, 191, 28, 185, 38, 129 +r4_avx2_add: + ;#/ 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_avx2_add: + ;#/ 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_avx2_add: + ;#/ 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_avx2_add: + ;#/ 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc b/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc new file mode 100644 index 000000000..88204d996 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc @@ -0,0 +1,31 @@ + add rsp, 32 + pop r9 + + movdqu xmm0, xmmword ptr [rsp] + movdqu xmm1, xmmword ptr [rsp + 16] + movdqu xmm2, xmmword ptr [rsp + 32] + movdqu xmm3, xmmword ptr [rsp + 48] + movdqu xmm4, xmmword ptr [rsp + 64] + movdqu xmm5, xmmword ptr [rsp + 80] + movdqu xmm6, xmmword ptr [rsp + 96] + movdqu xmm7, xmmword ptr [rsp + 112] + movdqu xmm8, xmmword ptr [rsp + 128] + movdqu xmm9, xmmword ptr [rsp + 144] + movdqu xmm10, xmmword ptr [rsp + 160] + movdqu xmm11, xmmword ptr [rsp + 176] + movdqu xmm12, xmmword ptr [rsp + 192] + movdqu xmm13, xmmword ptr [rsp + 208] + movdqu xmm14, xmmword 
ptr [rsp + 224] + movdqu xmm15, xmmword ptr [rsp + 240] + vzeroupper + add rsp, 256 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret diff --git a/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc b/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc new file mode 100644 index 000000000..8055cf284 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc @@ -0,0 +1,37 @@ + ;# prefetch RandomX dataset lines + prefetchnta byte ptr [rsi] + prefetchnta byte ptr [rsi+64] + prefetchnta byte ptr [rsi+128] + prefetchnta byte ptr [rsi+192] + prefetchnta byte ptr [rsi+256] + + ;# prefetch RandomX cache lines + mov rbx, rbp + and rbx, RANDOMX_CACHE_MASK + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rax, [rbp+1] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp], rax + lea rax, [rbp+2] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+8], rax + lea rax, [rbp+3] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+16], rax + lea rax, [rbp+4] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+24], rax diff --git a/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc b/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc new file mode 100644 index 000000000..46dd469d7 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc @@ -0,0 +1,38 @@ + mov qword ptr [rsi+0], r8 + vpunpcklqdq ymm8, ymm0, ymm1 + mov qword ptr [rsi+8], r9 + vpunpcklqdq ymm9, ymm2, ymm3 + mov qword ptr [rsi+16], r10 + vpunpcklqdq ymm10, ymm4, ymm5 + mov qword ptr [rsi+24], r11 + vpunpcklqdq ymm11, ymm6, ymm7 + mov qword ptr [rsi+32], r12 + vpunpckhqdq ymm12, ymm0, ymm1 + mov qword ptr [rsi+40], r13 + vpunpckhqdq ymm13, ymm2, ymm3 + mov qword ptr [rsi+48], r14 + vpunpckhqdq ymm14, ymm4, ymm5 + mov qword ptr 
[rsi+56], r15 + vpunpckhqdq ymm15, ymm6, ymm7 + + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm10, ymm11, 32 + vmovdqu ymmword ptr [rsi+64], ymm0 + vmovdqu ymmword ptr [rsi+96], ymm1 + vperm2i128 ymm2, ymm12, ymm13, 32 + vperm2i128 ymm3, ymm14, ymm15, 32 + vmovdqu ymmword ptr [rsi+128], ymm2 + vmovdqu ymmword ptr [rsi+160], ymm3 + vperm2i128 ymm4, ymm8, ymm9, 49 + vperm2i128 ymm5, ymm10, ymm11, 49 + vmovdqu ymmword ptr [rsi+192], ymm4 + vmovdqu ymmword ptr [rsi+224], ymm5 + vperm2i128 ymm6, ymm12, ymm13, 49 + vperm2i128 ymm7, ymm14, ymm15, 49 + vmovdqu ymmword ptr [rsi+256], ymm6 + vmovdqu ymmword ptr [rsi+288], ymm7 + + add rbp, 5 + add rsi, 320 + cmp rbp, qword ptr [rsp+32] + db 15, 130, 0, 0, 0, 0 ;# jb rel32 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc b/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc new file mode 100644 index 000000000..a551ffa45 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc @@ -0,0 +1,27 @@ + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + + ;# save all XMM registers just to be safe for all calling conventions + sub rsp, 256 + movdqu xmmword ptr [rsp], xmm0 + movdqu xmmword ptr [rsp + 16], xmm1 + movdqu xmmword ptr [rsp + 32], xmm2 + movdqu xmmword ptr [rsp + 48], xmm3 + movdqu xmmword ptr [rsp + 64], xmm4 + movdqu xmmword ptr [rsp + 80], xmm5 + movdqu xmmword ptr [rsp + 96], xmm6 + movdqu xmmword ptr [rsp + 112], xmm7 + movdqu xmmword ptr [rsp + 128], xmm8 + movdqu xmmword ptr [rsp + 144], xmm9 + movdqu xmmword ptr [rsp + 160], xmm10 + movdqu xmmword ptr [rsp + 176], xmm11 + movdqu xmmword ptr [rsp + 192], xmm12 + movdqu xmmword ptr [rsp + 208], xmm13 + movdqu xmmword ptr [rsp + 224], xmm14 + movdqu xmmword ptr [rsp + 240], xmm15 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc b/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc new file mode 100644 index 000000000..bed78094a --- 
/dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc @@ -0,0 +1,50 @@ + sub rsp, 40 + mov [rsp], rbx + vmovdqu ymmword ptr [rsp+8], ymm14 + + mov rax, [rsp+40] + mov rbx, [rsp+48] + mov rcx, [rsp+56] + mov rdx, [rsp+64] + + vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1] + vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2] + vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3] + vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4] + + vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2] + vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4] + vpxor ymm0, ymm0, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4] + vpxor ymm2, ymm2, ymm14 + + vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2] + vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4] + vpxor ymm1, ymm1, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4] + vpxor ymm3, ymm3, ymm14 + + vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1] + vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2] + vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3] + vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4] + + vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2] + vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4] + vpxor ymm4, ymm4, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4] + vpxor ymm6, ymm6, ymm14 + + vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2] + vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], 
r7[3], r7[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4] + vpxor ymm5, ymm5, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4] + vpxor ymm7, ymm7, ymm14 + + mov rbx, [rsp] + vmovdqu ymm14, ymmword ptr [rsp+8] + add rsp, 40 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc b/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc new file mode 100644 index 000000000..072de8646 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc @@ -0,0 +1,29 @@ + vmovdqu ymmword ptr [rsp], ymm0 + + mov rax, [rsp] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+8] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+8], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+16] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+16], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+24] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+24], rax + prefetchnta byte ptr [rax] diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index 50e39c508..f98e36f6d 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -36,12 +36,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/virtual_memory.hpp" static bool hugePagesJIT = false; +static int optimizedDatasetInit = -1; void randomx_set_huge_pages_jit(bool hugePages) { hugePagesJIT = hugePages; } +void randomx_set_optimized_dataset_init(int value) +{ + optimizedDatasetInit = value; +} + namespace ARMV8A { constexpr uint32_t B = 0x14000000; @@ -98,7 +104,7 @@ static size_t CalcDatasetItemSize() constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; -JitCompilerA64::JitCompilerA64(bool hugePagesEnable) : +JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) : hugePages(hugePagesJIT && hugePagesEnable), literalPos(ImulRcpLiteralsEnd) { diff --git a/src/crypto/randomx/jit_compiler_a64.hpp b/src/crypto/randomx/jit_compiler_a64.hpp index faa2ac2d1..32ff5166e 100644 --- a/src/crypto/randomx/jit_compiler_a64.hpp +++ b/src/crypto/randomx/jit_compiler_a64.hpp @@ -47,7 +47,7 @@ namespace randomx { class JitCompilerA64 { public: - explicit JitCompilerA64(bool hugePagesEnable); + explicit JitCompilerA64(bool hugePagesEnable, bool optimizedInitDatasetEnable); ~JitCompilerA64(); void prepare() {} diff --git a/src/crypto/randomx/jit_compiler_fallback.cpp b/src/crypto/randomx/jit_compiler_fallback.cpp index 374da6781..369458a45 100644 --- a/src/crypto/randomx/jit_compiler_fallback.cpp +++ b/src/crypto/randomx/jit_compiler_fallback.cpp @@ -35,3 +35,6 @@ void randomx_set_huge_pages_jit(bool) { } +void randomx_set_optimized_dataset_init(int) +{ +} diff --git a/src/crypto/randomx/jit_compiler_fallback.hpp b/src/crypto/randomx/jit_compiler_fallback.hpp index b86411976..cdf87cba4 100644 --- a/src/crypto/randomx/jit_compiler_fallback.hpp +++ b/src/crypto/randomx/jit_compiler_fallback.hpp @@ -43,7 +43,7 @@ namespace randomx { class JitCompilerFallback { public: - explicit JitCompilerFallback(bool) { + explicit JitCompilerFallback(bool, bool) { throw std::runtime_error("JIT compilation is not supported on this platform"); } void prepare() {} diff --git 
a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index ee3e1b458..5f7a83a14 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -49,17 +49,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef _MSC_VER # include -#else -# include #endif static bool hugePagesJIT = false; +static int optimizedDatasetInit = -1; void randomx_set_huge_pages_jit(bool hugePages) { hugePagesJIT = hugePages; } +void randomx_set_optimized_dataset_init(int value) +{ + optimizedDatasetInit = value; +} + namespace randomx { /* @@ -116,6 +120,11 @@ namespace randomx { #define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init) #define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin) #define codeDatasetInit ADDR(randomx_dataset_init) + #define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue) + #define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end) + #define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue) + #define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load) + #define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch) #define codeLoopStore ADDR(randomx_program_loop_store) #define codeLoopEnd ADDR(randomx_program_loop_end) #define codeEpilogue ADDR(randomx_program_epilogue) @@ -132,7 +141,12 @@ namespace randomx { #define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit) #define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin) #define loopStoreSize (codeLoopEnd - codeLoopStore) - #define datasetInitSize (codeEpilogue - codeDatasetInit) + #define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit) + #define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue) + #define datasetInitAVX2_loop_end_size 
(codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end) + #define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue) + #define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load) + #define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch) #define epilogueSize (codeShhLoad - codeEpilogue) #define codeSshLoadSize (codeShhPrefetch - codeShhLoad) #define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch) @@ -192,17 +206,6 @@ namespace randomx { xmrig::VirtualMemory::protectRX(p1, p2 - p1); } - static inline void cpuid(uint32_t level, int32_t output[4]) - { - memset(output, 0, sizeof(int32_t) * 4); - -# ifdef _MSC_VER - __cpuid(output, static_cast(level)); -# else - __cpuid_count(level, 0, output[0], output[1], output[2], output[3]); -# endif - } - # ifdef _MSC_VER static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); } # else @@ -212,17 +215,59 @@ namespace randomx { static std::atomic codeOffset; constexpr size_t codeOffsetIncrement = 59 * 64; - JitCompilerX86::JitCompilerX86(bool hugePagesEnable) { + JitCompilerX86::JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable) { BranchesWithin32B = xmrig::Cpu::info()->jccErratum(); - int32_t info[4]; - cpuid(1, info); - hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0); + hasAVX = xmrig::Cpu::info()->hasAVX(); + hasAVX2 = xmrig::Cpu::info()->hasAVX2(); - cpuid(0x80000001, info); - hasXOP = ((info[2] & (1 << 11)) != 0); + // Disable by default + initDatasetAVX2 = false; - allocatedSize = CodeSize * 2; + if (optimizedInitDatasetEnable) { + // Dataset init using AVX2: + // -1 = Auto detect + // 0 = Always disabled + // +1 = Always enabled + if (optimizedDatasetInit > 0) { + initDatasetAVX2 = true; + } + else if (optimizedDatasetInit < 0) { + xmrig::ICpuInfo::Vendor vendor = xmrig::Cpu::info()->vendor(); + xmrig::ICpuInfo::Arch 
arch = xmrig::Cpu::info()->arch(); + + if (vendor == xmrig::ICpuInfo::VENDOR_INTEL) { + // AVX2 init is faster on Intel CPUs without HT + initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads()); + } + else if (vendor == xmrig::ICpuInfo::VENDOR_AMD) { + switch (arch) { + case xmrig::ICpuInfo::ARCH_ZEN: + case xmrig::ICpuInfo::ARCH_ZEN_PLUS: + // AVX2 init is slower on Zen/Zen+ + initDatasetAVX2 = false; + break; + case xmrig::ICpuInfo::ARCH_ZEN2: + // AVX2 init is faster on Zen2 without SMT (mobile CPUs) + initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads()); + break; + case xmrig::ICpuInfo::ARCH_ZEN3: + // AVX2 init is faster on Zen3 + initDatasetAVX2 = true; + break; + } + } + } + } + + // Sorry, low-end Intel CPUs + if (!hasAVX2) { + initDatasetAVX2 = false; + } + + hasXOP = xmrig::Cpu::info()->hasXOP(); + + allocatedSize = initDatasetAVX2 ? (CodeSize * 4) : (CodeSize * 2); allocatedCode = static_cast(allocExecutableMemory(allocatedSize, # ifdef XMRIG_SECURE_JIT false @@ -304,14 +349,49 @@ namespace randomx { template void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) { + uint8_t* p = code; + if (initDatasetAVX2) { + codePos = 0; + emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos); + + for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) { + SuperscalarProgram& prog = programs[j]; + uint32_t pos = codePos; + for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { + generateSuperscalarCode(prog(i), p, pos); + } + codePos = pos; + emit(codeShhLoad, codeSshLoadSize, code, codePos); + emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos); + if (j < RandomX_CurrentConfig.CacheAccesses - 1) { + *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast(prog.getAddressRegister()) << 16); + codePos += 3; + emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos); + uint8_t* p = code + codePos; + 
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos); + p[3] += prog.getAddressRegister() << 3; + } + } + + emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos); + + // Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label + constexpr int32_t prologue_size = 320; + *(int32_t*)(code + codePos - 4) = prologue_size - codePos; + + emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos); + return; + } + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); codePos = superScalarHashOffset + codeSshInitSize; for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) { SuperscalarProgram& prog = programs[j]; - for (unsigned i = 0; i < prog.getSize(); ++i) { - Instruction& instr = prog(i); - generateSuperscalarCode(instr); + uint32_t pos = codePos; + for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { + generateSuperscalarCode(prog(i), p, pos); } + codePos = pos; emit(codeShhLoad, codeSshLoadSize, code, codePos); if (j < RandomX_CurrentConfig.CacheAccesses - 1) { *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast(prog.getAddressRegister()) << 16); @@ -326,7 +406,10 @@ namespace randomx { void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]); void JitCompilerX86::generateDatasetInitCode() { - memcpy(code, codeDatasetInit, datasetInitSize); + // AVX2 code is generated in generateSuperscalarHash() + if (!initDatasetAVX2) { + memcpy(code, codeDatasetInit, datasetInitSize); + } } void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { @@ -405,85 +488,243 @@ namespace randomx { emit32(epilogueOffset - codePos - 4, code, codePos); } - void JitCompilerX86::generateSuperscalarCode(Instruction& instr) { - static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; - static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; - static constexpr uint8_t 
REX_MOV_R64R[] = { 0x4c, 0x8b }; - static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; - static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; - static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; - static constexpr uint8_t REX_81[] = { 0x49, 0x81 }; - static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; - static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d }; - static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; - static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; - static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; - + template + FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) { switch ((SuperscalarInstructionType)instr.opcode) { case randomx::SuperscalarInstructionType::ISUB_R: - emit(REX_SUB_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16); + codePos += 3; + if (AVX2) { + emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } break; case randomx::SuperscalarInstructionType::IXOR_R: - emit(REX_XOR_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16); + codePos += 3; + if (AVX2) { + emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } break; case randomx::SuperscalarInstructionType::IADD_RS: - emit(REX_LEA, code, codePos); - emitByte(0x04 + 8 * instr.dst, code, codePos); - genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos); + emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos); + if (AVX2) { + if (instr.getModShift()) { + static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] 
+= instr.src; + p[4] = instr.getModShift(); + p[8] += instr.dst * 9; + } + else { + emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } + } break; case randomx::SuperscalarInstructionType::IMUL_R: - emit(REX_IMUL_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos); + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, + 0xC5, 0x35, 0xF4, 0xD8, + 0xC5, 0xBD, 0xF4, 0xC0, + 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20, + 0xC5, 0xFD, 0x73, 0xF0, 0x20, + 0xC4, 0x41, 0x2D, 0xD4, 0xD3, + 0xC5, 0xAD, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.dst; + p[21] += instr.dst * 8 + instr.src; + p[29] -= instr.dst * 8; + p[31] += instr.dst; + p[41] += instr.dst * 9; + } break; case randomx::SuperscalarInstructionType::IROR_C: - emit(REX_ROT_I8, code, codePos); - emitByte(0xc8 + instr.dst, code, codePos); - emitByte(instr.getImm32() & 63, code, codePos); + { + const uint32_t shift = instr.getImm32() & 63; + emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[4] = shift; + p[8] += instr.dst; + p[9] = 64 - shift; + p[14] += instr.dst * 8; + } + } break; case randomx::SuperscalarInstructionType::IADD_C7: case randomx::SuperscalarInstructionType::IADD_C8: case randomx::SuperscalarInstructionType::IADD_C9: - emit(REX_81, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit32(instr.getImm32(), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); + p[12] += instr.dst * 8; + p[24] -= instr.dst * 8; + p[26] += instr.dst * 8; + } + else { + *(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16); + codePos += 3; + emit32(instr.getImm32(), code, codePos); + } break; case randomx::SuperscalarInstructionType::IXOR_C7: case randomx::SuperscalarInstructionType::IXOR_C8: case randomx::SuperscalarInstructionType::IXOR_C9: - emit(REX_XOR_RI, code, codePos); - emitByte(0xf0 + instr.dst, code, codePos); - emit32(instr.getImm32(), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); + p[12] += instr.dst * 8; + p[24] -= instr.dst * 8; + p[26] += instr.dst * 8; + } + else { + *(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16); + codePos += 3; + emit32(instr.getImm32(), code, codePos); + } break; case randomx::SuperscalarInstructionType::IMULH_R: - emit(REX_MOV_RR64, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit(REX_MUL_R, code, codePos); - emitByte(0xe0 + instr.src, code, codePos); - emit(REX_MOV_R64R, code, codePos); - emitByte(0xc2 + 8 * instr.dst, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); + codePos += 3; + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, 
+ 0xC5, 0x3D, 0xF4, 0xD8, + 0xC4, 0x41, 0x7D, 0xF4, 0xE1, + 0xC4, 0xC1, 0x3D, 0xF4, 0xC1, + 0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, + 0xC4, 0x41, 0x25, 0xEF, 0xC6, + 0xC4, 0x41, 0x25, 0xD4, 0xDC, + 0xC4, 0x41, 0x25, 0xD4, 0xDA, + 0xC4, 0x41, 0x25, 0xEF, 0xCE, + 0xC4, 0x42, 0x3D, 0x37, 0xC1, + 0xC4, 0x41, 0x3D, 0xDB, 0xC7, + 0xC5, 0xBD, 0xD4, 0xC0, + 0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20, + 0xC5, 0xA5, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.src; + p[20] -= instr.dst * 8; + p[27] += instr.dst * 8; + p[67] += instr.dst * 9; + p[77] += instr.dst * 9; + } break; case randomx::SuperscalarInstructionType::ISMULH_R: - emit(REX_MOV_RR64, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit(REX_MUL_R, code, codePos); - emitByte(0xe8 + instr.src, code, codePos); - emit(REX_MOV_R64R, code, codePos); - emitByte(0xc2 + 8 * instr.dst, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); + codePos += 3; + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, + 0xC5, 0x3D, 0xF4, 0xD8, + 0xC4, 0x41, 0x7D, 0xF4, 0xE1, + 0xC4, 0x41, 0x3D, 0xF4, 0xE9, + 0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, + 0xC4, 0x41, 0x25, 0xEF, 0xC6, + 0xC4, 0x41, 0x25, 0xD4, 0xDC, + 0xC4, 0x41, 0x25, 0xD4, 0xDA, + 0xC4, 0x41, 0x25, 0xEF, 0xCE, + 0xC4, 0x42, 0x3D, 0x37, 0xC1, + 0xC4, 0x41, 0x3D, 0xDB, 0xC7, + 0xC4, 0x41, 0x15, 0xD4, 0xE8, + 0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20, + 0xC4, 0x41, 0x15, 0xD4, 0xC3, + 0xC4, 0x41, 0x35, 0xEF, 0xC9, + 0xC4, 0x62, 0x35, 0x37, 0xD0, + 0xC4, 0x62, 0x35, 0x37, 0xD8, + 0xC5, 0x2D, 0xDB, 0xD0, + 0xC5, 0x25, 0xDB, 0xD8, + 0xC4, 0x41, 0x3D, 0xFB, 0xC2, + 0xC4, 0xC1, 
0x3D, 0xFB, 0xC3 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.src; + p[20] -= instr.dst * 8; + p[89] += instr.dst; + p[94] += instr.src; + p[98] += instr.src; + p[102] += instr.dst; + p[112] += instr.dst * 8; + } break; case randomx::SuperscalarInstructionType::IMUL_RCP: - emit(MOV_RAX_I, code, codePos); + *(uint32_t*)(code + codePos) = 0x0000B848UL; + codePos += 2; emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos); - emit(REX_IMUL_RM, code, codePos); - emitByte(0xc0 + 8 * instr.dst, code, codePos); + emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos); + if (AVX2) { + static const uint8_t t[] = { + 0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF, + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20, + 0xC4, 0x41, 0x7D, 0xF4, 0xD4, + 0xC5, 0x35, 0xF4, 0xD8, + 0xC4, 0xC1, 0x3D, 0xF4, 0xC4, + 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20, + 0xC5, 0xFD, 0x73, 0xF0, 0x20, + 0xC4, 0x41, 0x2D, 0xD4, 0xD3, + 0xC5, 0xAD, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[12] += instr.dst; + p[22] -= instr.dst * 8; + p[28] += instr.dst; + p[33] += instr.dst * 8; + p[41] -= instr.dst * 8; + p[43] += instr.dst; + p[53] += instr.dst * 9; + } break; default: UNREACHABLE; } } + template void JitCompilerX86::generateSuperscalarCode(Instruction&, uint8_t*, uint32_t&); + template void JitCompilerX86::generateSuperscalarCode(Instruction&, uint8_t*, uint32_t&); + template FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) { *(uint32_t*)(code + codePos) = (rax ? 
0x24808d41 : 0x24888d41) + (src << 16); @@ -563,10 +804,6 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) { - emitByte((scale << 6) | (index << 3) | base, code, codePos); - } - void JitCompilerX86::h_ISUB_R(const Instruction& instr) { uint8_t* const p = code; uint32_t pos = codePos; diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 32ca97b5d..5c43264c5 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -49,7 +49,7 @@ namespace randomx { class JitCompilerX86 { public: - explicit JitCompilerX86(bool hugePagesEnable); + explicit JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable); ~JitCompilerX86(); void prepare(); void generateProgram(Program&, ProgramConfiguration&, uint32_t); @@ -96,6 +96,8 @@ namespace randomx { bool BranchesWithin32B = false; bool hasAVX; + bool hasAVX2; + bool initDatasetAVX2; bool hasXOP; uint8_t* allocatedCode = nullptr; @@ -107,9 +109,10 @@ namespace randomx { static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos); static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos); static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos); - static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos); + static uint32_t genSIB(int scale, int index, int base) { return (scale << 6) | (index << 3) | base; } - void generateSuperscalarCode(Instruction &); + template + void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos); static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) { code[codePos] = val; diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S index 9f3a5bf18..da5ee98ea 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.S +++ 
b/src/crypto/randomx/jit_compiler_x86_static.S @@ -52,6 +52,11 @@ .global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_dataset_init) +.global DECL(randomx_dataset_init_avx2_prologue) +.global DECL(randomx_dataset_init_avx2_loop_end) +.global DECL(randomx_dataset_init_avx2_epilogue) +.global DECL(randomx_dataset_init_avx2_ssh_load) +.global DECL(randomx_dataset_init_avx2_ssh_prefetch) .global DECL(randomx_program_epilogue) .global DECL(randomx_sshash_load) .global DECL(randomx_sshash_prefetch) @@ -192,6 +197,98 @@ call_offset: pop rbx ret +.balign 64 +DECL(randomx_dataset_init_avx2_prologue): + #include "asm/program_sshash_avx2_save_registers.inc" + +#if defined(WINABI) + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +#else + mov rdi, qword ptr [rdi] ;# cache->memory + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. block index +#endif + sub rsp, 32 + + jmp randomx_dataset_init_avx2_prologue_loop_begin + #include "asm/program_sshash_avx2_constants.inc" + +.balign 64 +randomx_dataset_init_avx2_prologue_loop_begin: + #include "asm/program_sshash_avx2_loop_begin.inc" + + ;# init integer registers (lane 0) + lea r8, [rbp+1] + imul r8, qword ptr [r0_avx2_mul+rip] + mov r9, qword ptr [r1_avx2_add+rip] + xor r9, r8 + mov r10, qword ptr [r2_avx2_add+rip] + xor r10, r8 + mov r11, qword ptr [r3_avx2_add+rip] + xor r11, r8 + mov r12, qword ptr [r4_avx2_add+rip] + xor r12, r8 + mov r13, qword ptr [r5_avx2_add+rip] + xor r13, r8 + mov r14, qword ptr [r6_avx2_add+rip] + xor r14, r8 + mov r15, qword ptr [r7_avx2_add+rip] + xor r15, r8 + + ;# init AVX registers (lanes 1-4) + vpxor ymm0, ymm0, ymm0 + movq xmm0, rbp + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip] + + ;# ymm0 *= r0_avx2_mul + vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip] + vpsrlq ymm8, ymm0, 32 + vpsrlq ymm9, ymm1, 32 + vpmuludq ymm10, 
ymm0, ymm1 + vpmuludq ymm11, ymm9, ymm0 + vpmuludq ymm0, ymm8, ymm1 + vpsllq ymm11, ymm11, 32 + vpsllq ymm0, ymm0, 32 + vpaddq ymm10, ymm10, ymm11 + vpaddq ymm0, ymm10, ymm0 + + vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip] + vpxor ymm1, ymm0, ymm1 + vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip] + vpxor ymm2, ymm0, ymm2 + vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip] + vpxor ymm3, ymm0, ymm3 + vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip] + vpxor ymm4, ymm0, ymm4 + vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip] + vpxor ymm5, ymm0, ymm5 + vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip] + vpxor ymm6, ymm0, ymm6 + vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip] + vpxor ymm7, ymm0, ymm7 + + vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32) + vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63) + + ;# generated SuperscalarHash code goes here + +DECL(randomx_dataset_init_avx2_loop_end): + #include "asm/program_sshash_avx2_loop_end.inc" + +DECL(randomx_dataset_init_avx2_epilogue): + #include "asm/program_sshash_avx2_epilogue.inc" + +DECL(randomx_dataset_init_avx2_ssh_load): + #include "asm/program_sshash_avx2_ssh_load.inc" + +DECL(randomx_dataset_init_avx2_ssh_prefetch): + #include "asm/program_sshash_avx2_ssh_prefetch.inc" + .balign 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_store.inc" diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index e36e5aafa..f8a2d527d 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init +PUBLIC randomx_dataset_init_avx2_prologue +PUBLIC randomx_dataset_init_avx2_loop_end +PUBLIC randomx_dataset_init_avx2_epilogue +PUBLIC randomx_dataset_init_avx2_ssh_load +PUBLIC randomx_dataset_init_avx2_ssh_prefetch PUBLIC 
randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue @@ -183,6 +188,95 @@ init_block_loop: randomx_dataset_init ENDP ALIGN 64 +randomx_dataset_init_avx2_prologue PROC + include asm/program_sshash_avx2_save_registers.inc + + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index + sub rsp, 32 + + jmp loop_begin + include asm/program_sshash_avx2_constants.inc + +ALIGN 64 +loop_begin: + include asm/program_sshash_avx2_loop_begin.inc + + ;# init integer registers (lane 0) + lea r8, [rbp+1] + imul r8, qword ptr [r0_avx2_mul] + mov r9, qword ptr [r1_avx2_add] + xor r9, r8 + mov r10, qword ptr [r2_avx2_add] + xor r10, r8 + mov r11, qword ptr [r3_avx2_add] + xor r11, r8 + mov r12, qword ptr [r4_avx2_add] + xor r12, r8 + mov r13, qword ptr [r5_avx2_add] + xor r13, r8 + mov r14, qword ptr [r6_avx2_add] + xor r14, r8 + mov r15, qword ptr [r7_avx2_add] + xor r15, r8 + + ;# init AVX registers (lanes 1-4) + vpxor ymm0, ymm0, ymm0 + movq xmm0, rbp + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments] + + ;# ymm0 *= r0_avx2_mul + vbroadcastsd ymm1, qword ptr [r0_avx2_mul] + vpsrlq ymm8, ymm0, 32 + vpsrlq ymm9, ymm1, 32 + vpmuludq ymm10, ymm0, ymm1 + vpmuludq ymm11, ymm9, ymm0 + vpmuludq ymm0, ymm8, ymm1 + vpsllq ymm11, ymm11, 32 + vpsllq ymm0, ymm0, 32 + vpaddq ymm10, ymm10, ymm11 + vpaddq ymm0, ymm10, ymm0 + + vbroadcastsd ymm1, qword ptr [r1_avx2_add] + vpxor ymm1, ymm0, ymm1 + vbroadcastsd ymm2, qword ptr [r2_avx2_add] + vpxor ymm2, ymm0, ymm2 + vbroadcastsd ymm3, qword ptr [r3_avx2_add] + vpxor ymm3, ymm0, ymm3 + vbroadcastsd ymm4, qword ptr [r4_avx2_add] + vpxor ymm4, ymm0, ymm4 + vbroadcastsd ymm5, qword ptr [r5_avx2_add] + vpxor ymm5, ymm0, ymm5 + vbroadcastsd ymm6, qword ptr [r6_avx2_add] + vpxor ymm6, ymm0, ymm6 + vbroadcastsd ymm7, qword ptr [r7_avx2_add] + vpxor ymm7, ymm0, ymm7 + + vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# 
carry_bit (bit 32) + vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63) +randomx_dataset_init_avx2_prologue ENDP + + ;# generated SuperscalarHash code goes here + +randomx_dataset_init_avx2_loop_end PROC + include asm/program_sshash_avx2_loop_end.inc +randomx_dataset_init_avx2_loop_end ENDP + +randomx_dataset_init_avx2_epilogue PROC + include asm/program_sshash_avx2_epilogue.inc +randomx_dataset_init_avx2_epilogue ENDP + +randomx_dataset_init_avx2_ssh_load PROC + include asm/program_sshash_avx2_ssh_load.inc +randomx_dataset_init_avx2_ssh_load ENDP + +randomx_dataset_init_avx2_ssh_prefetch PROC + include asm/program_sshash_avx2_ssh_prefetch.inc +randomx_dataset_init_avx2_ssh_prefetch ENDP + randomx_program_epilogue PROC include asm/program_epilogue_store.inc include asm/program_epilogue_win64.inc diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp index 6523f9c47..121db5bed 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.hpp +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -44,6 +44,11 @@ extern "C" { void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_dataset_init(); + void randomx_dataset_init_avx2_prologue(); + void randomx_dataset_init_avx2_loop_end(); + void randomx_dataset_init_avx2_epilogue(); + void randomx_dataset_init_avx2_ssh_load(); + void randomx_dataset_init_avx2_ssh_prefetch(); void randomx_program_epilogue(); void randomx_sshash_load(); void randomx_sshash_prefetch(); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 14aa7067e..9986a33fb 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -381,7 +381,7 @@ extern "C" { break; case RANDOMX_FLAG_JIT: - cache->jit = new randomx::JitCompiler(false); + cache->jit = new randomx::JitCompiler(false, true); cache->initialize = &randomx::initCacheCompile; cache->datasetInit = nullptr; cache->memory = memory; diff --git a/src/crypto/randomx/randomx.h 
b/src/crypto/randomx/randomx.h index 9a1fb3c71..f81df9dbc 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -170,6 +170,7 @@ void randomx_apply_config(const T& config) void randomx_set_scratchpad_prefetch_mode(int mode); void randomx_set_huge_pages_jit(bool hugePages); +void randomx_set_optimized_dataset_init(int value); #if defined(__cplusplus) extern "C" { diff --git a/src/crypto/randomx/vm_compiled.hpp b/src/crypto/randomx/vm_compiled.hpp index 2db99c759..0824d6bdd 100644 --- a/src/crypto/randomx/vm_compiled.hpp +++ b/src/crypto/randomx/vm_compiled.hpp @@ -59,7 +59,7 @@ namespace randomx { protected: void execute(); - JitCompiler compiler{ true }; + JitCompiler compiler{ true, false }; }; using CompiledVmDefault = CompiledVm<1>; diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp index ea671d015..40d3c6129 100644 --- a/src/crypto/rx/Rx.cpp +++ b/src/crypto/rx/Rx.cpp @@ -96,6 +96,7 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode()); randomx_set_huge_pages_jit(cpu.isHugePagesJit()); + randomx_set_optimized_dataset_init(config.initDatasetAVX2()); if (!msrInitialized) { msrEnabled = msrInit(config, cpu.threads().get(seed.algorithm()).data()); diff --git a/src/crypto/rx/RxConfig.cpp b/src/crypto/rx/RxConfig.cpp index d9f05d4e3..ae6215dfe 100644 --- a/src/crypto/rx/RxConfig.cpp +++ b/src/crypto/rx/RxConfig.cpp @@ -47,6 +47,7 @@ namespace xmrig { const char *RxConfig::kInit = "init"; +const char *RxConfig::kInitAVX2 = "init-avx2"; const char *RxConfig::kField = "randomx"; const char *RxConfig::kMode = "mode"; const char *RxConfig::kOneGbPages = "1gb-pages"; @@ -86,9 +87,10 @@ static_assert (kMsrArraySize == ICpuInfo::MSR_MOD_MAX, "kMsrArraySize and MSR_MO bool xmrig::RxConfig::read(const rapidjson::Value &value) { if (value.IsObject()) { - m_threads = Json::getInt(value, kInit, m_threads); - m_mode = readMode(Json::getValue(value, 
kMode)); - m_rdmsr = Json::getBool(value, kRdmsr, m_rdmsr); + m_threads = Json::getInt(value, kInit, m_threads); + m_initDatasetAVX2 = Json::getInt(value, kInitAVX2, m_initDatasetAVX2); + m_mode = readMode(Json::getValue(value, kMode)); + m_rdmsr = Json::getBool(value, kRdmsr, m_rdmsr); # ifdef XMRIG_FEATURE_MSR readMSR(Json::getValue(value, kWrmsr)); @@ -141,6 +143,7 @@ rapidjson::Value xmrig::RxConfig::toJSON(rapidjson::Document &doc) const Value obj(kObjectType); obj.AddMember(StringRef(kInit), m_threads, allocator); + obj.AddMember(StringRef(kInitAVX2), m_initDatasetAVX2, allocator); obj.AddMember(StringRef(kMode), StringRef(modeName()), allocator); obj.AddMember(StringRef(kOneGbPages), m_oneGbPages, allocator); obj.AddMember(StringRef(kRdmsr), m_rdmsr, allocator); diff --git a/src/crypto/rx/RxConfig.h b/src/crypto/rx/RxConfig.h index 1e79d468c..fb3a656d4 100644 --- a/src/crypto/rx/RxConfig.h +++ b/src/crypto/rx/RxConfig.h @@ -61,6 +61,7 @@ public: static const char *kCacheQoS; static const char *kField; static const char *kInit; + static const char *kInitAVX2; static const char *kMode; static const char *kOneGbPages; static const char *kRdmsr; @@ -83,6 +84,7 @@ public: const char *modeName() const; uint32_t threads(uint32_t limit = 100) const; + inline int initDatasetAVX2() const { return m_initDatasetAVX2; } inline bool isOneGbPages() const { return m_oneGbPages; } inline bool rdmsr() const { return m_rdmsr; } inline bool wrmsr() const { return m_wrmsr; } @@ -111,11 +113,12 @@ private: Mode readMode(const rapidjson::Value &value) const; - bool m_numa = true; - bool m_oneGbPages = false; - bool m_rdmsr = true; - int m_threads = -1; - Mode m_mode = AutoMode; + bool m_numa = true; + bool m_oneGbPages = false; + bool m_rdmsr = true; + int m_threads = -1; + int m_initDatasetAVX2 = -1; + Mode m_mode = AutoMode; ScratchpadPrefetchMode m_scratchpadPrefetchMode = ScratchpadPrefetchT0; diff --git a/src/crypto/rx/RxDataset.cpp b/src/crypto/rx/RxDataset.cpp index 
410a2f34e..b47285a3d 100644 --- a/src/crypto/rx/RxDataset.cpp +++ b/src/crypto/rx/RxDataset.cpp @@ -19,6 +19,7 @@ #include "crypto/rx/RxDataset.h" +#include "backend/cpu/Cpu.h" #include "base/io/log/Log.h" #include "base/io/log/Tags.h" #include "base/kernel/Platform.h" @@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache, { Platform::setThreadPriority(priority); - randomx_init_dataset(dataset, cache, startItem, itemCount); + if (Cpu::info()->hasAVX2() && (itemCount % 5)) { + randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5)); + randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5); + } + else { + randomx_init_dataset(dataset, cache, startItem, itemCount); + } }