diff --git a/src/crypto/randomx/asm/program_loop_load_xop.inc b/src/crypto/randomx/asm/program_loop_load_xop.inc new file mode 100644 index 000000000..5ea2386e9 --- /dev/null +++ b/src/crypto/randomx/asm/program_loop_load_xop.inc @@ -0,0 +1,24 @@ + lea rcx, [rsi+rax] + mov [rsp+8], rcx + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + lea rcx, [rsi+rdx] + mov [rsp+16], rcx + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] + vpcmov xmm4, xmm4, xmm14, xmm13 + vpcmov xmm5, xmm5, xmm14, xmm13 + vpcmov xmm6, xmm6, xmm14, xmm13 + vpcmov xmm7, xmm7, xmm14, xmm13 diff --git a/src/crypto/randomx/asm/program_xmm_constants.inc b/src/crypto/randomx/asm/program_xmm_constants.inc index 296237a45..cb4b5430d 100644 --- a/src/crypto/randomx/asm/program_xmm_constants.inc +++ b/src/crypto/randomx/asm/program_xmm_constants.inc @@ -1,5 +1,5 @@ mantissaMask: - db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0 + db 0, 0, 192, 255, 255, 255, 255, 0, 0, 0, 192, 255, 255, 255, 255, 0 exp240: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 scaleMask: diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index cac38949e..135203fea 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -89,6 +89,7 @@ namespace randomx { const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; + const uint8_t* codeLoopLoadXOP = (uint8_t*)&randomx_program_loop_load_xop; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; @@ -104,7 +105,8 @@ namespace randomx { const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad; const int32_t prologueSize = codeLoopBegin - codePrologue; - const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; + const int32_t loopLoadSize = codeLoopLoadXOP - codeLoopLoad; + const int32_t loopLoadXOPSize = codeProgamStart - codeLoopLoadXOP; const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; @@ -184,6 +186,7 @@ namespace randomx { static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 }; + static const uint8_t REX_VPCMOV_XMM12[] = { 0x8F, 0x48, 0x18, 0xA2, 0xE6, 0xD0 }; static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; static const uint8_t CALL = 0xe8; @@ -295,12 +298,23 @@ namespace randomx { cpuid(1, info); hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0); + cpuid(0x80000001, info); + hasXOP = ((info[2] & (1 << 11)) != 0); + allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2); // Shift code base address to improve caching - all threads will use different L2/L3 cache sets code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize); memcpy(code, codePrologue, prologueSize); - memcpy(code + prologueSize, codeLoopLoad, loopLoadSize); + if (hasXOP) { + memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize); + } + else { + memcpy(code + prologueSize, codeLoopLoad, loopLoadSize); + } memcpy(code + epilogueOffset, codeEpilogue, epilogueSize); + + codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize); + # ifdef XMRIG_FIX_RYZEN mainLoopBounds.first = code + prologueSize; mainLoopBounds.second = code + epilogueOffset; @@ -318,7 +332,7 @@ namespace randomx { uint8_t* p; uint32_t n; - if (flags & RANDOMX_FLAG_RYZEN) { + if (flags & RANDOMX_FLAG_AMD) { p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked; n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize; } @@ -395,7 +409,7 @@ namespace randomx { # endif memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask)); - codePos = prologueSize + loopLoadSize; + codePos = codePosFirst; //mark all registers as used uint64_t* r = (uint64_t*)registerUsage; @@ -991,7 +1005,12 @@ namespace randomx { const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); - emit(REX_ANDPS_XMM12, p, pos); + if (hasXOP) { + emit(REX_VPCMOV_XMM12, p, pos); + } + else { + emit(REX_ANDPS_XMM12, p, pos); + } emit(REX_DIVPD, p, pos); emitByte(0xe4 + 8 * dst, p, pos); @@ -1020,7 +1039,7 @@ namespace randomx { emit(ROL_RAX, p, pos); emitByte(rotate, p, pos); } - if (vm_flags & RANDOMX_FLAG_RYZEN) { + if (vm_flags & RANDOMX_FLAG_AMD) { emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos); } else { diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 02b1a80fc..e330470aa 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -73,10 +73,12 @@ namespace randomx { std::pair mainLoopBounds; # endif int32_t codePos; + int32_t codePosFirst; uint32_t vm_flags; static bool BranchesWithin32B; bool hasAVX; + bool hasXOP; static void applyTweaks(); void generateProgramPrologue(Program&, ProgramConfiguration&); diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S index e5709cdc2..916316d98 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.S +++ b/src/crypto/randomx/jit_compiler_x86_static.S @@ -43,6 +43,7 @@ .global DECL(randomx_program_prologue_first_load) .global DECL(randomx_program_loop_begin) .global DECL(randomx_program_loop_load) +.global DECL(randomx_program_loop_load_xop) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset_ryzen) @@ -110,6 +111,9 @@ DECL(randomx_program_loop_begin): DECL(randomx_program_loop_load): #include "asm/program_loop_load.inc" +DECL(randomx_program_loop_load_xop): + #include "asm/program_loop_load_xop.inc" + DECL(randomx_program_start): nop diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index 4b3542e3a..8e7714002 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -34,6 +34,7 @@ PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue_first_load PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load +PUBLIC randomx_program_loop_load_xop PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset_ryzen @@ -101,6 +102,10 @@ randomx_program_loop_load PROC include asm/program_loop_load.inc randomx_program_loop_load ENDP +randomx_program_loop_load_xop PROC + include asm/program_loop_load_xop.inc +randomx_program_loop_load_xop ENDP + randomx_program_start PROC nop randomx_program_start ENDP diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp index b0a7c5acb..6523f9c47 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.hpp +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -35,6 +35,7 @@ extern "C" { void randomx_program_prologue_first_load(); void randomx_program_loop_begin(); void randomx_program_loop_load(); + void randomx_program_loop_load_xop(); void randomx_program_start(); void randomx_program_read_dataset(); void randomx_program_read_dataset_ryzen(); diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 793e6e1b2..787491ebc 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -49,7 +49,7 @@ enum randomx_flags { RANDOMX_FLAG_FULL_MEM = 4, RANDOMX_FLAG_JIT = 8, RANDOMX_FLAG_1GB_PAGES = 16, - RANDOMX_FLAG_RYZEN = 64, + RANDOMX_FLAG_AMD = 64, }; diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index 486d83c2e..339d9e758 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -45,8 +45,12 @@ xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig:: m_flags |= RANDOMX_FLAG_JIT; } - if ((assembly == Assembly::RYZEN) || ((assembly == Assembly::AUTO) && (Cpu::info()->assembly() == Assembly::RYZEN))) { - m_flags |= RANDOMX_FLAG_RYZEN; + if (assembly == Assembly::AUTO) { + assembly = Cpu::info()->assembly(); + } + + if ((assembly == Assembly::RYZEN) || (assembly == Assembly::BULLDOZER)) { + m_flags |= RANDOMX_FLAG_AMD; } m_vm = randomx_create_vm(static_cast(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad);