diff --git a/src/crypto/randomx/asm/program_loop_load.inc b/src/crypto/randomx/asm/program_loop_load.inc
index 1c53e8314..f5b23a931 100644
--- a/src/crypto/randomx/asm/program_loop_load.inc
+++ b/src/crypto/randomx/asm/program_loop_load.inc
@@ -18,11 +18,11 @@
 	cvtdq2pd xmm5, qword ptr [rcx+40]
 	cvtdq2pd xmm6, qword ptr [rcx+48]
 	cvtdq2pd xmm7, qword ptr [rcx+56]
-	andps xmm4, xmm13
-	andps xmm5, xmm13
-	andps xmm6, xmm13
-	andps xmm7, xmm13
-	orps xmm4, xmm14
-	orps xmm5, xmm14
-	orps xmm6, xmm14
-	orps xmm7, xmm14
+	andpd xmm4, xmm13
+	andpd xmm5, xmm13
+	andpd xmm6, xmm13
+	andpd xmm7, xmm13
+	orpd xmm4, xmm14
+	orpd xmm5, xmm14
+	orpd xmm6, xmm14
+	orpd xmm7, xmm14
diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp
index d0b0114c4..cac38949e 100644
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@@ -299,6 +299,7 @@ namespace randomx {
 		// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
 		code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
 		memcpy(code, codePrologue, prologueSize);
+		memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
 		memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
 # ifdef XMRIG_FIX_RYZEN
 		mainLoopBounds.first = code + prologueSize;
@@ -393,10 +394,8 @@ namespace randomx {
 		xmrig::Rx::setMainLoopBounds(mainLoopBounds);
 # endif

-		codePos = prologueSize;
-		memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
-		memcpy(code + codePos, codeLoopLoad, loopLoadSize);
-		codePos += loopLoadSize;
+		memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
+		codePos = prologueSize + loopLoadSize;

 		//mark all registers as used
 		uint64_t* r = (uint64_t*)registerUsage;
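
For context, the `andpd`/`orpd` pair implements RandomX's E-group register masking: after `cvtdq2pd` converts packed 32-bit integers to doubles, the AND with `xmm13` clears the upper exponent bits and the OR with `xmm14` injects the per-program `eMask` (the value the second hunk writes into the code buffer at `prologueSize - 48`). Below is a minimal scalar sketch of one lane of that bit manipulation, for illustration only; `maskExponentMantissa` is a hypothetical helper, not part of this patch, and the mask constants assume the RandomX spec's layout of a 52-bit mantissa plus 4 dynamic exponent bits.

```cpp
#include <cstdint>
#include <cstring>

// Scalar equivalent of one lane of the andpd/orpd sequence above.
// andMask plays the role of xmm13; orMask plays the role of the
// per-program eMask loaded into xmm14.
static double maskExponentMantissa(double x, uint64_t andMask, uint64_t orMask) {
    uint64_t bits;
    std::memcpy(&bits, &x, sizeof bits);  // reinterpret the double as raw bits
    bits = (bits & andMask) | orMask;     // andpd, then orpd, on one lane
    std::memcpy(&x, &bits, sizeof x);
    return x;
}

int main() {
    // Assumed values: keep the mantissa plus 4 dynamic exponent bits,
    // then OR in a hypothetical eMask-style exponent pattern.
    const uint64_t andMask = 0x00FFFFFFFFFFFFFFull;
    const uint64_t orMask  = 0x3000000000000000ull;
    maskExponentMantissa(123456789.0, andMask, orMask);
}
```

Since the AND/OR here operate on whole 64-bit double lanes, `andpd`/`orpd` is the matching encoding; the previous `andps`/`orps` produced the same bits but nominally operated on single-precision lanes.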