From 2322e3bcf7370f55c808f79b69d00f9a62e9e1da Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 11 Sep 2019 19:10:01 +0200 Subject: [PATCH] RandomX: optimized loading from scratchpad Prefetches scratchpad data as soon as it is possible to calculate the data address for the next load. Up to ~1.4% speedup on Ryzen 7 3700X @ 4.1 GHz, RAM 3200 MHz 14-14-14-28 with optimized sub-timings: Variant|Before H/S|After H/S -------|----------|--------- rx/0|8663|8777 rx/wow|9867|10009 rx/loki|8652|8731 --- src/crypto/randomx/asm/program_loop_load.inc | 4 --- src/crypto/randomx/asm/program_loop_store.inc | 1 - src/crypto/randomx/jit_compiler_x86.cpp | 27 +++++++++++++------ src/crypto/randomx/jit_compiler_x86.hpp | 2 +- src/crypto/randomx/jit_compiler_x86_static.S | 21 +++++++++++++++ .../randomx/jit_compiler_x86_static.asm | 26 +++++++++++++++++- .../randomx/jit_compiler_x86_static.hpp | 3 +++ src/crypto/randomx/randomx.cpp | 10 +++---- src/crypto/randomx/randomx.h | 2 +- 9 files changed, 75 insertions(+), 21 deletions(-) diff --git a/src/crypto/randomx/asm/program_loop_load.inc b/src/crypto/randomx/asm/program_loop_load.inc index 374af66a..c2933231 100644 --- a/src/crypto/randomx/asm/program_loop_load.inc +++ b/src/crypto/randomx/asm/program_loop_load.inc @@ -1,5 +1,3 @@ - mov rdx, rax - and eax, RANDOMX_SCRATCHPAD_MASK lea rcx, [rsi+rax] push rcx xor r8, qword ptr [rcx+0] @@ -10,8 +8,6 @@ xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] - ror rdx, 32 - and edx, RANDOMX_SCRATCHPAD_MASK lea rcx, [rsi+rdx] push rcx cvtdq2pd xmm0, qword ptr [rcx+0] diff --git a/src/crypto/randomx/asm/program_loop_store.inc b/src/crypto/randomx/asm/program_loop_store.inc index 53164cb0..1ba1635c 100644 --- a/src/crypto/randomx/asm/program_loop_store.inc +++ b/src/crypto/randomx/asm/program_loop_store.inc @@ -1,4 +1,3 @@ - xor eax, eax pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 diff --git a/src/crypto/randomx/jit_compiler_x86.cpp 
b/src/crypto/randomx/jit_compiler_x86.cpp index 7ec3d524..2f6cfbda 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -76,6 +76,8 @@ namespace randomx { */ + const uint8_t* codePrefetchScratchpad = (uint8_t*)&randomx_prefetch_scratchpad; + const uint8_t* codePrefetchScratchpadEnd = (uint8_t*)&randomx_prefetch_scratchpad_end; const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; @@ -93,6 +95,7 @@ namespace randomx { const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; + const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad; const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; @@ -214,7 +217,7 @@ namespace randomx { generateProgramPrologue(prog, pcfg); memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); codePos += readDatasetSize; - generateProgramEpilogue(prog); + generateProgramEpilogue(prog, pcfg); } void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { @@ -225,7 +228,7 @@ namespace randomx { emitByte(CALL, code, codePos); emit32(superScalarHashOffset - (codePos + 4), code, codePos); emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos); - generateProgramEpilogue(prog); + generateProgramEpilogue(prog, pcfg); } template @@ -266,13 +269,16 @@ namespace randomx { void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { memset(registerUsage, -1, sizeof(registerUsage)); + + codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue); + code[codePos 
+ 2] = 0xc0 + pcfg.readReg0; + code[codePos + 5] = 0xc0 + pcfg.readReg1; + *(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated; + *(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated; + codePos = prologueSize; memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); - emit(REX_XOR_RAX_R64, code, codePos); - emitByte(0xc0 + pcfg.readReg0, code, codePos); - emit(REX_XOR_RAX_R64, code, codePos); - emitByte(0xc0 + pcfg.readReg1, code, codePos); - memcpy(code + codePos, RandomX_CurrentConfig.codeLoopLoadTweaked, loopLoadSize); + memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; for (unsigned i = 0; i < prog.getSize(); ++i) { Instruction& instr = prog(i); @@ -287,7 +293,12 @@ namespace randomx { emitByte(0xc0 + pcfg.readReg3, code, codePos); } - void JitCompilerX86::generateProgramEpilogue(Program& prog) { + void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) { + emit(REX_MOV_RR64, code, codePos); + emitByte(0xc0 + pcfg.readReg0, code, codePos); + emit(REX_XOR_RAX_R64, code, codePos); + emitByte(0xc0 + pcfg.readReg1, code, codePos); + emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos); memcpy(code + codePos, codeLoopStore, loopStoreSize); codePos += loopStoreSize; emit(SUB_EBX, code, codePos); diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index f72bce86..30e7c281 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -72,7 +72,7 @@ namespace randomx { int32_t codePos; void generateProgramPrologue(Program&, ProgramConfiguration&); - void generateProgramEpilogue(Program&); + void generateProgramEpilogue(Program&, ProgramConfiguration&); static void genAddressReg(Instruction&, uint8_t* code, int& codePos, bool rax = true); static void genAddressRegDst(Instruction&, uint8_t* code, int& codePos); 
static void genAddressImm(Instruction&, uint8_t* code, int& codePos); diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S index b6338d85..c20cd743 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.S +++ b/src/crypto/randomx/jit_compiler_x86_static.S @@ -37,7 +37,10 @@ #define WINABI #endif +.global DECL(randomx_prefetch_scratchpad) +.global DECL(randomx_prefetch_scratchpad_end) .global DECL(randomx_program_prologue) +.global DECL(randomx_program_prologue_first_load) .global DECL(randomx_program_loop_begin) .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) @@ -61,6 +64,16 @@ #define db .byte +DECL(randomx_prefetch_scratchpad): + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] + +DECL(randomx_prefetch_scratchpad_end): + .balign 64 DECL(randomx_program_prologue): #if defined(WINABI) @@ -71,6 +84,14 @@ DECL(randomx_program_prologue): movapd xmm13, xmmword ptr [mantissaMask+rip] movapd xmm14, xmmword ptr [exp240+rip] movapd xmm15, xmmword ptr [scaleMask+rip] + +DECL(randomx_program_prologue_first_load): + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK jmp DECL(randomx_program_loop_begin) .balign 64 diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index 5ecfb435..73fa503a 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -28,7 +28,10 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE +PUBLIC randomx_prefetch_scratchpad +PUBLIC randomx_prefetch_scratchpad_end PUBLIC randomx_program_prologue +PUBLIC randomx_program_prologue_first_load PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load PUBLIC randomx_program_start @@ -50,15 +53,36 @@ RANDOMX_SCRATCHPAD_MASK EQU 2097088 
RANDOMX_DATASET_BASE_MASK EQU 2147483584 RANDOMX_CACHE_MASK EQU 4194303 +randomx_prefetch_scratchpad PROC + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] +randomx_prefetch_scratchpad ENDP + +randomx_prefetch_scratchpad_end PROC +randomx_prefetch_scratchpad_end ENDP + ALIGN 64 randomx_program_prologue PROC include asm/program_prologue_win64.inc movapd xmm13, xmmword ptr [mantissaMask] movapd xmm14, xmmword ptr [exp240] movapd xmm15, xmmword ptr [scaleMask] - jmp randomx_program_loop_begin randomx_program_prologue ENDP +randomx_program_prologue_first_load PROC + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + jmp randomx_program_loop_begin +randomx_program_prologue_first_load ENDP + ALIGN 64 include asm/program_xmm_constants.inc diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp index ba196862..0a62c986 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.hpp +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -29,7 +29,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once extern "C" { + void randomx_prefetch_scratchpad(); + void randomx_prefetch_scratchpad_end(); void randomx_program_prologue(); + void randomx_program_prologue_first_load(); void randomx_program_loop_begin(); void randomx_program_loop_load(); void randomx_program_start(); diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index a5f6bc08..51680704 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -149,9 +149,9 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() memcpy(codeReadDatasetLightSshInitTweaked, a, b - a); } { - const uint8_t* a = (const uint8_t*)&randomx_program_loop_load; - const uint8_t* b = (const uint8_t*)&randomx_program_start; - memcpy(codeLoopLoadTweaked, a, b - a); + const uint8_t* a = (const uint8_t*)&randomx_prefetch_scratchpad; + const uint8_t* b = (const uint8_t*)&randomx_prefetch_scratchpad_end; + memcpy(codePrefetchScratchpadTweaked, a, b - a); } #endif } @@ -177,8 +177,8 @@ void RandomX_ConfigurationBase::Apply() ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64; #if defined(_M_X64) || defined(__x86_64__) - *(uint32_t*)(codeLoopLoadTweaked + 4) = ScratchpadL3Mask64_Calculated; - *(uint32_t*)(codeLoopLoadTweaked + 50) = ScratchpadL3Mask64_Calculated; + *(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated; + *(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated; #endif ConditionMask_Calculated = (1 << JumpBits) - 1; diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 137edc50..1a54573a 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -116,7 +116,7 @@ struct RandomX_ConfigurationBase uint8_t codeShhPrefetchTweaked[20]; uint8_t codeReadDatasetTweaked[64]; uint8_t codeReadDatasetLightSshInitTweaked[68]; - uint8_t codeLoopLoadTweaked[140]; + uint8_t codePrefetchScratchpadTweaked[32]; uint32_t CacheLineAlignMask_Calculated; 
uint32_t DatasetExtraItems_Calculated;