RandomX: optimized loading from scratchpad

Prefetches scratchpad data as soon as the data address for the next load can be calculated.

Up to ~1.4% speedup on Ryzen 7 3700X @ 4.1 GHz, RAM 3200 MHz 14-14-14-28 with optimized sub-timings:
Variant|Before H/S|After H/S
-------|----------|---------
rx/0|8663|8777
rx/wow|9867|10009
rx/loki|8652|8731
This commit is contained in:
SChernykh 2019-09-11 19:10:01 +02:00
parent 01b2c952ea
commit 2322e3bcf7
9 changed files with 75 additions and 21 deletions

View file

@ -1,5 +1,3 @@
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
push rcx push rcx
xor r8, qword ptr [rcx+0] xor r8, qword ptr [rcx+0]
@ -10,8 +8,6 @@
xor r13, qword ptr [rcx+40] xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48] xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56] xor r15, qword ptr [rcx+56]
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
lea rcx, [rsi+rdx] lea rcx, [rsi+rdx]
push rcx push rcx
cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm0, qword ptr [rcx+0]

View file

@ -1,4 +1,3 @@
xor eax, eax
pop rcx pop rcx
mov qword ptr [rcx+0], r8 mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9 mov qword ptr [rcx+8], r9

View file

@ -76,6 +76,8 @@ namespace randomx {
*/ */
const uint8_t* codePrefetchScratchpad = (uint8_t*)&randomx_prefetch_scratchpad;
const uint8_t* codePrefetchScratchpadEnd = (uint8_t*)&randomx_prefetch_scratchpad_end;
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
@ -93,6 +95,7 @@ namespace randomx {
const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end;
const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init;
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset;
@ -214,7 +217,7 @@ namespace randomx {
generateProgramPrologue(prog, pcfg); generateProgramPrologue(prog, pcfg);
memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize);
codePos += readDatasetSize; codePos += readDatasetSize;
generateProgramEpilogue(prog); generateProgramEpilogue(prog, pcfg);
} }
void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
@ -225,7 +228,7 @@ namespace randomx {
emitByte(CALL, code, codePos); emitByte(CALL, code, codePos);
emit32(superScalarHashOffset - (codePos + 4), code, codePos); emit32(superScalarHashOffset - (codePos + 4), code, codePos);
emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos); emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos);
generateProgramEpilogue(prog); generateProgramEpilogue(prog, pcfg);
} }
template<size_t N> template<size_t N>
@ -266,13 +269,16 @@ namespace randomx {
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
memset(registerUsage, -1, sizeof(registerUsage)); memset(registerUsage, -1, sizeof(registerUsage));
codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue);
code[codePos + 2] = 0xc0 + pcfg.readReg0;
code[codePos + 5] = 0xc0 + pcfg.readReg1;
*(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
*(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
codePos = prologueSize; codePos = prologueSize;
memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
emit(REX_XOR_RAX_R64, code, codePos); memcpy(code + codePos, codeLoopLoad, loopLoadSize);
emitByte(0xc0 + pcfg.readReg0, code, codePos);
emit(REX_XOR_RAX_R64, code, codePos);
emitByte(0xc0 + pcfg.readReg1, code, codePos);
memcpy(code + codePos, RandomX_CurrentConfig.codeLoopLoadTweaked, loopLoadSize);
codePos += loopLoadSize; codePos += loopLoadSize;
for (unsigned i = 0; i < prog.getSize(); ++i) { for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i); Instruction& instr = prog(i);
@ -287,7 +293,12 @@ namespace randomx {
emitByte(0xc0 + pcfg.readReg3, code, codePos); emitByte(0xc0 + pcfg.readReg3, code, codePos);
} }
void JitCompilerX86::generateProgramEpilogue(Program& prog) { void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + pcfg.readReg0, code, codePos);
emit(REX_XOR_RAX_R64, code, codePos);
emitByte(0xc0 + pcfg.readReg1, code, codePos);
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);
memcpy(code + codePos, codeLoopStore, loopStoreSize); memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize; codePos += loopStoreSize;
emit(SUB_EBX, code, codePos); emit(SUB_EBX, code, codePos);

View file

@ -72,7 +72,7 @@ namespace randomx {
int32_t codePos; int32_t codePos;
void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramPrologue(Program&, ProgramConfiguration&);
void generateProgramEpilogue(Program&); void generateProgramEpilogue(Program&, ProgramConfiguration&);
static void genAddressReg(Instruction&, uint8_t* code, int& codePos, bool rax = true); static void genAddressReg(Instruction&, uint8_t* code, int& codePos, bool rax = true);
static void genAddressRegDst(Instruction&, uint8_t* code, int& codePos); static void genAddressRegDst(Instruction&, uint8_t* code, int& codePos);
static void genAddressImm(Instruction&, uint8_t* code, int& codePos); static void genAddressImm(Instruction&, uint8_t* code, int& codePos);

View file

@ -37,7 +37,10 @@
#define WINABI #define WINABI
#endif #endif
.global DECL(randomx_prefetch_scratchpad)
.global DECL(randomx_prefetch_scratchpad_end)
.global DECL(randomx_program_prologue) .global DECL(randomx_program_prologue)
.global DECL(randomx_program_prologue_first_load)
.global DECL(randomx_program_loop_begin) .global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load) .global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start) .global DECL(randomx_program_start)
@ -61,6 +64,16 @@
#define db .byte #define db .byte
/*
 * Scratchpad prefetch template. The bytes between the two labels are copied
 * into codePrefetchScratchpadTweaked and emitted by the JIT from
 * generateProgramEpilogue(), right after rax has been built from the two
 * read registers (readReg0 / readReg1).
 *
 * The two RANDOMX_SCRATCHPAD_MASK immediates are overwritten at byte offsets
 * 4 and 18 of the copy with ScratchpadL3Mask64_Calculated, so the order and
 * encoding of these instructions must not change without updating those
 * offsets.
 *
 * NOTE(review): rsi presumably holds the scratchpad base address - confirm.
 */
DECL(randomx_prefetch_scratchpad):
/* Keep a full 64-bit copy; its upper half yields the second address. */
mov rdx, rax
/* Low 32 bits of rax, masked -> first scratchpad offset. */
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
/* Rotate the upper 32 bits of the copy into the low half. */
ror rdx, 32
/* Masked -> second scratchpad offset. */
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
/* End marker: used only to compute the template size. */
DECL(randomx_prefetch_scratchpad_end):
.balign 64 .balign 64
DECL(randomx_program_prologue): DECL(randomx_program_prologue):
#if defined(WINABI) #if defined(WINABI)
@ -71,6 +84,14 @@ DECL(randomx_program_prologue):
movapd xmm13, xmmword ptr [mantissaMask+rip] movapd xmm13, xmmword ptr [mantissaMask+rip]
movapd xmm14, xmmword ptr [exp240+rip] movapd xmm14, xmmword ptr [exp240+rip]
movapd xmm15, xmmword ptr [scaleMask+rip] movapd xmm15, xmmword ptr [scaleMask+rip]
DECL(randomx_program_prologue_first_load):
xor rax, r8
xor rax, r8
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
jmp DECL(randomx_program_loop_begin) jmp DECL(randomx_program_loop_begin)
.balign 64 .balign 64

View file

@ -28,7 +28,10 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_prefetch_scratchpad
PUBLIC randomx_prefetch_scratchpad_end
PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load
PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start PUBLIC randomx_program_start
@ -50,15 +53,36 @@ RANDOMX_SCRATCHPAD_MASK EQU 2097088
RANDOMX_DATASET_BASE_MASK EQU 2147483584 RANDOMX_DATASET_BASE_MASK EQU 2147483584
RANDOMX_CACHE_MASK EQU 4194303 RANDOMX_CACHE_MASK EQU 4194303
; Scratchpad prefetch template. The bytes between randomx_prefetch_scratchpad
; and randomx_prefetch_scratchpad_end are copied into
; codePrefetchScratchpadTweaked and emitted by the JIT from
; generateProgramEpilogue(), right after rax has been built from the two
; read registers (readReg0 / readReg1).
;
; The two RANDOMX_SCRATCHPAD_MASK immediates are overwritten at byte offsets
; 4 and 18 of the copy with ScratchpadL3Mask64_Calculated, so the order and
; encoding of these instructions must not change without updating those
; offsets.
;
; NOTE(review): rsi presumably holds the scratchpad base address - confirm.
randomx_prefetch_scratchpad PROC
; Keep a full 64-bit copy; its upper half yields the second address.
mov rdx, rax
; Low 32 bits of rax, masked -> first scratchpad offset.
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
; Rotate the upper 32 bits of the copy into the low half.
ror rdx, 32
; Masked -> second scratchpad offset.
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad ENDP
; Empty end marker: used only to compute the template size.
randomx_prefetch_scratchpad_end PROC
randomx_prefetch_scratchpad_end ENDP
ALIGN 64 ALIGN 64
randomx_program_prologue PROC randomx_program_prologue PROC
include asm/program_prologue_win64.inc include asm/program_prologue_win64.inc
movapd xmm13, xmmword ptr [mantissaMask] movapd xmm13, xmmword ptr [mantissaMask]
movapd xmm14, xmmword ptr [exp240] movapd xmm14, xmmword ptr [exp240]
movapd xmm15, xmmword ptr [scaleMask] movapd xmm15, xmmword ptr [scaleMask]
jmp randomx_program_loop_begin
randomx_program_prologue ENDP randomx_program_prologue ENDP
; Address setup for the first scratchpad load, run once before entering the
; program loop. generateProgramPrologue() patches this code in place, using
; byte offsets relative to this label:
;   +2, +5   : last byte of each xor, set to 0xc0 + readReg0 / readReg1
;   +10, +20 : the two mask immediates, set to ScratchpadL3Mask64_Calculated
; Changing the order or encoding of these instructions requires updating
; those offsets.
randomx_program_prologue_first_load PROC
; r8 is only a placeholder operand; the register is patched by the JIT.
xor rax, r8
xor rax, r8
; Keep a full 64-bit copy; its upper half yields the second address.
mov rdx, rax
; Low 32 bits, masked -> first scratchpad offset (left in eax).
and eax, RANDOMX_SCRATCHPAD_MASK
; Rotate the upper 32 bits of the copy into the low half.
ror rdx, 32
; Masked -> second scratchpad offset (left in edx).
and edx, RANDOMX_SCRATCHPAD_MASK
jmp randomx_program_loop_begin
randomx_program_prologue_first_load ENDP
ALIGN 64 ALIGN 64
include asm/program_xmm_constants.inc include asm/program_xmm_constants.inc

View file

@ -29,7 +29,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once #pragma once
extern "C" { extern "C" {
void randomx_prefetch_scratchpad();
void randomx_prefetch_scratchpad_end();
void randomx_program_prologue(); void randomx_program_prologue();
void randomx_program_prologue_first_load();
void randomx_program_loop_begin(); void randomx_program_loop_begin();
void randomx_program_loop_load(); void randomx_program_loop_load();
void randomx_program_start(); void randomx_program_start();

View file

@ -149,9 +149,9 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
memcpy(codeReadDatasetLightSshInitTweaked, a, b - a); memcpy(codeReadDatasetLightSshInitTweaked, a, b - a);
} }
{ {
const uint8_t* a = (const uint8_t*)&randomx_program_loop_load; const uint8_t* a = (const uint8_t*)&randomx_prefetch_scratchpad;
const uint8_t* b = (const uint8_t*)&randomx_program_start; const uint8_t* b = (const uint8_t*)&randomx_prefetch_scratchpad_end;
memcpy(codeLoopLoadTweaked, a, b - a); memcpy(codePrefetchScratchpadTweaked, a, b - a);
} }
#endif #endif
} }
@ -177,8 +177,8 @@ void RandomX_ConfigurationBase::Apply()
ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64; ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
#if defined(_M_X64) || defined(__x86_64__) #if defined(_M_X64) || defined(__x86_64__)
*(uint32_t*)(codeLoopLoadTweaked + 4) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codeLoopLoadTweaked + 50) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
#endif #endif
ConditionMask_Calculated = (1 << JumpBits) - 1; ConditionMask_Calculated = (1 << JumpBits) - 1;

View file

@ -116,7 +116,7 @@ struct RandomX_ConfigurationBase
uint8_t codeShhPrefetchTweaked[20]; uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64]; uint8_t codeReadDatasetTweaked[64];
uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codeReadDatasetLightSshInitTweaked[68];
uint8_t codeLoopLoadTweaked[140]; uint8_t codePrefetchScratchpadTweaked[32];
uint32_t CacheLineAlignMask_Calculated; uint32_t CacheLineAlignMask_Calculated;
uint32_t DatasetExtraItems_Calculated; uint32_t DatasetExtraItems_Calculated;