RandomX: added BMI2 version for scratchpad prefetch

Saves 1 instruction and 1 byte in the main loop.
This commit is contained in:
SChernykh 2021-05-19 17:52:16 +02:00
parent 3ac8f6b23a
commit d443dd86f1
6 changed files with 38 additions and 13 deletions

View file

@ -110,8 +110,6 @@ namespace randomx {
#define ADDR(x) ((uint8_t*)&x) #define ADDR(x) ((uint8_t*)&x)
# endif # endif
#define codePrefetchScratchpad ADDR(randomx_prefetch_scratchpad)
#define codePrefetchScratchpadEnd ADDR(randomx_prefetch_scratchpad_end)
#define codePrologue ADDR(randomx_program_prologue) #define codePrologue ADDR(randomx_program_prologue)
#define codeLoopBegin ADDR(randomx_program_loop_begin) #define codeLoopBegin ADDR(randomx_program_loop_begin)
#define codeLoopLoad ADDR(randomx_program_loop_load) #define codeLoopLoad ADDR(randomx_program_loop_load)
@ -134,7 +132,6 @@ namespace randomx {
#define codeShhEnd ADDR(randomx_sshash_end) #define codeShhEnd ADDR(randomx_sshash_end)
#define codeShhInit ADDR(randomx_sshash_init) #define codeShhInit ADDR(randomx_sshash_init)
#define prefetchScratchpadSize (codePrefetchScratchpadEnd - codePrefetchScratchpad)
#define prologueSize (codeLoopBegin - codePrologue) #define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad) #define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgamStart - codeLoopLoadXOP) #define loopLoadXOPSize (codeProgamStart - codeLoopLoadXOP)
@ -467,7 +464,7 @@ namespace randomx {
void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) { void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
*(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40); *(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40);
codePos += 6; codePos += 6;
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos); emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, RandomX_CurrentConfig.codePrefetchScratchpadTweakedSize, code, codePos);
memcpy(code + codePos, codeLoopStore, loopStoreSize); memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize; codePos += loopStoreSize;

View file

@ -38,6 +38,7 @@
#endif #endif
.global DECL(randomx_prefetch_scratchpad) .global DECL(randomx_prefetch_scratchpad)
.global DECL(randomx_prefetch_scratchpad_bmi2)
.global DECL(randomx_prefetch_scratchpad_end) .global DECL(randomx_prefetch_scratchpad_end)
.global DECL(randomx_program_prologue) .global DECL(randomx_program_prologue)
.global DECL(randomx_program_prologue_first_load) .global DECL(randomx_program_prologue_first_load)
@ -80,6 +81,13 @@ DECL(randomx_prefetch_scratchpad):
and edx, RANDOMX_SCRATCHPAD_MASK and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx] prefetcht0 [rsi+rdx]
/*
 * BMI2 variant of the scratchpad prefetch sequence.
 *
 * Issues two prefetcht0 hints, one for each 32-bit half of rax, after masking
 * each half with RANDOMX_SCRATCHPAD_MASK and adding it to rsi.
 * NOTE(review): rsi presumably holds the scratchpad base address - confirm
 * against the program loop that this snippet is emitted into.
 *
 * This is a code template, not a callable function: RandomX_ConfigurationBase
 * copies the bytes between this label and randomx_prefetch_scratchpad_end when
 * the CPU reports BMI2, and Apply() then overwrites 32-bit values at byte
 * offsets 7 and 17 (scratchpad mask) and 11 and 21 (prefetch mode) of the copy.
 * Any change to the instructions or their encodings must keep those offsets in
 * sync. The label must also stay directly after the non-BMI2 sequence, because
 * the length of that sequence is computed as the distance to this label.
 */
DECL(randomx_prefetch_scratchpad_bmi2):
/* rdx = rax rotated right by 32, so the low half of rdx is the high half of rax. */
/* Done first because the 32-bit AND below clears the upper half of rax. */
rorx rdx, rax, 32
/* Keep only the mask bits of the low half of rax. */
and eax, RANDOMX_SCRATCHPAD_MASK
/* Prefetch hint for the first address. */
prefetcht0 [rsi+rax]
/* Keep only the mask bits of what was the high half of rax. */
and edx, RANDOMX_SCRATCHPAD_MASK
/* Prefetch hint for the second address. */
prefetcht0 [rsi+rdx]
DECL(randomx_prefetch_scratchpad_end): DECL(randomx_prefetch_scratchpad_end):
.balign 64 .balign 64

View file

@ -29,6 +29,7 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_prefetch_scratchpad PUBLIC randomx_prefetch_scratchpad
PUBLIC randomx_prefetch_scratchpad_bmi2
PUBLIC randomx_prefetch_scratchpad_end PUBLIC randomx_prefetch_scratchpad_end
PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load PUBLIC randomx_program_prologue_first_load
@ -70,6 +71,14 @@ randomx_prefetch_scratchpad PROC
prefetcht0 [rsi+rdx] prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad ENDP randomx_prefetch_scratchpad ENDP
; BMI2 variant of the scratchpad prefetch sequence.
;
; Issues two prefetcht0 hints, one for each 32-bit half of rax, after masking
; each half with RANDOMX_SCRATCHPAD_MASK and adding it to rsi.
; NOTE(review): rsi presumably holds the scratchpad base address - confirm
; against the program loop that this snippet is emitted into.
;
; This PROC is a code template, not a callable function (it has no ret):
; RandomX_ConfigurationBase copies the bytes between this symbol and
; randomx_prefetch_scratchpad_end when the CPU reports BMI2, and Apply() then
; overwrites 32-bit values at byte offsets 7 and 17 (scratchpad mask) and
; 11 and 21 (prefetch mode) of the copy. Any change to the instructions or
; their encodings must keep those offsets in sync. The PROC must also stay
; directly after randomx_prefetch_scratchpad, because the length of that
; sequence is computed as the distance to this symbol.
randomx_prefetch_scratchpad_bmi2 PROC
; rdx = rax rotated right by 32, so the low half of rdx is the high half of rax.
; Done first because the 32-bit AND below clears the upper half of rax.
rorx rdx, rax, 32
; Keep only the mask bits of the low half of rax.
and eax, RANDOMX_SCRATCHPAD_MASK
; Prefetch hint for the first address.
prefetcht0 [rsi+rax]
; Keep only the mask bits of what was the high half of rax.
and edx, RANDOMX_SCRATCHPAD_MASK
; Prefetch hint for the second address.
prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad_bmi2 ENDP
randomx_prefetch_scratchpad_end PROC randomx_prefetch_scratchpad_end PROC
randomx_prefetch_scratchpad_end ENDP randomx_prefetch_scratchpad_end ENDP

View file

@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" { extern "C" {
void randomx_prefetch_scratchpad(); void randomx_prefetch_scratchpad();
void randomx_prefetch_scratchpad_bmi2();
void randomx_prefetch_scratchpad_end(); void randomx_prefetch_scratchpad_end();
void randomx_program_prologue(); void randomx_program_prologue();
void randomx_program_prologue_first_load(); void randomx_program_prologue_first_load();

View file

@ -177,10 +177,17 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
memcpy(codeReadDatasetRyzenTweaked, a, b - a); memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a; codeReadDatasetRyzenTweakedSize = b - a;
} }
{ if (xmrig::Cpu::info()->hasBMI2()) {
const uint8_t* a = addr(randomx_prefetch_scratchpad); const uint8_t* a = addr(randomx_prefetch_scratchpad_bmi2);
const uint8_t* b = addr(randomx_prefetch_scratchpad_end); const uint8_t* b = addr(randomx_prefetch_scratchpad_end);
memcpy(codePrefetchScratchpadTweaked, a, b - a); memcpy(codePrefetchScratchpadTweaked, a, b - a);
codePrefetchScratchpadTweakedSize = b - a;
}
else {
const uint8_t* a = addr(randomx_prefetch_scratchpad);
const uint8_t* b = addr(randomx_prefetch_scratchpad_bmi2);
memcpy(codePrefetchScratchpadTweaked, a, b - a);
codePrefetchScratchpadTweakedSize = b - a;
} }
# endif # endif
} }
@ -217,13 +224,15 @@ void RandomX_ConfigurationBase::Apply()
//*(uint32_t*)(codeReadDatasetTweaked + 24) = DatasetBaseMask; //*(uint32_t*)(codeReadDatasetTweaked + 24) = DatasetBaseMask;
//*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask; //*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated; const bool hasBMI2 = xmrig::Cpu::info()->hasBMI2();
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 7 : 4)) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 17 : 18)) = ScratchpadL3Mask64_Calculated;
// Apply scratchpad prefetch mode // Apply scratchpad prefetch mode
{ {
uint32_t* a = (uint32_t*)(codePrefetchScratchpadTweaked + 8); uint32_t* a = (uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 11 : 8));
uint32_t* b = (uint32_t*)(codePrefetchScratchpadTweaked + 22); uint32_t* b = (uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 21 : 22));
switch (scratchpadPrefetchMode) switch (scratchpadPrefetchMode)
{ {
@ -290,7 +299,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(IMUL_M, IMUL_R); INST_HANDLE(IMUL_M, IMUL_R);
#if defined(_M_X64) || defined(__x86_64__) #if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) { if (hasBMI2) {
INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M); INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M);
INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R); INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R);
} }
@ -332,7 +341,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
#endif #endif
#if defined(_M_X64) || defined(__x86_64__) #if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) { if (hasBMI2) {
INST_HANDLE2(CFROUND, CFROUND_BMI2, CBRANCH); INST_HANDLE2(CFROUND, CFROUND_BMI2, CBRANCH);
} }
else else

View file

@ -129,7 +129,8 @@ struct RandomX_ConfigurationBase
uint32_t codeReadDatasetTweakedSize; uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72]; uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize; uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codePrefetchScratchpadTweaked[32]; uint8_t codePrefetchScratchpadTweaked[28];
uint32_t codePrefetchScratchpadTweakedSize;
uint32_t AddressMask_Calculated[4]; uint32_t AddressMask_Calculated[4];
uint32_t ScratchpadL3Mask_Calculated; uint32_t ScratchpadL3Mask_Calculated;