From 10f9b29e0313a3aa9382a31a1baab2786b8e2c78 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sat, 5 Oct 2019 21:40:21 +0200 Subject: [PATCH] Refactored JIT compiler for x86, small RandomX speedup --- src/crypto/randomx/jit_compiler_x86.cpp | 172 ++++++++++++------------ src/crypto/randomx/jit_compiler_x86.hpp | 69 +++++----- 2 files changed, 123 insertions(+), 118 deletions(-) diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 2f6cfbda5..2a3425352 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -268,8 +268,6 @@ namespace randomx { } void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { - memset(registerUsage, -1, sizeof(registerUsage)); - codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue); code[codePos + 2] = 0xc0 + pcfg.readReg0; code[codePos + 5] = 0xc0 + pcfg.readReg1; @@ -280,13 +278,21 @@ namespace randomx { memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; - for (unsigned i = 0; i < prog.getSize(); ++i) { - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - instructionOffsets[i] = codePos; - (this->*(engine[instr.opcode]))(instr, i); + + //mark all registers as used + uint64_t* r = (uint64_t*)registerUsage; + uint64_t k = codePos; + k |= k << 32; + for (unsigned j = 0; j < RegistersCount / 2; ++j) { + r[j] = k; } + + for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; ++i) { + Instruction instr = prog(i); + *((uint64_t*)&instr) &= (uint64_t(-1) - (0xFFFF << 8)) | ((RegistersCount - 1) << 8) | ((RegistersCount - 1) << 16); + (this->*(engine[instr.opcode]))(instr); + } + emit(REX_MOV_RR, code, codePos); emitByte(0xc0 + pcfg.readReg2, code, codePos); emit(REX_XOR_EAX, code, codePos); @@ -402,7 +408,7 @@ namespace randomx { } } 
- void JitCompilerX86::genAddressReg(Instruction& instr, uint8_t* code, int& codePos, bool rax) { + void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos, bool rax) { emit(LEA_32, code, codePos); emitByte(0x80 + instr.src + (rax ? 0 : 8), code, codePos); if (instr.src == RegisterNeedsSib) { @@ -416,7 +422,7 @@ namespace randomx { emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos); } - void JitCompilerX86::genAddressRegDst(Instruction& instr, uint8_t* code, int& codePos) { + void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) { emit(LEA_32, code, codePos); emitByte(0x80 + instr.dst, code, codePos); if (instr.dst == RegisterNeedsSib) { @@ -432,7 +438,7 @@ namespace randomx { } } - void JitCompilerX86::genAddressImm(Instruction& instr, uint8_t* code, int& codePos) { + void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) { emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos); } @@ -447,17 +453,18 @@ namespace randomx { 0x3c8d4f, }; - void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { + void JitCompilerX86::h_IADD_RS(const Instruction& instr) { int pos = codePos; uint8_t* const p = code + pos; - registerUsage[instr.dst] = i; - const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | instr.dst; *(uint32_t*)(p) = template_IADD_RS[instr.dst] | (sib << 24); *(uint32_t*)(p + 4) = instr.getImm32(); - codePos = pos + ((instr.dst == RegisterNeedsDisplacement) ? 8 : 4); + pos += ((instr.dst == RegisterNeedsDisplacement) ? 
8 : 4); + + registerUsage[instr.dst] = pos; + codePos = pos; } static const uint32_t template_IADD_M[8] = { @@ -471,11 +478,10 @@ namespace randomx { 0x063c034c, }; - void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { + void JitCompilerX86::h_IADD_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, p, pos); emit32(template_IADD_M[instr.dst], p, pos); @@ -486,6 +492,7 @@ namespace randomx { genAddressImm(instr, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } @@ -493,11 +500,10 @@ namespace randomx { emitByte((scale << 6) | (index << 3) | base, code, codePos); } - void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { + void JitCompilerX86::h_ISUB_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_SUB_RR, p, pos); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos); @@ -508,14 +514,14 @@ namespace randomx { emit32(instr.getImm32(), p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { + void JitCompilerX86::h_ISUB_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, p, pos); emit(REX_SUB_RM, p, pos); @@ -528,14 +534,14 @@ namespace randomx { genAddressImm(instr, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { + void JitCompilerX86::h_IMUL_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_IMUL_RR, p, pos); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos); @@ -546,14 +552,14 @@ namespace randomx { emit32(instr.getImm32(), p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void 
JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { + void JitCompilerX86::h_IMUL_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, p, pos); emit(REX_IMUL_RM, p, pos); @@ -566,14 +572,14 @@ namespace randomx { genAddressImm(instr, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { + void JitCompilerX86::h_IMULH_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; emit(REX_MOV_RR64, p, pos); emitByte(0xc0 + instr.dst, p, pos); emit(REX_MUL_R, p, pos); @@ -581,14 +587,14 @@ namespace randomx { emit(REX_MOV_R64R, p, pos); emitByte(0xc2 + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { + void JitCompilerX86::h_IMULH_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, p, pos, false); emit(REX_MOV_RR64, p, pos); @@ -605,14 +611,14 @@ namespace randomx { emit(REX_MOV_R64R, p, pos); emitByte(0xc2 + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { + void JitCompilerX86::h_ISMULH_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; emit(REX_MOV_RR64, p, pos); emitByte(0xc0 + instr.dst, p, pos); emit(REX_MUL_R, p, pos); @@ -620,14 +626,14 @@ namespace randomx { emit(REX_MOV_R64R, p, pos); emitByte(0xc2 + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { + void JitCompilerX86::h_ISMULH_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { 
genAddressReg(instr, p, pos, false); emit(REX_MOV_RR64, p, pos); @@ -644,41 +650,41 @@ namespace randomx { emit(REX_MOV_R64R, p, pos); emitByte(0xc2 + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { + void JitCompilerX86::h_IMUL_RCP(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; uint64_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { - registerUsage[instr.dst] = i; emit(MOV_RAX_I, p, pos); emit64(randomx_reciprocal_fast(divisor), p, pos); emit(REX_IMUL_RM, p, pos); emitByte(0xc0 + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; } codePos = pos; } - void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { + void JitCompilerX86::h_INEG_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; emit(REX_NEG, p, pos); emitByte(0xd8 + instr.dst, p, pos); + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { + void JitCompilerX86::h_IXOR_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_XOR_RR, p, pos); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos); @@ -689,14 +695,14 @@ namespace randomx { emit32(instr.getImm32(), p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { + void JitCompilerX86::h_IXOR_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, p, pos); emit(REX_XOR_RM, p, pos); @@ -709,14 +715,14 @@ namespace randomx { genAddressImm(instr, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { + void JitCompilerX86::h_IROR_R(const Instruction& instr) { uint8_t* const p = code; int pos = 
codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_MOV_RR, p, pos); emitByte(0xc8 + instr.src, p, pos); @@ -729,14 +735,14 @@ namespace randomx { emitByte(instr.getImm32() & 63, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { + void JitCompilerX86::h_IROL_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_MOV_RR, p, pos); emitByte(0xc8 + instr.src, p, pos); @@ -749,24 +755,25 @@ namespace randomx { emitByte(instr.getImm32() & 63, p, pos); } + registerUsage[instr.dst] = pos; codePos = pos; } - void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { + void JitCompilerX86::h_ISWAP_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; if (instr.src != instr.dst) { - registerUsage[instr.dst] = i; - registerUsage[instr.src] = i; emit(REX_XCHG, p, pos); emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos); + registerUsage[instr.dst] = pos; + registerUsage[instr.src] = pos; } codePos = pos; } - void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { + void JitCompilerX86::h_FSWAP_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; @@ -777,105 +784,105 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { + void JitCompilerX86::h_FADD_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; - instr.src %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; + const uint32_t src = instr.src % RegisterCountFlt; emit(REX_ADDPD, p, pos); - emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos); + emitByte(0xc0 + src + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { + void JitCompilerX86::h_FADD_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst 
%= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_ADDPD, p, pos); - emitByte(0xc4 + 8 * instr.dst, p, pos); + emitByte(0xc4 + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { + void JitCompilerX86::h_FSUB_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; - instr.src %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; + const uint32_t src = instr.src % RegisterCountFlt; emit(REX_SUBPD, p, pos); - emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos); + emitByte(0xc0 + src + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { + void JitCompilerX86::h_FSUB_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_SUBPD, p, pos); - emitByte(0xc4 + 8 * instr.dst, p, pos); + emitByte(0xc4 + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { + void JitCompilerX86::h_FSCAL_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; emit(REX_XORPS, p, pos); - emitByte(0xc7 + 8 * instr.dst, p, pos); + emitByte(0xc7 + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { + void JitCompilerX86::h_FMUL_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; - instr.src %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; + const uint32_t src = instr.src % RegisterCountFlt; emit(REX_MULPD, p, pos); - emitByte(0xe0 + instr.src + 8 * instr.dst, p, pos); + emitByte(0xe0 + src + 8 * dst, p, pos); 
codePos = pos; } - void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { + void JitCompilerX86::h_FDIV_M(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; genAddressReg(instr, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_ANDPS_XMM12, p, pos); emit(REX_DIVPD, p, pos); - emitByte(0xe4 + 8 * instr.dst, p, pos); + emitByte(0xe4 + 8 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { + void JitCompilerX86::h_FSQRT_R(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; - instr.dst %= RegisterCountFlt; + const uint32_t dst = instr.dst % RegisterCountFlt; emit(SQRTPD, p, pos); - emitByte(0xe4 + 9 * instr.dst, p, pos); + emitByte(0xe4 + 9 * dst, p, pos); codePos = pos; } - void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { + void JitCompilerX86::h_CFROUND(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; @@ -891,12 +898,11 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { + void JitCompilerX86::h_CBRANCH(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; int reg = instr.dst; - int target = registerUsage[reg] + 1; emit(REX_ADD_I, p, pos); emitByte(0xc0 + reg, p, pos); int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; @@ -908,10 +914,10 @@ namespace randomx { emitByte(0xc0 + reg, p, pos); emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos); emit(JZ, p, pos); - emit32(instructionOffsets[target] - (pos + 4), p, pos); + emit32(registerUsage[reg] - (pos + 4), p, pos); //mark all registers as used uint64_t* r = (uint64_t*) registerUsage; - uint64_t k = i; + uint64_t k = pos; k |= k << 32; for (unsigned j = 0; j < RegistersCount / 2; ++j) { r[j] = k; @@ -920,7 +926,7 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::h_ISTORE(Instruction& instr, int i) 
{ + void JitCompilerX86::h_ISTORE(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; @@ -932,7 +938,7 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::h_NOP(Instruction& instr, int i) { + void JitCompilerX86::h_NOP(const Instruction& instr) { emit(NOP1, code, codePos); } diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 942fe8210..30b16f586 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -41,7 +41,7 @@ namespace randomx { class JitCompilerX86; class Instruction; - typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + typedef void(JitCompilerX86::*InstructionGeneratorX86)(const Instruction&); constexpr uint32_t CodeSize = 64 * 1024; @@ -66,16 +66,15 @@ namespace randomx { size_t getCodeSize(); static InstructionGeneratorX86 engine[256]; - int32_t instructionOffsets[512]; int registerUsage[RegistersCount]; uint8_t* code; int32_t codePos; void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramEpilogue(Program&, ProgramConfiguration&); - static void genAddressReg(Instruction&, uint8_t* code, int& codePos, bool rax = true); - static void genAddressRegDst(Instruction&, uint8_t* code, int& codePos); - static void genAddressImm(Instruction&, uint8_t* code, int& codePos); + static void genAddressReg(const Instruction&, uint8_t* code, int& codePos, bool rax = true); + static void genAddressRegDst(const Instruction&, uint8_t* code, int& codePos); + static void genAddressImm(const Instruction&, uint8_t* code, int& codePos); static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos); void generateSuperscalarCode(Instruction &, std::vector &); @@ -105,36 +104,36 @@ namespace randomx { codePos += count; } - void h_IADD_RS(Instruction&, int); - void h_IADD_M(Instruction&, int); - void h_ISUB_R(Instruction&, int); - void h_ISUB_M(Instruction&, int); - void 
h_IMUL_R(Instruction&, int); - void h_IMUL_M(Instruction&, int); - void h_IMULH_R(Instruction&, int); - void h_IMULH_M(Instruction&, int); - void h_ISMULH_R(Instruction&, int); - void h_ISMULH_M(Instruction&, int); - void h_IMUL_RCP(Instruction&, int); - void h_INEG_R(Instruction&, int); - void h_IXOR_R(Instruction&, int); - void h_IXOR_M(Instruction&, int); - void h_IROR_R(Instruction&, int); - void h_IROL_R(Instruction&, int); - void h_ISWAP_R(Instruction&, int); - void h_FSWAP_R(Instruction&, int); - void h_FADD_R(Instruction&, int); - void h_FADD_M(Instruction&, int); - void h_FSUB_R(Instruction&, int); - void h_FSUB_M(Instruction&, int); - void h_FSCAL_R(Instruction&, int); - void h_FMUL_R(Instruction&, int); - void h_FDIV_M(Instruction&, int); - void h_FSQRT_R(Instruction&, int); - void h_CBRANCH(Instruction&, int); - void h_CFROUND(Instruction&, int); - void h_ISTORE(Instruction&, int); - void h_NOP(Instruction&, int); + void h_IADD_RS(const Instruction&); + void h_IADD_M(const Instruction&); + void h_ISUB_R(const Instruction&); + void h_ISUB_M(const Instruction&); + void h_IMUL_R(const Instruction&); + void h_IMUL_M(const Instruction&); + void h_IMULH_R(const Instruction&); + void h_IMULH_M(const Instruction&); + void h_ISMULH_R(const Instruction&); + void h_ISMULH_M(const Instruction&); + void h_IMUL_RCP(const Instruction&); + void h_INEG_R(const Instruction&); + void h_IXOR_R(const Instruction&); + void h_IXOR_M(const Instruction&); + void h_IROR_R(const Instruction&); + void h_IROL_R(const Instruction&); + void h_ISWAP_R(const Instruction&); + void h_FSWAP_R(const Instruction&); + void h_FADD_R(const Instruction&); + void h_FADD_M(const Instruction&); + void h_FSUB_R(const Instruction&); + void h_FSUB_M(const Instruction&); + void h_FSCAL_R(const Instruction&); + void h_FMUL_R(const Instruction&); + void h_FDIV_M(const Instruction&); + void h_FSQRT_R(const Instruction&); + void h_CBRANCH(const Instruction&); + void h_CFROUND(const Instruction&); + void 
h_ISTORE(const Instruction&); + void h_NOP(const Instruction&); }; }