Tuned JIT compiler

0.3-0.4% speedup depending on CPU.
This commit is contained in:
SChernykh 2020-02-20 20:59:22 +01:00
parent 887c891ab2
commit 0caeb41bff
6 changed files with 61 additions and 15 deletions

View file

@ -234,7 +234,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);
constexpr int PREFETCH_DISTANCE = 4096;
constexpr int PREFETCH_DISTANCE = 7168;
const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
scratchpadEnd -= PREFETCH_DISTANCE;
@ -258,8 +258,25 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 4));
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 5));
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 6));
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 7));
fill_state0 = aesdec<softAes>(fill_state0, key0);
fill_state1 = aesenc<softAes>(fill_state1, key1);
fill_state2 = aesdec<softAes>(fill_state2, key2);
fill_state3 = aesenc<softAes>(fill_state3, key3);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 4, fill_state0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 5, fill_state1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 6, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 7, fill_state3);
rx_prefetch_t0(prefetchPtr + 64);
scratchpadPtr += 128;
prefetchPtr += 128;
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;

View file

@ -49,6 +49,7 @@ namespace randomx {
JitCompilerA64();
~JitCompilerA64();
void prepare() {}
void generateProgram(Program&, ProgramConfiguration&, uint32_t);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);

View file

@ -325,6 +325,13 @@ namespace randomx {
freePagedMemory(allocatedCode, CodeSize);
}
/// Issues rx_prefetch_nta requests covering the opcode-to-generator table
/// (`engine`) and the active configuration (`RandomX_CurrentConfig`), one
/// request per 64-byte step, presumably so both are cached before code
/// generation reads them.
void JitCompilerX86::prepare() {
	// Step between prefetch requests; both objects are declared alignas(64),
	// so every request starts on a 64-byte boundary.
	constexpr size_t PrefetchStride = 64;

	// size_t offsets avoid the signed/unsigned comparison against sizeof().
	const char* const engineBytes = reinterpret_cast<const char*>(&engine);
	for (size_t offset = 0; offset < sizeof(engine); offset += PrefetchStride)
		rx_prefetch_nta(engineBytes + offset);

	const char* const configBytes = reinterpret_cast<const char*>(&RandomX_CurrentConfig);
	for (size_t offset = 0; offset < sizeof(RandomX_CurrentConfig); offset += PrefetchStride)
		rx_prefetch_nta(configBytes + offset);
}
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
vm_flags = flags;
@ -419,11 +426,29 @@ namespace randomx {
r[j] = k;
}
for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; ++i) {
Instruction& instr = prog(i);
const uint8_t opcode = instr.opcode;
*((uint64_t*)&instr) &= (uint64_t(-1) - (0xFFFF << 8)) | ((RegistersCount - 1) << 8) | ((RegistersCount - 1) << 16);
(this->*(engine[opcode]))(instr);
constexpr uint64_t instr_mask = (uint64_t(-1) - (0xFFFF << 8)) | ((RegistersCount - 1) << 8) | ((RegistersCount - 1) << 16);
for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; i += 4) {
Instruction& instr1 = prog(i);
Instruction& instr2 = prog(i + 1);
Instruction& instr3 = prog(i + 2);
Instruction& instr4 = prog(i + 3);
InstructionGeneratorX86 gen1 = engine[instr1.opcode];
InstructionGeneratorX86 gen2 = engine[instr2.opcode];
InstructionGeneratorX86 gen3 = engine[instr3.opcode];
InstructionGeneratorX86 gen4 = engine[instr4.opcode];
*((uint64_t*)&instr1) &= instr_mask;
(this->*gen1)(instr1);
*((uint64_t*)&instr2) &= instr_mask;
(this->*gen2)(instr2);
*((uint64_t*)&instr3) &= instr_mask;
(this->*gen3)(instr3);
*((uint64_t*)&instr4) &= instr_mask;
(this->*gen4)(instr4);
}
emit(REX_MOV_RR, code, codePos);
@ -609,13 +634,14 @@ namespace randomx {
int pos = codePos;
uint8_t* const p = code + pos;
const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | instr.dst;
*(uint32_t*)(p) = template_IADD_RS[instr.dst] | (sib << 24);
const uint32_t dst = instr.dst;
const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | dst;
*(uint32_t*)(p) = template_IADD_RS[dst] | (sib << 24);
*(uint32_t*)(p + 4) = instr.getImm32();
pos += ((instr.dst == RegisterNeedsDisplacement) ? 8 : 4);
pos += ((dst == RegisterNeedsDisplacement) ? 8 : 4);
registerUsage[instr.dst] = pos;
registerUsage[dst] = pos;
codePos = pos;
}
@ -1152,6 +1178,6 @@ namespace randomx {
emit(NOP1, code, codePos);
}
InstructionGeneratorX86 JitCompilerX86::engine[256] = {};
alignas(64) InstructionGeneratorX86 JitCompilerX86::engine[256] = {};
}

View file

@ -49,6 +49,7 @@ namespace randomx {
public:
JitCompilerX86();
~JitCompilerX86();
void prepare();
void generateProgram(Program&, ProgramConfiguration&, uint32_t);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
template<size_t N>
@ -65,7 +66,7 @@ namespace randomx {
}
size_t getCodeSize();
static InstructionGeneratorX86 engine[256];
alignas(64) static InstructionGeneratorX86 engine[256];
int registerUsage[RegistersCount];
uint8_t* allocatedCode;
uint8_t* code;

View file

@ -291,7 +291,7 @@ RandomX_ConfigurationLoki RandomX_LokiConfig;
RandomX_ConfigurationArqma RandomX_ArqmaConfig;
RandomX_ConfigurationSafex RandomX_SafexConfig;
RandomX_ConfigurationBase RandomX_CurrentConfig;
alignas(64) RandomX_ConfigurationBase RandomX_CurrentConfig;
extern "C" {

View file

@ -41,6 +41,7 @@ namespace randomx {
template<bool softAes>
void CompiledVm<softAes>::run(void* seed) {
compiler.prepare();
VmBase<softAes>::generateProgram(seed);
randomx_vm::initialize();
compiler.generateProgram(program, config, randomx_vm::getFlags());