Merge pull request #1142 from SChernykh/dev

Updated and optimized RandomX
This commit is contained in:
xmrig 2019-08-28 03:29:53 +07:00 committed by GitHub
commit b953d8db05
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 584 additions and 314 deletions

View file

@ -263,19 +263,6 @@ int rxa2_validate_inputs(const argon2_context *context) {
return ARGON2_INCORRECT_PARAMETER; return ARGON2_INCORRECT_PARAMETER;
} }
if (NULL == context->out) {
return ARGON2_OUTPUT_PTR_NULL;
}
/* Validate output length */
if (ARGON2_MIN_OUTLEN > context->outlen) {
return ARGON2_OUTPUT_TOO_SHORT;
}
if (ARGON2_MAX_OUTLEN < context->outlen) {
return ARGON2_OUTPUT_TOO_LONG;
}
/* Validate password (required param) */ /* Validate password (required param) */
if (NULL == context->pwd) { if (NULL == context->pwd) {
if (0 != context->pwdlen) { if (0 != context->pwdlen) {

View file

@ -46,7 +46,7 @@ namespace randomx {
return data[dataIndex++]; return data[dataIndex++];
} }
uint32_t Blake2Generator::getInt32() { uint32_t Blake2Generator::getUInt32() {
checkData(4); checkData(4);
auto ret = load32(&data[dataIndex]); auto ret = load32(&data[dataIndex]);
dataIndex += 4; dataIndex += 4;

View file

@ -36,7 +36,7 @@ namespace randomx {
public: public:
Blake2Generator(const void* seed, size_t seedSize, int nonce = 0); Blake2Generator(const void* seed, size_t seedSize, int nonce = 0);
uint8_t getByte(); uint8_t getByte();
uint32_t getInt32(); uint32_t getUInt32();
private: private:
void checkData(const size_t); void checkData(const size_t);

View file

@ -244,7 +244,7 @@ namespace randomx {
if (opcode < RandomX_CurrentConfig.CEIL_IMUL_RCP) { if (opcode < RandomX_CurrentConfig.CEIL_IMUL_RCP) {
uint64_t divisor = instr.getImm32(); uint64_t divisor = instr.getImm32();
if (!isPowerOf2(divisor)) { if (!isZeroOrPowerOf2(divisor)) {
auto dst = instr.dst % RegistersCount; auto dst = instr.dst % RegistersCount;
ibc.type = InstructionType::IMUL_R; ibc.type = InstructionType::IMUL_R;
ibc.idst = &nreg->r[dst]; ibc.idst = &nreg->r[dst];

View file

@ -137,7 +137,7 @@ namespace randomx {
constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register
constexpr int RegisterNeedsSib = 4; //x86 r12 register constexpr int RegisterNeedsSib = 4; //x86 r12 register
inline bool isPowerOf2(uint64_t x) { inline bool isZeroOrPowerOf2(uint64_t x) {
return (x & (x - 1)) == 0; return (x & (x - 1)) == 0;
} }

View file

@ -77,13 +77,13 @@ namespace randomx {
void setImm32(uint32_t val) { void setImm32(uint32_t val) {
return store32(&imm32, val); return store32(&imm32, val);
} }
int getModMem() const { uint32_t getModMem() const {
return mod % 4; //bits 0-1 return mod & 3; //bits 0-1
} }
int getModShift() const { uint32_t getModShift() const {
return (mod >> 2) % 4; //bits 2-3 return (mod >> 2) & 3; //bits 2-3
} }
int getModCond() const { uint32_t getModCond() const {
return mod >> 4; //bits 4-7 return mod >> 4; //bits 4-7
} }
void setMod(uint8_t val) { void setMod(uint8_t val) {

View file

@ -376,11 +376,131 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
#define RANDOMX_DEFAULT_FENV #define RANDOMX_DEFAULT_FENV
void rx_reset_float_state(); #elif defined(__aarch64__)
void rx_set_rounding_mode(uint32_t mode); #include <stdlib.h>
#include <arm_neon.h>
#include <arm_acle.h>
#else //end altivec typedef uint8x16_t rx_vec_i128;
typedef float64x2_t rx_vec_f128;
#define rx_aligned_alloc(size, align) aligned_alloc(align, size)
#define rx_aligned_free(a) free(a)
inline void rx_prefetch_nta(void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
return vld1q_f64((const float64_t*)pd);
}
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
vst1q_f64((float64_t*)mem_addr, val);
}
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
float64x2_t temp;
temp = vcopyq_laneq_f64(temp, 1, a, 1);
a = vcopyq_laneq_f64(a, 1, a, 0);
return vcopyq_laneq_f64(a, 0, temp, 1);
}
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
uint64x2_t temp0 = vdupq_n_u64(x0);
uint64x2_t temp1 = vdupq_n_u64(x1);
return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return vreinterpretq_f64_u64(vdupq_n_u64(x));
}
#define rx_add_vec_f128 vaddq_f64
#define rx_sub_vec_f128 vsubq_f64
#define rx_mul_vec_f128 vmulq_f64
#define rx_div_vec_f128 vdivq_f64
#define rx_sqrt_vec_f128 vsqrtq_f64
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
}
#ifdef __ARM_FEATURE_CRYPTO
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
const uint8x16_t zero = { 0 };
return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
}
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
const uint8x16_t zero = { 0 };
return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
}
#define HAVE_AES
#endif
#define rx_xor_vec_i128 veorq_u8
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
}
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
int32_t data[4];
data[0] = _I0;
data[1] = _I1;
data[2] = _I2;
data[3] = _I3;
return vreinterpretq_u8_s32(vld1q_s32(data));
};
#define rx_xor_vec_i128 veorq_u8
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
return vld1q_u8((const uint8_t*)mem_addr);
}
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
vst1q_u8((uint8_t*)mem_addr, val);
}
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
rx_vec_f128 x;
x = vsetq_lane_f64(lo, x, 0);
x = vsetq_lane_f64(hi, x, 1);
return x;
}
#define RANDOMX_DEFAULT_FENV
#else //portable fallback
#include <cstdint> #include <cstdint>
#include <stdexcept> #include <stdexcept>
@ -487,7 +607,6 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return v; return v;
} }
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x; rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
@ -578,10 +697,6 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
#define RANDOMX_DEFAULT_FENV #define RANDOMX_DEFAULT_FENV
void rx_reset_float_state();
void rx_set_rounding_mode(uint32_t mode);
#endif #endif
#ifndef HAVE_AES #ifndef HAVE_AES
@ -598,6 +713,14 @@ FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
} }
#endif #endif
#ifdef RANDOMX_DEFAULT_FENV
void rx_reset_float_state();
void rx_set_rounding_mode(uint32_t mode);
#endif
double loadDoublePortable(const void* addr); double loadDoublePortable(const void* addr);
uint64_t mulh(uint64_t, uint64_t); uint64_t mulh(uint64_t, uint64_t);
int64_t smulh(int64_t, int64_t); int64_t smulh(int64_t, int64_t);

View file

@ -181,7 +181,7 @@ namespace randomx {
static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
static const uint8_t JZ[] = { 0x0f, 0x84 }; static const uint8_t JZ[] = { 0x0f, 0x84 };
static const uint8_t RET = 0xc3; static const uint8_t RET = 0xc3;
static const uint8_t LEA_32[] = { 0x67, 0x41, 0x8d }; static const uint8_t LEA_32[] = { 0x41, 0x8d };
static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 }; static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 }; static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 };
@ -197,7 +197,7 @@ namespace randomx {
// static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; // static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
size_t JitCompilerX86::getCodeSize() { size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize; return codePos < prologueSize ? 0 : codePos - prologueSize;
} }
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
@ -219,12 +219,12 @@ namespace randomx {
void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
generateProgramPrologue(prog, pcfg); generateProgramPrologue(prog, pcfg);
emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize); emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize, code, codePos);
emit(ADD_EBX_I); emit(ADD_EBX_I, code, codePos);
emit32(datasetOffset / CacheLineSize); emit32(datasetOffset / CacheLineSize, code, codePos);
emitByte(CALL); emitByte(CALL, code, codePos);
emit32(superScalarHashOffset - (codePos + 4)); emit32(superScalarHashOffset - (codePos + 4), code, codePos);
emit(codeReadDatasetLightSshFin, readDatasetLightFinSize); emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos);
generateProgramEpilogue(prog); generateProgramEpilogue(prog);
} }
@ -238,23 +238,23 @@ namespace randomx {
Instruction& instr = prog(i); Instruction& instr = prog(i);
generateSuperscalarCode(instr, reciprocalCache); generateSuperscalarCode(instr, reciprocalCache);
} }
emit(codeShhLoad, codeSshLoadSize); emit(codeShhLoad, codeSshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) { if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
emit(REX_MOV_RR64); emit(REX_MOV_RR64, code, codePos);
emitByte(0xd8 + prog.getAddressRegister()); emitByte(0xd8 + prog.getAddressRegister(), code, codePos);
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize); emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
#ifdef RANDOMX_ALIGN #ifdef RANDOMX_ALIGN
int align = (codePos % 16); int align = (codePos % 16);
while (align != 0) { while (align != 0) {
int nopSize = 16 - align; int nopSize = 16 - align;
if (nopSize > 8) nopSize = 8; if (nopSize > 8) nopSize = 8;
emit(NOPX[nopSize - 1], nopSize); emit(NOPX[nopSize - 1], nopSize, code, codePos);
align = (codePos % 16); align = (codePos % 16);
} }
#endif #endif
} }
} }
emitByte(RET); emitByte(RET, code, codePos);
} }
template template
@ -265,508 +265,664 @@ namespace randomx {
} }
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
instructionOffsets.clear(); memset(registerUsage, -1, sizeof(registerUsage));
for (unsigned i = 0; i < 8; ++i) {
registerUsage[i] = -1;
}
codePos = prologueSize; codePos = prologueSize;
memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
emit(REX_XOR_RAX_R64); emit(REX_XOR_RAX_R64, code, codePos);
emitByte(0xc0 + pcfg.readReg0); emitByte(0xc0 + pcfg.readReg0, code, codePos);
emit(REX_XOR_RAX_R64); emit(REX_XOR_RAX_R64, code, codePos);
emitByte(0xc0 + pcfg.readReg1); emitByte(0xc0 + pcfg.readReg1, code, codePos);
memcpy(code + codePos, RandomX_CurrentConfig.codeLoopLoadTweaked, loopLoadSize); memcpy(code + codePos, RandomX_CurrentConfig.codeLoopLoadTweaked, loopLoadSize);
codePos += loopLoadSize; codePos += loopLoadSize;
for (unsigned i = 0; i < prog.getSize(); ++i) { for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i); Instruction& instr = prog(i);
instr.src %= RegistersCount; instr.src %= RegistersCount;
instr.dst %= RegistersCount; instr.dst %= RegistersCount;
generateCode(instr, i); instructionOffsets[i] = codePos;
(this->*(engine[instr.opcode]))(instr, i);
} }
emit(REX_MOV_RR); emit(REX_MOV_RR, code, codePos);
emitByte(0xc0 + pcfg.readReg2); emitByte(0xc0 + pcfg.readReg2, code, codePos);
emit(REX_XOR_EAX); emit(REX_XOR_EAX, code, codePos);
emitByte(0xc0 + pcfg.readReg3); emitByte(0xc0 + pcfg.readReg3, code, codePos);
} }
void JitCompilerX86::generateProgramEpilogue(Program& prog) { void JitCompilerX86::generateProgramEpilogue(Program& prog) {
memcpy(code + codePos, codeLoopStore, loopStoreSize); memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize; codePos += loopStoreSize;
emit(SUB_EBX); emit(SUB_EBX, code, codePos);
emit(JNZ); emit(JNZ, code, codePos);
emit32(prologueSize - codePos - 4); emit32(prologueSize - codePos - 4, code, codePos);
emitByte(JMP); emitByte(JMP, code, codePos);
emit32(epilogueOffset - codePos - 4); emit32(epilogueOffset - codePos - 4, code, codePos);
}
void JitCompilerX86::generateCode(Instruction& instr, int i) {
instructionOffsets.push_back(codePos);
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
} }
void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) { void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) {
switch ((SuperscalarInstructionType)instr.opcode) switch ((SuperscalarInstructionType)instr.opcode)
{ {
case randomx::SuperscalarInstructionType::ISUB_R: case randomx::SuperscalarInstructionType::ISUB_R:
emit(REX_SUB_RR); emit(REX_SUB_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IXOR_R: case randomx::SuperscalarInstructionType::IXOR_R:
emit(REX_XOR_RR); emit(REX_XOR_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IADD_RS: case randomx::SuperscalarInstructionType::IADD_RS:
emit(REX_LEA); emit(REX_LEA, code, codePos);
emitByte(0x04 + 8 * instr.dst); emitByte(0x04 + 8 * instr.dst, code, codePos);
genSIB(instr.getModShift(), instr.src, instr.dst); genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IMUL_R: case randomx::SuperscalarInstructionType::IMUL_R:
emit(REX_IMUL_RR); emit(REX_IMUL_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IROR_C: case randomx::SuperscalarInstructionType::IROR_C:
emit(REX_ROT_I8); emit(REX_ROT_I8, code, codePos);
emitByte(0xc8 + instr.dst); emitByte(0xc8 + instr.dst, code, codePos);
emitByte(instr.getImm32() & 63); emitByte(instr.getImm32() & 63, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IADD_C7: case randomx::SuperscalarInstructionType::IADD_C7:
emit(REX_81); emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IXOR_C7: case randomx::SuperscalarInstructionType::IXOR_C7:
emit(REX_XOR_RI); emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst); emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IADD_C8: case randomx::SuperscalarInstructionType::IADD_C8:
emit(REX_81); emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN #ifdef RANDOMX_ALIGN
emit(NOP1); emit(NOP1, code, codePos);
#endif #endif
break; break;
case randomx::SuperscalarInstructionType::IXOR_C8: case randomx::SuperscalarInstructionType::IXOR_C8:
emit(REX_XOR_RI); emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst); emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN #ifdef RANDOMX_ALIGN
emit(NOP1); emit(NOP1, code, codePos);
#endif #endif
break; break;
case randomx::SuperscalarInstructionType::IADD_C9: case randomx::SuperscalarInstructionType::IADD_C9:
emit(REX_81); emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN #ifdef RANDOMX_ALIGN
emit(NOP2); emit(NOP2, code, codePos);
#endif #endif
break; break;
case randomx::SuperscalarInstructionType::IXOR_C9: case randomx::SuperscalarInstructionType::IXOR_C9:
emit(REX_XOR_RI); emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst); emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN #ifdef RANDOMX_ALIGN
emit(NOP2); emit(NOP2, code, codePos);
#endif #endif
break; break;
case randomx::SuperscalarInstructionType::IMULH_R: case randomx::SuperscalarInstructionType::IMULH_R:
emit(REX_MOV_RR64); emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R); emit(REX_MUL_R, code, codePos);
emitByte(0xe0 + instr.src); emitByte(0xe0 + instr.src, code, codePos);
emit(REX_MOV_R64R); emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::ISMULH_R: case randomx::SuperscalarInstructionType::ISMULH_R:
emit(REX_MOV_RR64); emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R); emit(REX_MUL_R, code, codePos);
emitByte(0xe8 + instr.src); emitByte(0xe8 + instr.src, code, codePos);
emit(REX_MOV_R64R); emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, code, codePos);
break; break;
case randomx::SuperscalarInstructionType::IMUL_RCP: case randomx::SuperscalarInstructionType::IMUL_RCP:
emit(MOV_RAX_I); emit(MOV_RAX_I, code, codePos);
emit64(reciprocalCache[instr.getImm32()]); emit64(reciprocalCache[instr.getImm32()], code, codePos);
emit(REX_IMUL_RM); emit(REX_IMUL_RM, code, codePos);
emitByte(0xc0 + 8 * instr.dst); emitByte(0xc0 + 8 * instr.dst, code, codePos);
break; break;
default: default:
UNREACHABLE; UNREACHABLE;
} }
} }
void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { void JitCompilerX86::genAddressReg(Instruction& instr, uint8_t* code, int& codePos, bool rax) {
emit(LEA_32); emit(LEA_32, code, codePos);
emitByte(0x80 + instr.src + (rax ? 0 : 8)); emitByte(0x80 + instr.src + (rax ? 0 : 8), code, codePos);
if (instr.src == RegisterNeedsSib) { if (instr.src == RegisterNeedsSib) {
emitByte(0x24); emitByte(0x24, code, codePos);
} }
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
if (rax) if (rax)
emitByte(AND_EAX_I); emitByte(AND_EAX_I, code, codePos);
else else
emit(AND_ECX_I); emit(AND_ECX_I, code, codePos);
emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
} }
void JitCompilerX86::genAddressRegDst(Instruction& instr) { void JitCompilerX86::genAddressRegDst(Instruction& instr, uint8_t* code, int& codePos) {
emit(LEA_32); emit(LEA_32, code, codePos);
emitByte(0x80 + instr.dst); emitByte(0x80 + instr.dst, code, codePos);
if (instr.dst == RegisterNeedsSib) { if (instr.dst == RegisterNeedsSib) {
emitByte(0x24); emitByte(0x24, code, codePos);
} }
emit32(instr.getImm32()); emit32(instr.getImm32(), code, codePos);
emitByte(AND_EAX_I); emitByte(AND_EAX_I, code, codePos);
if (instr.getModCond() < StoreL3Condition) { if (instr.getModCond() < StoreL3Condition) {
emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
} }
else { else {
emit32(ScratchpadL3Mask); emit32(ScratchpadL3Mask, code, codePos);
} }
} }
void JitCompilerX86::genAddressImm(Instruction& instr) { void JitCompilerX86::genAddressImm(Instruction& instr, uint8_t* code, int& codePos) {
emit32(instr.getImm32() & ScratchpadL3Mask); emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
} }
static const uint32_t template_IADD_RS[8] = {
0x048d4f,
0x0c8d4f,
0x148d4f,
0x1c8d4f,
0x248d4f,
0xac8d4f,
0x348d4f,
0x3c8d4f,
};
void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) {
int pos = codePos;
uint8_t* const p = code + pos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
emit(REX_LEA);
if (instr.dst == RegisterNeedsDisplacement) const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | instr.dst;
emitByte(0xac); *(uint32_t*)(p) = template_IADD_RS[instr.dst] | (sib << 24);
else *(uint32_t*)(p + 4) = instr.getImm32();
emitByte(0x04 + 8 * instr.dst);
genSIB(instr.getModShift(), instr.src, instr.dst); codePos = pos + ((instr.dst == RegisterNeedsDisplacement) ? 8 : 4);
if (instr.dst == RegisterNeedsDisplacement)
emit32(instr.getImm32());
} }
static const uint32_t template_IADD_M[8] = {
0x0604034c,
0x060c034c,
0x0614034c,
0x061c034c,
0x0624034c,
0x062c034c,
0x0634034c,
0x063c034c,
};
void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { void JitCompilerX86::h_IADD_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_ADD_RM); emit32(template_IADD_M[instr.dst], p, pos);
emitByte(0x04 + 8 * instr.dst);
emitByte(0x06);
} }
else { else {
emit(REX_ADD_RM); emit(REX_ADD_RM, p, pos);
emitByte(0x86 + 8 * instr.dst); emitByte(0x86 + 8 * instr.dst, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::genSIB(int scale, int index, int base) { void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, int& codePos) {
emitByte((scale << 6) | (index << 3) | base); emitByte((scale << 6) | (index << 3) | base, code, codePos);
} }
void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
emit(REX_SUB_RR); emit(REX_SUB_RR, p, pos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
} }
else { else {
emit(REX_81); emit(REX_81, p, pos);
emitByte(0xe8 + instr.dst); emitByte(0xe8 + instr.dst, p, pos);
emit32(instr.getImm32()); emit32(instr.getImm32(), p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_SUB_RM); emit(REX_SUB_RM, p, pos);
emitByte(0x04 + 8 * instr.dst); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06); emitByte(0x06, p, pos);
} }
else { else {
emit(REX_SUB_RM); emit(REX_SUB_RM, p, pos);
emitByte(0x86 + 8 * instr.dst); emitByte(0x86 + 8 * instr.dst, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
emit(REX_IMUL_RR); emit(REX_IMUL_RR, p, pos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
} }
else { else {
emit(REX_IMUL_RRI); emit(REX_IMUL_RRI, p, pos);
emitByte(0xc0 + 9 * instr.dst); emitByte(0xc0 + 9 * instr.dst, p, pos);
emit32(instr.getImm32()); emit32(instr.getImm32(), p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_IMUL_RM); emit(REX_IMUL_RM, p, pos);
emitByte(0x04 + 8 * instr.dst); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06); emitByte(0x06, p, pos);
} }
else { else {
emit(REX_IMUL_RM); emit(REX_IMUL_RM, p, pos);
emitByte(0x86 + 8 * instr.dst); emitByte(0x86 + 8 * instr.dst, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_R); emit(REX_MUL_R, p, pos);
emitByte(0xe0 + instr.src); emitByte(0xe0 + instr.src, p, pos);
emit(REX_MOV_R64R); emit(REX_MOV_R64R, p, pos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, false); genAddressReg(instr, p, pos, false);
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_MEM); emit(REX_MUL_MEM, p, pos);
} }
else { else {
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_M); emit(REX_MUL_M, p, pos);
emitByte(0xa6); emitByte(0xa6, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
emit(REX_MOV_R64R); emit(REX_MOV_R64R, p, pos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_R); emit(REX_MUL_R, p, pos);
emitByte(0xe8 + instr.src); emitByte(0xe8 + instr.src, p, pos);
emit(REX_MOV_R64R); emit(REX_MOV_R64R, p, pos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, false); genAddressReg(instr, p, pos, false);
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_IMUL_MEM); emit(REX_IMUL_MEM, p, pos);
} }
else { else {
emit(REX_MOV_RR64); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_M); emit(REX_MUL_M, p, pos);
emitByte(0xae); emitByte(0xae, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
emit(REX_MOV_R64R); emit(REX_MOV_R64R, p, pos);
emitByte(0xc2 + 8 * instr.dst); emitByte(0xc2 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
uint64_t divisor = instr.getImm32(); uint64_t divisor = instr.getImm32();
if (!isPowerOf2(divisor)) { if (!isZeroOrPowerOf2(divisor)) {
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
emit(MOV_RAX_I); emit(MOV_RAX_I, p, pos);
emit64(randomx_reciprocal_fast(divisor)); emit64(randomx_reciprocal_fast(divisor), p, pos);
emit(REX_IMUL_RM); emit(REX_IMUL_RM, p, pos);
emitByte(0xc0 + 8 * instr.dst); emitByte(0xc0 + 8 * instr.dst, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { void JitCompilerX86::h_INEG_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
emit(REX_NEG); emit(REX_NEG, p, pos);
emitByte(0xd8 + instr.dst); emitByte(0xd8 + instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
emit(REX_XOR_RR); emit(REX_XOR_RR, p, pos);
emitByte(0xc0 + 8 * instr.dst + instr.src); emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
} }
else { else {
emit(REX_XOR_RI); emit(REX_XOR_RI, p, pos);
emitByte(0xf0 + instr.dst); emitByte(0xf0 + instr.dst, p, pos);
emit32(instr.getImm32()); emit32(instr.getImm32(), p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_XOR_RM); emit(REX_XOR_RM, p, pos);
emitByte(0x04 + 8 * instr.dst); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06); emitByte(0x06, p, pos);
} }
else { else {
emit(REX_XOR_RM); emit(REX_XOR_RM, p, pos);
emitByte(0x86 + 8 * instr.dst); emitByte(0x86 + 8 * instr.dst, p, pos);
genAddressImm(instr); genAddressImm(instr, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { void JitCompilerX86::h_IROR_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
emit(REX_MOV_RR); emit(REX_MOV_RR, p, pos);
emitByte(0xc8 + instr.src); emitByte(0xc8 + instr.src, p, pos);
emit(REX_ROT_CL); emit(REX_ROT_CL, p, pos);
emitByte(0xc8 + instr.dst); emitByte(0xc8 + instr.dst, p, pos);
} }
else { else {
emit(REX_ROT_I8); emit(REX_ROT_I8, p, pos);
emitByte(0xc8 + instr.dst); emitByte(0xc8 + instr.dst, p, pos);
emitByte(instr.getImm32() & 63); emitByte(instr.getImm32() & 63, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { void JitCompilerX86::h_IROL_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
emit(REX_MOV_RR); emit(REX_MOV_RR, p, pos);
emitByte(0xc8 + instr.src); emitByte(0xc8 + instr.src, p, pos);
emit(REX_ROT_CL); emit(REX_ROT_CL, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
} }
else { else {
emit(REX_ROT_I8); emit(REX_ROT_I8, p, pos);
emitByte(0xc0 + instr.dst); emitByte(0xc0 + instr.dst, p, pos);
emitByte(instr.getImm32() & 63); emitByte(instr.getImm32() & 63, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
registerUsage[instr.dst] = i; registerUsage[instr.dst] = i;
registerUsage[instr.src] = i; registerUsage[instr.src] = i;
emit(REX_XCHG); emit(REX_XCHG, p, pos);
emitByte(0xc0 + instr.src + 8 * instr.dst); emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos);
} }
codePos = pos;
} }
void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) {
emit(SHUFPD); uint8_t* const p = code;
emitByte(0xc0 + 9 * instr.dst); int pos = codePos;
emitByte(1);
emit(SHUFPD, p, pos);
emitByte(0xc0 + 9 * instr.dst, p, pos);
emitByte(1, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { void JitCompilerX86::h_FADD_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt; instr.src %= RegisterCountFlt;
emit(REX_ADDPD); emit(REX_ADDPD, p, pos);
emitByte(0xc0 + instr.src + 8 * instr.dst); emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { void JitCompilerX86::h_FADD_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ADDPD); emit(REX_ADDPD, p, pos);
emitByte(0xc4 + 8 * instr.dst); emitByte(0xc4 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt; instr.src %= RegisterCountFlt;
emit(REX_SUBPD); emit(REX_SUBPD, p, pos);
emitByte(0xc0 + instr.src + 8 * instr.dst); emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_SUBPD); emit(REX_SUBPD, p, pos);
emitByte(0xc4 + 8 * instr.dst); emitByte(0xc4 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
emit(REX_XORPS); emit(REX_XORPS, p, pos);
emitByte(0xc7 + 8 * instr.dst); emitByte(0xc7 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
instr.src %= RegisterCountFlt; instr.src %= RegisterCountFlt;
emit(REX_MULPD); emit(REX_MULPD, p, pos);
emitByte(0xe0 + instr.src + 8 * instr.dst); emitByte(0xe0 + instr.src + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
genAddressReg(instr); genAddressReg(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ANDPS_XMM12); emit(REX_ANDPS_XMM12, p, pos);
emit(REX_DIVPD); emit(REX_DIVPD, p, pos);
emitByte(0xe4 + 8 * instr.dst); emitByte(0xe4 + 8 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
instr.dst %= RegisterCountFlt; instr.dst %= RegisterCountFlt;
emit(SQRTPD); emit(SQRTPD, p, pos);
emitByte(0xe4 + 9 * instr.dst); emitByte(0xe4 + 9 * instr.dst, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { void JitCompilerX86::h_CFROUND(Instruction& instr, int i) {
emit(REX_MOV_RR64); uint8_t* const p = code;
emitByte(0xc0 + instr.src); int pos = codePos;
emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.src, p, pos);
int rotate = (13 - (instr.getImm32() & 63)) & 63; int rotate = (13 - (instr.getImm32() & 63)) & 63;
if (rotate != 0) { if (rotate != 0) {
emit(ROL_RAX); emit(ROL_RAX, p, pos);
emitByte(rotate); emitByte(rotate, p, pos);
} }
emit(AND_OR_MOV_LDMXCSR); emit(AND_OR_MOV_LDMXCSR, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) {
uint8_t* const p = code;
int pos = codePos;
int reg = instr.dst; int reg = instr.dst;
int target = registerUsage[reg] + 1; int target = registerUsage[reg] + 1;
emit(REX_ADD_I); emit(REX_ADD_I, p, pos);
emitByte(0xc0 + reg); emitByte(0xc0 + reg, p, pos);
int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset;
uint32_t imm = instr.getImm32() | (1UL << shift); uint32_t imm = instr.getImm32() | (1UL << shift);
if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0) if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0)
imm &= ~(1UL << (shift - 1)); imm &= ~(1UL << (shift - 1));
emit32(imm); emit32(imm, p, pos);
emit(REX_TEST); emit(REX_TEST, p, pos);
emitByte(0xc0 + reg); emitByte(0xc0 + reg, p, pos);
emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift); emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos);
emit(JZ); emit(JZ, p, pos);
emit32(instructionOffsets[target] - (codePos + 4)); emit32(instructionOffsets[target] - (pos + 4), p, pos);
//mark all registers as used //mark all registers as used
for (unsigned j = 0; j < RegistersCount; ++j) { uint64_t* r = (uint64_t*) registerUsage;
registerUsage[j] = i; uint64_t k = i;
k |= k << 32;
for (unsigned j = 0; j < RegistersCount / 2; ++j) {
r[j] = k;
} }
codePos = pos;
} }
void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { void JitCompilerX86::h_ISTORE(Instruction& instr, int i) {
genAddressRegDst(instr); uint8_t* const p = code;
emit(REX_MOV_MR); int pos = codePos;
emitByte(0x04 + 8 * instr.src);
emitByte(0x06); genAddressRegDst(instr, p, pos);
emit(REX_MOV_MR, p, pos);
emitByte(0x04 + 8 * instr.src, p, pos);
emitByte(0x06, p, pos);
codePos = pos;
} }
void JitCompilerX86::h_NOP(Instruction& instr, int i) { void JitCompilerX86::h_NOP(Instruction& instr, int i) {
emit(NOP1); emit(NOP1, code, codePos);
} }
InstructionGeneratorX86 JitCompilerX86::engine[256] = {}; InstructionGeneratorX86 JitCompilerX86::engine[256] = {};

View file

@ -66,42 +66,41 @@ namespace randomx {
size_t getCodeSize(); size_t getCodeSize();
static InstructionGeneratorX86 engine[256]; static InstructionGeneratorX86 engine[256];
std::vector<int32_t> instructionOffsets; int32_t instructionOffsets[512];
int registerUsage[RegistersCount]; int registerUsage[RegistersCount];
uint8_t* code; uint8_t* code;
int32_t codePos; int32_t codePos;
void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramPrologue(Program&, ProgramConfiguration&);
void generateProgramEpilogue(Program&); void generateProgramEpilogue(Program&);
void genAddressReg(Instruction&, bool); static void genAddressReg(Instruction&, uint8_t* code, int& codePos, bool rax = true);
void genAddressRegDst(Instruction&); static void genAddressRegDst(Instruction&, uint8_t* code, int& codePos);
void genAddressImm(Instruction&); static void genAddressImm(Instruction&, uint8_t* code, int& codePos);
void genSIB(int scale, int index, int base); static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos);
void generateCode(Instruction&, int);
void generateSuperscalarCode(Instruction &, std::vector<uint64_t> &); void generateSuperscalarCode(Instruction &, std::vector<uint64_t> &);
void emitByte(uint8_t val) { static void emitByte(uint8_t val, uint8_t* code, int& codePos) {
code[codePos] = val; code[codePos] = val;
codePos++; ++codePos;
} }
void emit32(uint32_t val) { static void emit32(uint32_t val, uint8_t* code, int& codePos) {
memcpy(code + codePos, &val, sizeof val); memcpy(code + codePos, &val, sizeof val);
codePos += sizeof val; codePos += sizeof val;
} }
void emit64(uint64_t val) { static void emit64(uint64_t val, uint8_t* code, int& codePos) {
memcpy(code + codePos, &val, sizeof val); memcpy(code + codePos, &val, sizeof val);
codePos += sizeof val; codePos += sizeof val;
} }
template<size_t N> template<size_t N>
void emit(const uint8_t (&src)[N]) { static void emit(const uint8_t (&src)[N], uint8_t* code, int& codePos) {
emit(src, N); emit(src, N, code, codePos);
} }
void emit(const uint8_t* src, size_t count) { static void emit(const uint8_t* src, size_t count, uint8_t* code, int& codePos) {
memcpy(code + codePos, src, count); memcpy(code + codePos, src, count);
codePos += count; codePos += count;
} }

View file

@ -44,12 +44,14 @@ RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
ScratchpadL2_Size = 131072; ScratchpadL2_Size = 131072;
ScratchpadL3_Size = 1048576; ScratchpadL3_Size = 1048576;
RANDOMX_FREQ_IADD_RS = 25;
RANDOMX_FREQ_IROR_R = 10; RANDOMX_FREQ_IROR_R = 10;
RANDOMX_FREQ_IROL_R = 0; RANDOMX_FREQ_IROL_R = 0;
RANDOMX_FREQ_FSWAP_R = 8; RANDOMX_FREQ_FSWAP_R = 8;
RANDOMX_FREQ_FADD_R = 20; RANDOMX_FREQ_FADD_R = 20;
RANDOMX_FREQ_FSUB_R = 20; RANDOMX_FREQ_FSUB_R = 20;
RANDOMX_FREQ_FMUL_R = 20; RANDOMX_FREQ_FMUL_R = 20;
RANDOMX_FREQ_CBRANCH = 16;
fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d); fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d);
fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450); fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450);
@ -68,6 +70,9 @@ RandomX_ConfigurationLoki::RandomX_ConfigurationLoki()
ArgonSalt = "RandomXL\x12"; ArgonSalt = "RandomXL\x12";
ProgramSize = 320; ProgramSize = 320;
ProgramCount = 7; ProgramCount = 7;
RANDOMX_FREQ_IADD_RS = 25;
RANDOMX_FREQ_CBRANCH = 16;
} }
RandomX_ConfigurationBase::RandomX_ConfigurationBase() RandomX_ConfigurationBase::RandomX_ConfigurationBase()
@ -87,7 +92,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
, ProgramCount(8) , ProgramCount(8)
, JumpBits(8) , JumpBits(8)
, JumpOffset(8) , JumpOffset(8)
, RANDOMX_FREQ_IADD_RS(25) , RANDOMX_FREQ_IADD_RS(16)
, RANDOMX_FREQ_IADD_M(7) , RANDOMX_FREQ_IADD_M(7)
, RANDOMX_FREQ_ISUB_R(16) , RANDOMX_FREQ_ISUB_R(16)
, RANDOMX_FREQ_ISUB_M(7) , RANDOMX_FREQ_ISUB_M(7)
@ -113,7 +118,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
, RANDOMX_FREQ_FMUL_R(32) , RANDOMX_FREQ_FMUL_R(32)
, RANDOMX_FREQ_FDIV_M(4) , RANDOMX_FREQ_FDIV_M(4)
, RANDOMX_FREQ_FSQRT_R(6) , RANDOMX_FREQ_FSQRT_R(6)
, RANDOMX_FREQ_CBRANCH(16) , RANDOMX_FREQ_CBRANCH(25)
, RANDOMX_FREQ_CFROUND(1) , RANDOMX_FREQ_CFROUND(1)
, RANDOMX_FREQ_ISTORE(16) , RANDOMX_FREQ_ISTORE(16)
, RANDOMX_FREQ_NOP(0) , RANDOMX_FREQ_NOP(0)

View file

@ -329,7 +329,7 @@ namespace randomx {
return false; return false;
if (availableRegisters.size() > 1) { if (availableRegisters.size() > 1) {
index = gen.getInt32() % availableRegisters.size(); index = gen.getUInt32() % availableRegisters.size();
} }
else { else {
index = 0; index = 0;
@ -442,7 +442,7 @@ namespace randomx {
case SuperscalarInstructionType::IADD_C8: case SuperscalarInstructionType::IADD_C8:
case SuperscalarInstructionType::IADD_C9: { case SuperscalarInstructionType::IADD_C9: {
mod_ = 0; mod_ = 0;
imm32_ = gen.getInt32(); imm32_ = gen.getUInt32();
opGroup_ = SuperscalarInstructionType::IADD_C7; opGroup_ = SuperscalarInstructionType::IADD_C7;
opGroupPar_ = -1; opGroupPar_ = -1;
} break; } break;
@ -451,7 +451,7 @@ namespace randomx {
case SuperscalarInstructionType::IXOR_C8: case SuperscalarInstructionType::IXOR_C8:
case SuperscalarInstructionType::IXOR_C9: { case SuperscalarInstructionType::IXOR_C9: {
mod_ = 0; mod_ = 0;
imm32_ = gen.getInt32(); imm32_ = gen.getUInt32();
opGroup_ = SuperscalarInstructionType::IXOR_C7; opGroup_ = SuperscalarInstructionType::IXOR_C7;
opGroupPar_ = -1; opGroupPar_ = -1;
} break; } break;
@ -461,7 +461,7 @@ namespace randomx {
mod_ = 0; mod_ = 0;
imm32_ = 0; imm32_ = 0;
opGroup_ = SuperscalarInstructionType::IMULH_R; opGroup_ = SuperscalarInstructionType::IMULH_R;
opGroupPar_ = gen.getInt32(); opGroupPar_ = gen.getUInt32();
} break; } break;
case SuperscalarInstructionType::ISMULH_R: { case SuperscalarInstructionType::ISMULH_R: {
@ -469,14 +469,14 @@ namespace randomx {
mod_ = 0; mod_ = 0;
imm32_ = 0; imm32_ = 0;
opGroup_ = SuperscalarInstructionType::ISMULH_R; opGroup_ = SuperscalarInstructionType::ISMULH_R;
opGroupPar_ = gen.getInt32(); opGroupPar_ = gen.getUInt32();
} break; } break;
case SuperscalarInstructionType::IMUL_RCP: { case SuperscalarInstructionType::IMUL_RCP: {
mod_ = 0; mod_ = 0;
do { do {
imm32_ = gen.getInt32(); imm32_ = gen.getUInt32();
} while ((imm32_ & (imm32_ - 1)) == 0); } while (isZeroOrPowerOf2(imm32_));
opGroup_ = SuperscalarInstructionType::IMUL_RCP; opGroup_ = SuperscalarInstructionType::IMUL_RCP;
opGroupPar_ = -1; opGroupPar_ = -1;
} break; } break;