RandomX fixes

Intel JCC erratum fix and various other improvements, see more here: https://www.phoronix.com/scan.php?page=article&item=intel-jcc-microcode&num=1
This commit is contained in:
SChernykh 2019-12-01 08:46:35 +01:00
parent 8791261220
commit 84d7eb05f3
12 changed files with 320 additions and 40 deletions

View file

@ -33,6 +33,7 @@
#include "base/io/Console.h" #include "base/io/Console.h"
#include "base/io/log/Log.h" #include "base/io/log/Log.h"
#include "base/kernel/Signals.h" #include "base/kernel/Signals.h"
#include "base/kernel/Platform.h"
#include "core/config/Config.h" #include "core/config/Config.h"
#include "core/Controller.h" #include "core/Controller.h"
#include "core/Miner.h" #include "core/Miner.h"
@ -89,6 +90,8 @@ int xmrig::App::exec()
m_controller->start(); m_controller->start();
Platform::setThreadPriority(5);
rc = uv_run(uv_default_loop(), UV_RUN_DEFAULT); rc = uv_run(uv_default_loop(), UV_RUN_DEFAULT);
uv_loop_close(uv_default_loop()); uv_loop_close(uv_default_loop());

View file

@ -109,6 +109,11 @@ void xmrig::Workers<T>::start(const std::vector<T> &data)
for (Thread<T> *worker : m_workers) { for (Thread<T> *worker : m_workers) {
worker->start(Workers<T>::onReady); worker->start(Workers<T>::onReady);
// This sleep is important for optimal caching!
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
// Sub-optimal caching can result in up to 0.5% hashrate penalty
std::this_thread::sleep_for(std::chrono::milliseconds(20));
} }
} }

View file

@ -185,8 +185,20 @@ void xmrig::CpuWorker<N>::start()
consumeJob(); consumeJob();
} }
uint64_t storeStatsMask = 7;
# ifdef XMRIG_ALGO_RANDOMX
bool first = true;
uint64_t tempHash[8] = {};
// RandomX is faster, we don't need to store stats so often
if (m_job.currentJob().algorithm().family() == Algorithm::RANDOM_X) {
storeStatsMask = 63;
}
# endif
while (!Nonce::isOutdated(Nonce::CPU, m_job.sequence())) { while (!Nonce::isOutdated(Nonce::CPU, m_job.sequence())) {
if ((m_count & 0x7) == 0) { if ((m_count & storeStatsMask) == 0) {
storeStats(); storeStats();
} }
@ -196,26 +208,34 @@ void xmrig::CpuWorker<N>::start()
break; break;
} }
uint32_t current_job_nonces[N];
for (size_t i = 0; i < N; ++i) {
current_job_nonces[i] = *m_job.nonce(i);
}
# ifdef XMRIG_ALGO_RANDOMX # ifdef XMRIG_ALGO_RANDOMX
if (job.algorithm().family() == Algorithm::RANDOM_X) { if (job.algorithm().family() == Algorithm::RANDOM_X) {
randomx_calculate_hash(m_vm->get(), m_job.blob(), job.size(), m_hash); if (first) {
first = false;
randomx_calculate_hash_first(m_vm->get(), tempHash, m_job.blob(), job.size());
}
m_job.nextRound(kReserveCount, 1);
randomx_calculate_hash_next(m_vm->get(), tempHash, m_job.blob(), job.size(), m_hash);
} }
else else
# endif # endif
{ {
fn(job.algorithm())(m_job.blob(), job.size(), m_hash, m_ctx, job.height()); fn(job.algorithm())(m_job.blob(), job.size(), m_hash, m_ctx, job.height());
m_job.nextRound(kReserveCount, 1);
} }
for (size_t i = 0; i < N; ++i) { for (size_t i = 0; i < N; ++i) {
if (*reinterpret_cast<uint64_t*>(m_hash + (i * 32) + 24) < job.target()) { if (*reinterpret_cast<uint64_t*>(m_hash + (i * 32) + 24) < job.target()) {
JobResults::submit(job, *m_job.nonce(i), m_hash + (i * 32)); JobResults::submit(job, current_job_nonces[i], m_hash + (i * 32));
} }
} }
m_job.nextRound(kReserveCount, 1);
m_count += N; m_count += N;
std::this_thread::yield();
} }
consumeJob(); consumeJob();

View file

@ -212,3 +212,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
// initial state
rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0);
rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1);
rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);
constexpr int PREFETCH_DISTANCE = 4096;
const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
scratchpadEnd -= PREFETCH_DISTANCE;
for (int i = 0; i < 2; ++i) {
//process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) {
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0));
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1));
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2));
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3));
fill_state0 = aesdec<softAes>(fill_state0, key0);
fill_state1 = aesenc<softAes>(fill_state1, key1);
fill_state2 = aesdec<softAes>(fill_state2, key2);
fill_state3 = aesenc<softAes>(fill_state3, key3);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3);
rx_prefetch_t0(prefetchPtr);
scratchpadPtr += 64;
prefetchPtr += 64;
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;
}
rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3);
//two extra rounds to achieve full diffusion
rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
hash_state0 = aesenc<softAes>(hash_state0, xkey0);
hash_state1 = aesdec<softAes>(hash_state1, xkey0);
hash_state2 = aesenc<softAes>(hash_state2, xkey0);
hash_state3 = aesdec<softAes>(hash_state3, xkey0);
hash_state0 = aesenc<softAes>(hash_state0, xkey1);
hash_state1 = aesdec<softAes>(hash_state1, xkey1);
hash_state2 = aesenc<softAes>(hash_state2, xkey1);
hash_state3 = aesdec<softAes>(hash_state3, xkey1);
//output hash
rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0);
rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1);
rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
}
template void hashAndFillAes1Rx4<false>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
template void hashAndFillAes1Rx4<true>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

View file

@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer);
template<bool softAes> template<bool softAes>
void fillAes4Rx4(void *state, size_t outputSize, void *buffer); void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);

View file

@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128;
#define rx_aligned_alloc(a, b) _mm_malloc(a,b) #define rx_aligned_alloc(a, b) _mm_malloc(a,b)
#define rx_aligned_free(a) _mm_free(a) #define rx_aligned_free(a) _mm_free(a)
#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0)
#define rx_load_vec_f128 _mm_load_pd #define rx_load_vec_f128 _mm_load_pd
#define rx_store_vec_f128 _mm_store_pd #define rx_store_vec_f128 _mm_store_pd
@ -201,6 +202,7 @@ typedef union{
#define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a) #define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x) #define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)
/* Splat 64-bit long long to 2 64-bit long longs */ /* Splat 64-bit long long to 2 64-bit long longs */
FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
} }
inline void rx_prefetch_t0(const void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
return vld1q_f64((const float64_t*)pd); return vld1q_f64((const float64_t*)pd);
} }
@ -532,6 +538,7 @@ typedef union {
#define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a) #define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x) #define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
rx_vec_f128 x; rx_vec_f128 x;

View file

@ -29,6 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdexcept> #include <stdexcept>
#include <cstring> #include <cstring>
#include <climits> #include <climits>
#include <atomic>
#include "crypto/randomx/jit_compiler_x86.hpp" #include "crypto/randomx/jit_compiler_x86.hpp"
#include "crypto/randomx/jit_compiler_x86_static.hpp" #include "crypto/randomx/jit_compiler_x86_static.hpp"
#include "crypto/randomx/superscalar.hpp" #include "crypto/randomx/superscalar.hpp"
@ -36,6 +37,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/reciprocal.h" #include "crypto/randomx/reciprocal.h"
#include "crypto/randomx/virtual_memory.hpp" #include "crypto/randomx/virtual_memory.hpp"
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif
namespace randomx { namespace randomx {
/* /*
@ -108,7 +115,7 @@ namespace randomx {
const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch;
const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; const int32_t codeSshInitSize = codeProgramEnd - codeShhInit;
const int32_t epilogueOffset = CodeSize - epilogueSize; const int32_t epilogueOffset = (CodeSize - epilogueSize) & ~63;
constexpr int32_t superScalarHashOffset = 32768; constexpr int32_t superScalarHashOffset = 32768;
static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
@ -183,6 +190,7 @@ namespace randomx {
static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
static const uint8_t JZ[] = { 0x0f, 0x84 }; static const uint8_t JZ[] = { 0x0f, 0x84 };
static const uint8_t JZ_SHORT = 0x74;
static const uint8_t RET = 0xc3; static const uint8_t RET = 0xc3;
static const uint8_t LEA_32[] = { 0x41, 0x8d }; static const uint8_t LEA_32[] = { 0x41, 0x8d };
static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 }; static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
@ -197,20 +205,100 @@ namespace randomx {
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
// static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
static const uint8_t JMP_ALIGN_PREFIX[14][16] = {
{},
{0x2E},
{0x2E, 0x2E},
{0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};
bool JitCompilerX86::BranchesWithin32B = false;
size_t JitCompilerX86::getCodeSize() { size_t JitCompilerX86::getCodeSize() {
return codePos < prologueSize ? 0 : codePos - prologueSize; return codePos < prologueSize ? 0 : codePos - prologueSize;
} }
static inline void cpuid(uint32_t level, int32_t output[4])
{
memset(output, 0, sizeof(int32_t) * 4);
# ifdef _MSC_VER
__cpuid(output, static_cast<int>(level));
# else
__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
# endif
}
// CPU-specific tweaks
void JitCompilerX86::applyTweaks() {
int32_t info[4];
cpuid(0, info);
int32_t manufacturer[4];
manufacturer[0] = info[1];
manufacturer[1] = info[3];
manufacturer[2] = info[2];
manufacturer[3] = 0;
if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
struct
{
unsigned int stepping : 4;
unsigned int model : 4;
unsigned int family : 4;
unsigned int processor_type : 2;
unsigned int reserved1 : 2;
unsigned int ext_model : 4;
unsigned int ext_family : 8;
unsigned int reserved2 : 4;
} processor_info;
cpuid(1, info);
memcpy(&processor_info, info, sizeof(processor_info));
// Intel JCC erratum mitigation
if (processor_info.family == 6) {
const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
const uint32_t stepping = processor_info.stepping;
// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
BranchesWithin32B =
((model == 0x4E) && (stepping == 0x3)) ||
((model == 0x55) && (stepping == 0x4)) ||
((model == 0x5E) && (stepping == 0x3)) ||
((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
((model == 0xA6) && (stepping == 0x0)) ||
((model == 0xAE) && (stepping == 0xA));
}
}
}
static std::atomic<size_t> codeOffset;
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
code = (uint8_t*)allocExecutableMemory(CodeSize); applyTweaks();
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
memcpy(code, codePrologue, prologueSize); memcpy(code, codePrologue, prologueSize);
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize); memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
} }
JitCompilerX86::~JitCompilerX86() { JitCompilerX86::~JitCompilerX86() {
freePagedMemory(code, CodeSize); freePagedMemory(allocatedCode, CodeSize);
} }
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
@ -307,6 +395,22 @@ namespace randomx {
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos); emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);
memcpy(code + codePos, codeLoopStore, loopStoreSize); memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize; codePos += loopStoreSize;
if (BranchesWithin32B) {
const uint32_t branch_begin = static_cast<uint32_t>(codePos);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);
// If the jump crosses or touches 32-byte boundary, align it
if ((branch_begin ^ branch_end) >= 32) {
uint32_t alignment_size = 32 - (branch_begin & 31);
if (alignment_size > 8) {
emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos);
alignment_size = 8;
}
emit(NOPX[alignment_size - 1], alignment_size, code, codePos);
}
}
emit(SUB_EBX, code, codePos); emit(SUB_EBX, code, codePos);
emit(JNZ, code, codePos); emit(JNZ, code, codePos);
emit32(prologueSize - codePos - 4, code, codePos); emit32(prologueSize - codePos - 4, code, codePos);
@ -408,12 +512,13 @@ namespace randomx {
} }
} }
void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos, bool rax) { template<bool rax>
emit(LEA_32, code, codePos); FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos) {
emitByte(0x80 + instr.src + (rax ? 0 : 8), code, codePos); const uint32_t src = *((uint32_t*)&instr) & 0xFF0000;
if (instr.src == RegisterNeedsSib) {
emitByte(0x24, code, codePos); *(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + src;
} codePos += (src == (RegisterNeedsSib << 16)) ? 4 : 3;
emit32(instr.getImm32(), code, codePos); emit32(instr.getImm32(), code, codePos);
if (rax) if (rax)
emitByte(AND_EAX_I, code, codePos); emitByte(AND_EAX_I, code, codePos);
@ -422,12 +527,14 @@ namespace randomx {
emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos); emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
} }
void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) { template void JitCompilerX86::genAddressReg<false>(const Instruction& instr, uint8_t* code, int& codePos);
emit(LEA_32, code, codePos); template void JitCompilerX86::genAddressReg<true>(const Instruction& instr, uint8_t* code, int& codePos);
emitByte(0x80 + instr.dst, code, codePos);
if (instr.dst == RegisterNeedsSib) { FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) {
emitByte(0x24, code, codePos); const uint32_t dst = static_cast<uint32_t>(instr.dst) << 16;
} *(uint32_t*)(code + codePos) = 0x24808d41 + dst;
codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3;
emit32(instr.getImm32(), code, codePos); emit32(instr.getImm32(), code, codePos);
emitByte(AND_EAX_I, code, codePos); emitByte(AND_EAX_I, code, codePos);
if (instr.getModCond() < StoreL3Condition) { if (instr.getModCond() < StoreL3Condition) {
@ -438,7 +545,7 @@ namespace randomx {
} }
} }
void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) { FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) {
emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos); emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
} }
@ -483,7 +590,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit32(template_IADD_M[instr.dst], p, pos); emit32(template_IADD_M[instr.dst], p, pos);
} }
else { else {
@ -523,7 +630,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_SUB_RM, p, pos); emit(REX_SUB_RM, p, pos);
emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06, p, pos); emitByte(0x06, p, pos);
@ -561,7 +668,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_IMUL_RM, p, pos); emit(REX_IMUL_RM, p, pos);
emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06, p, pos); emitByte(0x06, p, pos);
@ -596,7 +703,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos, false); genAddressReg<false>(instr, p, pos);
emit(REX_MOV_RR64, p, pos); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst, p, pos); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_MUL_MEM, p, pos); emit(REX_MUL_MEM, p, pos);
@ -635,7 +742,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos, false); genAddressReg<false>(instr, p, pos);
emit(REX_MOV_RR64, p, pos); emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst, p, pos); emitByte(0xc0 + instr.dst, p, pos);
emit(REX_IMUL_MEM, p, pos); emit(REX_IMUL_MEM, p, pos);
@ -704,7 +811,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
if (instr.src != instr.dst) { if (instr.src != instr.dst) {
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_XOR_RM, p, pos); emit(REX_XOR_RM, p, pos);
emitByte(0x04 + 8 * instr.dst, p, pos); emitByte(0x04 + 8 * instr.dst, p, pos);
emitByte(0x06, p, pos); emitByte(0x06, p, pos);
@ -801,7 +908,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
const uint32_t dst = instr.dst % RegisterCountFlt; const uint32_t dst = instr.dst % RegisterCountFlt;
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ADDPD, p, pos); emit(REX_ADDPD, p, pos);
emitByte(0xc4 + 8 * dst, p, pos); emitByte(0xc4 + 8 * dst, p, pos);
@ -826,7 +933,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
const uint32_t dst = instr.dst % RegisterCountFlt; const uint32_t dst = instr.dst % RegisterCountFlt;
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_SUBPD, p, pos); emit(REX_SUBPD, p, pos);
emitByte(0xc4 + 8 * dst, p, pos); emitByte(0xc4 + 8 * dst, p, pos);
@ -862,7 +969,7 @@ namespace randomx {
int pos = codePos; int pos = codePos;
const uint32_t dst = instr.dst % RegisterCountFlt; const uint32_t dst = instr.dst % RegisterCountFlt;
genAddressReg(instr, p, pos); genAddressReg<true>(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12, p, pos); emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ANDPS_XMM12, p, pos); emit(REX_ANDPS_XMM12, p, pos);
emit(REX_DIVPD, p, pos); emit(REX_DIVPD, p, pos);
@ -902,19 +1009,39 @@ namespace randomx {
uint8_t* const p = code; uint8_t* const p = code;
int pos = codePos; int pos = codePos;
int reg = instr.dst; const int reg = instr.dst;
int32_t jmp_offset = registerUsage[reg] - (pos + 16);
if (BranchesWithin32B) {
const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));
// If the jump crosses or touches 32-byte boundary, align it
if ((branch_begin ^ branch_end) >= 32) {
const uint32_t alignment_size = 32 - (branch_begin & 31);
jmp_offset -= alignment_size;
emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos);
}
}
emit(REX_ADD_I, p, pos); emit(REX_ADD_I, p, pos);
emitByte(0xc0 + reg, p, pos); emitByte(0xc0 + reg, p, pos);
int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; const int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset;
uint32_t imm = instr.getImm32() | (1UL << shift); const uint32_t imm = (instr.getImm32() | (1UL << shift)) & ~(1UL << (shift - 1));
if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0)
imm &= ~(1UL << (shift - 1));
emit32(imm, p, pos); emit32(imm, p, pos);
emit(REX_TEST, p, pos); emit(REX_TEST, p, pos);
emitByte(0xc0 + reg, p, pos); emitByte(0xc0 + reg, p, pos);
emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos); emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos);
if (jmp_offset >= -128) {
emitByte(JZ_SHORT, p, pos);
emitByte(jmp_offset, p, pos);
}
else {
emit(JZ, p, pos); emit(JZ, p, pos);
emit32(registerUsage[reg] - (pos + 4), p, pos); emit32(jmp_offset - 4, p, pos);
}
//mark all registers as used //mark all registers as used
uint64_t* r = (uint64_t*) registerUsage; uint64_t* r = (uint64_t*) registerUsage;
uint64_t k = pos; uint64_t k = pos;

View file

@ -67,12 +67,17 @@ namespace randomx {
static InstructionGeneratorX86 engine[256]; static InstructionGeneratorX86 engine[256];
int registerUsage[RegistersCount]; int registerUsage[RegistersCount];
uint8_t* allocatedCode;
uint8_t* code; uint8_t* code;
int32_t codePos; int32_t codePos;
static bool BranchesWithin32B;
static void applyTweaks();
void generateProgramPrologue(Program&, ProgramConfiguration&); void generateProgramPrologue(Program&, ProgramConfiguration&);
void generateProgramEpilogue(Program&, ProgramConfiguration&); void generateProgramEpilogue(Program&, ProgramConfiguration&);
static void genAddressReg(const Instruction&, uint8_t* code, int& codePos, bool rax = true); template<bool rax>
static void genAddressReg(const Instruction&, uint8_t* code, int& codePos);
static void genAddressRegDst(const Instruction&, uint8_t* code, int& codePos); static void genAddressRegDst(const Instruction&, uint8_t* code, int& codePos);
static void genAddressImm(const Instruction&, uint8_t* code, int& codePos); static void genAddressImm(const Instruction&, uint8_t* code, int& codePos);
static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos); static void genSIB(int scale, int index, int base, uint8_t* code, int& codePos);

View file

@ -473,4 +473,22 @@ extern "C" {
machine->getFinalResult(output, RANDOMX_HASH_SIZE); machine->getFinalResult(output, RANDOMX_HASH_SIZE);
} }
void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
machine->initScratchpad(tempHash);
}
void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
machine->resetRoundingMode();
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
machine->run(&tempHash);
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
}
machine->run(&tempHash);
// Finish current hash and fill the scratchpad for the next hash at the same time
rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0);
machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
}
} }

View file

@ -338,6 +338,9 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
*/ */
RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);
RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize);
RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output);
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif #endif

View file

@ -114,6 +114,12 @@ namespace randomx {
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0); rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
} }
template<bool softAes>
void VmBase<softAes>::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) {
hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a, fill_state);
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
}
template<bool softAes> template<bool softAes>
void VmBase<softAes>::initScratchpad(void* seed) { void VmBase<softAes>::initScratchpad(void* seed) {
fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad); fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad);

View file

@ -39,6 +39,7 @@ public:
virtual ~randomx_vm() = 0; virtual ~randomx_vm() = 0;
virtual void setScratchpad(uint8_t *scratchpad) = 0; virtual void setScratchpad(uint8_t *scratchpad) = 0;
virtual void getFinalResult(void* out, size_t outSize) = 0; virtual void getFinalResult(void* out, size_t outSize) = 0;
virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0;
virtual void setDataset(randomx_dataset* dataset) { } virtual void setDataset(randomx_dataset* dataset) { }
virtual void setCache(randomx_cache* cache) { } virtual void setCache(randomx_cache* cache) { }
virtual void initScratchpad(void* seed) = 0; virtual void initScratchpad(void* seed) = 0;
@ -82,6 +83,7 @@ namespace randomx {
void setScratchpad(uint8_t *scratchpad) override; void setScratchpad(uint8_t *scratchpad) override;
void initScratchpad(void* seed) override; void initScratchpad(void* seed) override;
void getFinalResult(void* out, size_t outSize) override; void getFinalResult(void* out, size_t outSize) override;
void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override;
protected: protected:
void generateProgram(void* seed); void generateProgram(void* seed);