Dataset initialization with AVX2 (WIP)

This commit is contained in:
SChernykh 2020-12-18 14:53:54 +01:00
parent 6b21a51a2f
commit 515a85e66c
17 changed files with 721 additions and 90 deletions

View file

@ -214,13 +214,6 @@ void xmrig::Workers<T>::start(const std::vector<T> &data, bool sleep)
for (auto worker : m_workers) {
worker->start(Workers<T>::onReady);
// This sleep is important for optimal caching!
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
// Sub-optimal caching can result in up to 0.5% hashrate penalty
if (sleep) {
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
}
}

View file

@ -53,6 +53,7 @@ public:
enum Flag : uint32_t {
FLAG_AES,
FLAG_AVX,
FLAG_AVX2,
FLAG_AVX512F,
FLAG_BMI2,
@ -80,9 +81,11 @@ public:
virtual Assembly::Id assembly() const = 0;
virtual bool has(Flag feature) const = 0;
virtual bool hasAES() const = 0;
virtual bool hasAVX() const = 0;
virtual bool hasAVX2() const = 0;
virtual bool hasBMI2() const = 0;
virtual bool hasOneGbPages() const = 0;
virtual bool hasXOP() const = 0;
virtual bool hasCatL3() const = 0;
virtual bool isVM() const = 0;
virtual const char *backend() const = 0;

View file

@ -52,8 +52,8 @@
namespace xmrig {
constexpr size_t kCpuFlagsSize = 13;
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
constexpr size_t kCpuFlagsSize = 14;
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
@ -134,11 +134,12 @@ static inline uint64_t xgetbv()
#endif
}
// CPU feature probes. Each helper tests a single bit of a CPUID register through has_feature().
// The AVX-family probes (has_avx, has_avx2, has_avx512f) additionally require OSXSAVE
// (ECX bit 27 of the PROCESSOR_INFO leaf) and a mask check on the value returned by xgetbv():
// mask 0x06 tests bits 1 and 2, mask 0xE6 additionally tests bits 5, 6 and 7.
// NOTE(review): presumably xgetbv() reads XCR0, where bits 1-2 are the SSE/AVX state and
// bits 5-7 are the AVX-512 opmask/ZMM state - confirm against the xgetbv() helper and the Intel SDM.
static inline bool has_xcr_avx2() { return (xgetbv() & 0x06) == 0x06; }
static inline bool has_xcr_avx() { return (xgetbv() & 0x06) == 0x06; }
static inline bool has_xcr_avx512() { return (xgetbv() & 0xE6) == 0xE6; }
static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 27); }
static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); }
static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx2(); }
static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); }
static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); }
static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); }
static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); }
static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); }
@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
cpu_brand_string(m_brand);
m_flags.set(FLAG_AES, has_aes_ni());
m_flags.set(FLAG_AVX, has_avx());
m_flags.set(FLAG_AVX2, has_avx2());
m_flags.set(FLAG_AVX512F, has_avx512f());
m_flags.set(FLAG_BMI2, has_bmi2());

View file

@ -48,9 +48,11 @@ protected:
inline Assembly::Id assembly() const override { return m_assembly; }
inline bool has(Flag flag) const override { return m_flags.test(flag); }
inline bool hasAES() const override { return has(FLAG_AES); }
inline bool hasAVX() const override { return has(FLAG_AVX); }
inline bool hasAVX2() const override { return has(FLAG_AVX2); }
inline bool hasBMI2() const override { return has(FLAG_BMI2); }
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
inline bool hasXOP() const override { return has(FLAG_XOP); }
inline bool hasCatL3() const override { return has(FLAG_CAT_L3); }
inline bool isVM() const override { return has(FLAG_VM); }
inline const char *brand() const override { return m_brand; }

View file

@ -0,0 +1,28 @@
;# Constant pool for the AVX2 dataset-init code. Every entry is written out as little-endian bytes.
;# Four 64-bit values (2, 3, 4, 5) that the prologue adds to the broadcast block index, one per AVX lane
r0_avx2_increments:
db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
;# 64-bit value with only bit 32 set; the prologue broadcasts it into ymm15 as the carry bit
mul_hi_avx2_data:
db 0,0,0,0,1,0,0,0
;# 64-bit multiplier applied to (block index + 1) to produce the initial r0
r0_avx2_mul:
;#/ 6364136223846793005
db 45, 127, 149, 76, 45, 244, 81, 88
;# The remaining entries are 64-bit values that the prologue XORs with r0 to produce the initial r1-r7
r1_avx2_add:
;#/ 9298411001130361340
db 252, 161, 245, 89, 138, 151, 10, 129
r2_avx2_add:
;#/ 12065312585734608966
db 70, 216, 194, 56, 223, 153, 112, 167
r3_avx2_add:
;#/ 9306329213124626780
db 92, 73, 34, 191, 28, 185, 38, 129
r4_avx2_add:
;#/ 5281919268842080866
db 98, 138, 159, 23, 151, 37, 77, 73
r5_avx2_add:
;#/ 10536153434571861004
db 12, 236, 170, 206, 185, 239, 55, 146
r6_avx2_add:
;#/ 3398623926847679864
db 120, 45, 230, 108, 116, 86, 42, 47
r7_avx2_add:
;#/ 9549104520008361294
db 78, 229, 44, 182, 247, 59, 133, 132

View file

@ -0,0 +1,31 @@
;# Epilogue of the AVX2 dataset-init routine: unwinds everything the prologue set up and returns.
;# Release the 32-byte scratch area, then discard the pushed max. block index
add rsp, 32
pop r9
;# Restore xmm0-xmm15 from the 256-byte save area (16 bytes per register)
movdqu xmm0, xmmword ptr [rsp]
movdqu xmm1, xmmword ptr [rsp + 16]
movdqu xmm2, xmmword ptr [rsp + 32]
movdqu xmm3, xmmword ptr [rsp + 48]
movdqu xmm4, xmmword ptr [rsp + 64]
movdqu xmm5, xmmword ptr [rsp + 80]
movdqu xmm6, xmmword ptr [rsp + 96]
movdqu xmm7, xmmword ptr [rsp + 112]
movdqu xmm8, xmmword ptr [rsp + 128]
movdqu xmm9, xmmword ptr [rsp + 144]
movdqu xmm10, xmmword ptr [rsp + 160]
movdqu xmm11, xmmword ptr [rsp + 176]
movdqu xmm12, xmmword ptr [rsp + 192]
movdqu xmm13, xmmword ptr [rsp + 208]
movdqu xmm14, xmmword ptr [rsp + 224]
movdqu xmm15, xmmword ptr [rsp + 240]
;# Zero the upper halves of all ymm registers before returning to the caller
;# NOTE(review): vzeroupper is issued after the legacy-SSE movdqu loads above; check whether
;# issuing it first would avoid SSE/AVX transition penalties
vzeroupper
;# Drop the XMM save area, then pop the general-purpose registers in reverse push order
add rsp, 256
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
ret

View file

@ -0,0 +1,37 @@
;# Start of one loop iteration. Registers as set up by the prologue:
;# rsi = output pointer, rbp = index of the first item of this iteration, rdi = cache memory base.
;# One iteration writes 5 items of 64 bytes each (320 bytes starting at rsi).
;# prefetch RandomX dataset lines
prefetchnta byte ptr [rsi]
prefetchnta byte ptr [rsi+64]
prefetchnta byte ptr [rsi+128]
prefetchnta byte ptr [rsi+192]
prefetchnta byte ptr [rsi+256]
;# prefetch RandomX cache lines
;# Each address is rdi + ((index AND cache mask) * 64).
;# The address for item rbp stays in rbx and is not written to the stack.
mov rbx, rbp
and rbx, RANDOMX_CACHE_MASK
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
;# Items rbp+1 .. rbp+4: besides prefetching, each address is stored in the
;# 32-byte scratch area at rsp (offsets 0, 8, 16, 24)
lea rax, [rbp+1]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp], rax
lea rax, [rbp+2]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+8], rax
lea rax, [rbp+3]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+16], rax
lea rax, [rbp+4]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+24], rax

View file

@ -0,0 +1,38 @@
;# End of one loop iteration: stores the 5 finished 64-byte items and loops while rbp is below the limit.
;# The scalar registers r8-r15 form the first item and go to rsi+0 .. rsi+56.
;# Interleaved with those stores, ymm0-ymm7 (one RandomX register per ymm, four lanes each)
;# are transposed: the unpack steps pair neighbouring registers within each 128-bit half,
;# and the vperm2i128 steps further down gather the halves into per-item order.
mov qword ptr [rsi+0], r8
vpunpcklqdq ymm8, ymm0, ymm1
mov qword ptr [rsi+8], r9
vpunpcklqdq ymm9, ymm2, ymm3
mov qword ptr [rsi+16], r10
vpunpcklqdq ymm10, ymm4, ymm5
mov qword ptr [rsi+24], r11
vpunpcklqdq ymm11, ymm6, ymm7
mov qword ptr [rsi+32], r12
vpunpckhqdq ymm12, ymm0, ymm1
mov qword ptr [rsi+40], r13
vpunpckhqdq ymm13, ymm2, ymm3
mov qword ptr [rsi+48], r14
vpunpckhqdq ymm14, ymm4, ymm5
mov qword ptr [rsi+56], r15
vpunpckhqdq ymm15, ymm6, ymm7
;# lane 1 (element 0 of each ymm): r0-r3 to rsi+64, r4-r7 to rsi+96
vperm2i128 ymm0, ymm8, ymm9, 32
vperm2i128 ymm1, ymm10, ymm11, 32
vmovdqu ymmword ptr [rsi+64], ymm0
vmovdqu ymmword ptr [rsi+96], ymm1
;# lane 2 (element 1): r0-r3 to rsi+128, r4-r7 to rsi+160
vperm2i128 ymm2, ymm12, ymm13, 32
vperm2i128 ymm3, ymm14, ymm15, 32
vmovdqu ymmword ptr [rsi+128], ymm2
vmovdqu ymmword ptr [rsi+160], ymm3
;# lane 3 (element 2): r0-r3 to rsi+192, r4-r7 to rsi+224
vperm2i128 ymm4, ymm8, ymm9, 49
vperm2i128 ymm5, ymm10, ymm11, 49
vmovdqu ymmword ptr [rsi+192], ymm4
vmovdqu ymmword ptr [rsi+224], ymm5
;# lane 4 (element 3): r0-r3 to rsi+256, r4-r7 to rsi+288
vperm2i128 ymm6, ymm12, ymm13, 49
vperm2i128 ymm7, ymm14, ymm15, 49
vmovdqu ymmword ptr [rsi+256], ymm6
vmovdqu ymmword ptr [rsi+288], ymm7
;# Advance by 5 items (5 * 64 = 320 bytes) and compare with the max. block index,
;# which the prologue pushed just above the 32-byte scratch area
add rbp, 5
add rsi, 320
cmp rbp, qword ptr [rsp+32]
;# The 32-bit displacement is emitted as zero here; the JIT compiler overwrites it after copying this fragment
db 15, 130, 0, 0, 0, 0 ;# jb rel32

View file

@ -0,0 +1,27 @@
;# Entry sequence of the AVX2 dataset-init routine: saves registers so the epilogue can restore them.
;# General-purpose registers; the epilogue pops them in the reverse order
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
;# save all XMM registers just to be safe for all calling conventions
;# 16 registers * 16 bytes = 256 bytes; only the low 128 bits of each register are saved
sub rsp, 256
movdqu xmmword ptr [rsp], xmm0
movdqu xmmword ptr [rsp + 16], xmm1
movdqu xmmword ptr [rsp + 32], xmm2
movdqu xmmword ptr [rsp + 48], xmm3
movdqu xmmword ptr [rsp + 64], xmm4
movdqu xmmword ptr [rsp + 80], xmm5
movdqu xmmword ptr [rsp + 96], xmm6
movdqu xmmword ptr [rsp + 112], xmm7
movdqu xmmword ptr [rsp + 128], xmm8
movdqu xmmword ptr [rsp + 144], xmm9
movdqu xmmword ptr [rsp + 160], xmm10
movdqu xmmword ptr [rsp + 176], xmm11
movdqu xmmword ptr [rsp + 192], xmm12
movdqu xmmword ptr [rsp + 208], xmm13
movdqu xmmword ptr [rsp + 224], xmm14
movdqu xmmword ptr [rsp + 240], xmm15

View file

@ -0,0 +1,50 @@
;# Loads one 64-byte cache line per AVX lane and XORs it into the lane registers ymm0-ymm7.
;# The four line addresses are read from the scratch area that was at rsp+0 .. rsp+24 on entry;
;# after the 40-byte adjustment below they are at rsp+40, rsp+48, rsp+56 and rsp+64.
;# rbx and ymm14 are saved in the new 40-byte area (8 + 32 bytes) and restored at the end.
;# rax, rcx, rdx and ymm8-ymm13 are overwritten and not restored.
;# In the trailing comments, rN[k] means register rN of lane k (k = 1..4).
sub rsp, 40
mov [rsp], rbx
vmovdqu ymmword ptr [rsp+8], ymm14
mov rax, [rsp+40]
mov rbx, [rsp+48]
mov rcx, [rsp+56]
mov rdx, [rsp+64]
;# First 32 bytes of each cache line: r0-r3 of every lane
vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1]
vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2]
vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3]
vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4]
vpxor ymm0, ymm0, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4]
vpxor ymm2, ymm2, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4]
vpxor ymm1, ymm1, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4]
vpxor ymm3, ymm3, ymm14
;# Second 32 bytes of each cache line: r4-r7 of every lane
vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1]
vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2]
vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3]
vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4]
vpxor ymm4, ymm4, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4]
vpxor ymm6, ymm6, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4]
vpxor ymm5, ymm5, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4]
vpxor ymm7, ymm7, ymm14
;# Restore the saved registers and release the 40-byte area
mov rbx, [rsp]
vmovdqu ymm14, ymmword ptr [rsp+8]
add rsp, 40

View file

@ -0,0 +1,29 @@
;# Turns four 64-bit lane values into cache line addresses, stores them in the
;# 32-byte scratch area at rsp and prefetches each line.
;# Spill the four lanes to rsp+0 .. rsp+24. The source register is encoded as ymm0 here;
;# the JIT compiler adjusts byte 3 of this instruction to select the address register.
vmovdqu ymmword ptr [rsp], ymm0
;# For every lane: address = rdi + ((value AND cache mask) * 64), written back over the spilled value
mov rax, [rsp]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp], rax
prefetchnta byte ptr [rax]
mov rax, [rsp+8]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+8], rax
prefetchnta byte ptr [rax]
mov rax, [rsp+16]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+16], rax
prefetchnta byte ptr [rax]
mov rax, [rsp+24]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+24], rax
prefetchnta byte ptr [rax]

View file

@ -49,8 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif
static bool hugePagesJIT = false;
@ -116,6 +114,11 @@ namespace randomx {
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
@ -132,7 +135,12 @@ namespace randomx {
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeEpilogue - codeDatasetInit)
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
#define epilogueSize (codeShhLoad - codeEpilogue)
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
@ -192,17 +200,6 @@ namespace randomx {
xmrig::VirtualMemory::protectRX(p1, p2 - p1);
}
static inline void cpuid(uint32_t level, int32_t output[4])
{
memset(output, 0, sizeof(int32_t) * 4);
# ifdef _MSC_VER
__cpuid(output, static_cast<int>(level));
# else
__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
# endif
}
# ifdef _MSC_VER
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
# else
@ -215,14 +212,11 @@ namespace randomx {
JitCompilerX86::JitCompilerX86(bool hugePagesEnable) {
BranchesWithin32B = xmrig::Cpu::info()->jccErratum();
int32_t info[4];
cpuid(1, info);
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
hasAVX = xmrig::Cpu::info()->hasAVX();
hasAVX2 = xmrig::Cpu::info()->hasAVX2();
hasXOP = xmrig::Cpu::info()->hasXOP();
cpuid(0x80000001, info);
hasXOP = ((info[2] & (1 << 11)) != 0);
allocatedSize = CodeSize * 2;
allocatedSize = hasAVX2 ? (CodeSize * 4) : (CodeSize * 2);
allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
# ifdef XMRIG_SECURE_JIT
false
@ -304,14 +298,49 @@ namespace randomx {
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
uint8_t* p = code;
if (hasAVX2) {
codePos = 0;
emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<true>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
codePos += 3;
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
uint8_t* p = code + codePos;
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
p[3] += prog.getAddressRegister() << 3;
}
}
emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
constexpr int32_t prologue_size = 320;
*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
return;
}
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
codePos = superScalarHashOffset + codeSshInitSize;
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
generateSuperscalarCode(instr);
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<false>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
@ -326,7 +355,10 @@ namespace randomx {
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);
void JitCompilerX86::generateDatasetInitCode() {
memcpy(code, codeDatasetInit, datasetInitSize);
// AVX2 code is generated in generateSuperscalarHash()
if (!hasAVX2) {
memcpy(code, codeDatasetInit, datasetInitSize);
}
}
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
@ -405,85 +437,243 @@ namespace randomx {
emit32(epilogueOffset - codePos - 4, code, codePos);
}
void JitCompilerX86::generateSuperscalarCode(Instruction& instr) {
static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
static constexpr uint8_t REX_81[] = { 0x49, 0x81 };
static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d };
static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
template<bool AVX2>
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) {
switch ((SuperscalarInstructionType)instr.opcode)
{
case randomx::SuperscalarInstructionType::ISUB_R:
emit(REX_SUB_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_R:
emit(REX_XOR_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IADD_RS:
emit(REX_LEA, code, codePos);
emitByte(0x04 + 8 * instr.dst, code, codePos);
genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
if (AVX2) {
if (instr.getModShift()) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.src;
p[4] = instr.getModShift();
p[8] += instr.dst * 9;
}
else {
emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
}
break;
case randomx::SuperscalarInstructionType::IMUL_R:
emit(REX_IMUL_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x35, 0xF4, 0xD8,
0xC5, 0xBD, 0xF4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.dst;
p[21] += instr.dst * 8 + instr.src;
p[29] -= instr.dst * 8;
p[31] += instr.dst;
p[41] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::IROR_C:
emit(REX_ROT_I8, code, codePos);
emitByte(0xc8 + instr.dst, code, codePos);
emitByte(instr.getImm32() & 63, code, codePos);
{
const uint32_t shift = instr.getImm32() & 63;
emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[4] = shift;
p[8] += instr.dst;
p[9] = 64 - shift;
p[14] += instr.dst * 8;
}
}
break;
case randomx::SuperscalarInstructionType::IADD_C7:
case randomx::SuperscalarInstructionType::IADD_C8:
case randomx::SuperscalarInstructionType::IADD_C9:
emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_C7:
case randomx::SuperscalarInstructionType::IXOR_C8:
case randomx::SuperscalarInstructionType::IXOR_C9:
emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe0 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC5, 0xBD, 0xD4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC5, 0xA5, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[27] += instr.dst * 8;
p[67] += instr.dst * 9;
p[77] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::ISMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe8 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0x41, 0x3D, 0xF4, 0xE9,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC4, 0x41, 0x15, 0xD4, 0xE8,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC4, 0x41, 0x15, 0xD4, 0xC3,
0xC4, 0x41, 0x35, 0xEF, 0xC9,
0xC4, 0x62, 0x35, 0x37, 0xD0,
0xC4, 0x62, 0x35, 0x37, 0xD8,
0xC5, 0x2D, 0xDB, 0xD0,
0xC5, 0x25, 0xDB, 0xD8,
0xC4, 0x41, 0x3D, 0xFB, 0xC2,
0xC4, 0xC1, 0x3D, 0xFB, 0xC3
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[89] += instr.dst;
p[94] += instr.src;
p[98] += instr.src;
p[102] += instr.dst;
p[112] += instr.dst * 8;
}
break;
case randomx::SuperscalarInstructionType::IMUL_RCP:
emit(MOV_RAX_I, code, codePos);
*(uint32_t*)(code + codePos) = 0x0000B848UL;
codePos += 2;
emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
emit(REX_IMUL_RM, code, codePos);
emitByte(0xc0 + 8 * instr.dst, code, codePos);
emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
0xC4, 0x41, 0x7D, 0xF4, 0xD4,
0xC5, 0x35, 0xF4, 0xD8,
0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[12] += instr.dst;
p[22] -= instr.dst * 8;
p[28] += instr.dst;
p[33] += instr.dst * 8;
p[41] -= instr.dst * 8;
p[43] += instr.dst;
p[53] += instr.dst * 9;
}
break;
default:
UNREACHABLE;
}
}
template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);
template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);
@ -563,10 +753,6 @@ namespace randomx {
codePos = pos;
}
void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) {
emitByte((scale << 6) | (index << 3) | base, code, codePos);
}
void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
uint8_t* const p = code;
uint32_t pos = codePos;

View file

@ -96,6 +96,7 @@ namespace randomx {
bool BranchesWithin32B = false;
bool hasAVX;
bool hasAVX2;
bool hasXOP;
uint8_t* allocatedCode = nullptr;
@ -107,9 +108,10 @@ namespace randomx {
static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos);
static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos);
static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos);
static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos);
// Combines the three SIB fields into one value: scale shifted left by 6, index shifted left by 3, base in the low bits.
static uint32_t genSIB(int scale, int index, int base) {
	const int scaleBits = scale << 6;
	const int indexBits = index << 3;
	return scaleBits | indexBits | base;
}
void generateSuperscalarCode(Instruction &);
template<bool AVX2>
void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos);
static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) {
code[codePos] = val;

View file

@ -52,6 +52,11 @@
.global DECL(randomx_program_loop_store)
.global DECL(randomx_program_loop_end)
.global DECL(randomx_dataset_init)
.global DECL(randomx_dataset_init_avx2_prologue)
.global DECL(randomx_dataset_init_avx2_loop_end)
.global DECL(randomx_dataset_init_avx2_epilogue)
.global DECL(randomx_dataset_init_avx2_ssh_load)
.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_sshash_load)
.global DECL(randomx_sshash_prefetch)
@ -192,6 +197,98 @@ call_offset:
pop rbx
ret
.balign 64
;# AVX2 dataset initialization, 5 items per loop iteration: the scalar registers r8-r15
;# handle one item (lane 0) and ymm0-ymm7 handle four more (lanes 1-4).
;# The code between the labels below is a set of templates; the JIT compiler copies the
;# fragments and inserts the generated SuperscalarHash code between them.
DECL(randomx_dataset_init_avx2_prologue):
#include "asm/program_sshash_avx2_save_registers.inc"
#if defined(WINABI)
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
#else
mov rdi, qword ptr [rdi] ;# cache->memory
;# dataset in rsi
mov rbp, rdx ;# block index
push rcx ;# max. block index
#endif
;# 32-byte scratch area holding the four per-lane cache line addresses
sub rsp, 32
;# jump over the constant pool that follows
jmp randomx_dataset_init_avx2_prologue_loop_begin
#include "asm/program_sshash_avx2_constants.inc"
.balign 64
randomx_dataset_init_avx2_prologue_loop_begin:
#include "asm/program_sshash_avx2_loop_begin.inc"
;# init integer registers (lane 0)
;# r8 = (rbp + 1) * r0_avx2_mul; r9-r15 = r8 XOR the matching rN_avx2_add constant
lea r8, [rbp+1]
imul r8, qword ptr [r0_avx2_mul+rip]
mov r9, qword ptr [r1_avx2_add+rip]
xor r9, r8
mov r10, qword ptr [r2_avx2_add+rip]
xor r10, r8
mov r11, qword ptr [r3_avx2_add+rip]
xor r11, r8
mov r12, qword ptr [r4_avx2_add+rip]
xor r12, r8
mov r13, qword ptr [r5_avx2_add+rip]
xor r13, r8
mov r14, qword ptr [r6_avx2_add+rip]
xor r14, r8
mov r15, qword ptr [r7_avx2_add+rip]
xor r15, r8
;# init AVX registers (lanes 1-4)
;# ymm0 = rbp broadcast to all four lanes, plus the increments 2, 3, 4, 5
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
;# ymm0 *= r0_avx2_mul
;# Low 64 bits of the product built from 32-bit multiplies:
;# lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32), with a = ymm0 and b = ymm1
vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
vpsrlq ymm8, ymm0, 32
vpsrlq ymm9, ymm1, 32
vpmuludq ymm10, ymm0, ymm1
vpmuludq ymm11, ymm9, ymm0
vpmuludq ymm0, ymm8, ymm1
vpsllq ymm11, ymm11, 32
vpsllq ymm0, ymm0, 32
vpaddq ymm10, ymm10, ymm11
vpaddq ymm0, ymm10, ymm0
;# ymm1-ymm7 = ymm0 XOR the matching broadcast rN_avx2_add constant
vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
vpxor ymm1, ymm0, ymm1
vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
vpxor ymm2, ymm0, ymm2
vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
vpxor ymm3, ymm0, ymm3
vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
vpxor ymm4, ymm0, ymm4
vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
vpxor ymm5, ymm0, ymm5
vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
vpxor ymm6, ymm0, ymm6
vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
vpxor ymm7, ymm0, ymm7
;# Constants kept in ymm15 and ymm14 for the generated code
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
;# generated SuperscalarHash code goes here
DECL(randomx_dataset_init_avx2_loop_end):
#include "asm/program_sshash_avx2_loop_end.inc"
DECL(randomx_dataset_init_avx2_epilogue):
#include "asm/program_sshash_avx2_epilogue.inc"
DECL(randomx_dataset_init_avx2_ssh_load):
#include "asm/program_sshash_avx2_ssh_load.inc"
DECL(randomx_dataset_init_avx2_ssh_prefetch):
#include "asm/program_sshash_avx2_ssh_prefetch.inc"
.balign 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_store.inc"

View file

@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
PUBLIC randomx_dataset_init_avx2_prologue
PUBLIC randomx_dataset_init_avx2_loop_end
PUBLIC randomx_dataset_init_avx2_epilogue
PUBLIC randomx_dataset_init_avx2_ssh_load
PUBLIC randomx_dataset_init_avx2_ssh_prefetch
PUBLIC randomx_program_loop_store
PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
@ -183,6 +188,95 @@ init_block_loop:
randomx_dataset_init ENDP
ALIGN 64
;# AVX2 dataset initialization, 5 items per loop iteration: the scalar registers r8-r15
;# handle one item (lane 0) and ymm0-ymm7 handle four more (lanes 1-4).
;# The procedures below are templates; the JIT compiler copies the fragments and
;# inserts the generated SuperscalarHash code between them.
randomx_dataset_init_avx2_prologue PROC
include asm/program_sshash_avx2_save_registers.inc
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
;# 32-byte scratch area holding the four per-lane cache line addresses
sub rsp, 32
;# jump over the constant pool that follows
jmp loop_begin
include asm/program_sshash_avx2_constants.inc
ALIGN 64
loop_begin:
include asm/program_sshash_avx2_loop_begin.inc
;# init integer registers (lane 0)
;# r8 = (rbp + 1) * r0_avx2_mul; r9-r15 = r8 XOR the matching rN_avx2_add constant
lea r8, [rbp+1]
imul r8, qword ptr [r0_avx2_mul]
mov r9, qword ptr [r1_avx2_add]
xor r9, r8
mov r10, qword ptr [r2_avx2_add]
xor r10, r8
mov r11, qword ptr [r3_avx2_add]
xor r11, r8
mov r12, qword ptr [r4_avx2_add]
xor r12, r8
mov r13, qword ptr [r5_avx2_add]
xor r13, r8
mov r14, qword ptr [r6_avx2_add]
xor r14, r8
mov r15, qword ptr [r7_avx2_add]
xor r15, r8
;# init AVX registers (lanes 1-4)
;# ymm0 = rbp broadcast to all four lanes, plus the increments 2, 3, 4, 5
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments]
;# ymm0 *= r0_avx2_mul
;# Low 64 bits of the product built from 32-bit multiplies:
;# lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32), with a = ymm0 and b = ymm1
vbroadcastsd ymm1, qword ptr [r0_avx2_mul]
vpsrlq ymm8, ymm0, 32
vpsrlq ymm9, ymm1, 32
vpmuludq ymm10, ymm0, ymm1
vpmuludq ymm11, ymm9, ymm0
vpmuludq ymm0, ymm8, ymm1
vpsllq ymm11, ymm11, 32
vpsllq ymm0, ymm0, 32
vpaddq ymm10, ymm10, ymm11
vpaddq ymm0, ymm10, ymm0
;# ymm1-ymm7 = ymm0 XOR the matching broadcast rN_avx2_add constant
vbroadcastsd ymm1, qword ptr [r1_avx2_add]
vpxor ymm1, ymm0, ymm1
vbroadcastsd ymm2, qword ptr [r2_avx2_add]
vpxor ymm2, ymm0, ymm2
vbroadcastsd ymm3, qword ptr [r3_avx2_add]
vpxor ymm3, ymm0, ymm3
vbroadcastsd ymm4, qword ptr [r4_avx2_add]
vpxor ymm4, ymm0, ymm4
vbroadcastsd ymm5, qword ptr [r5_avx2_add]
vpxor ymm5, ymm0, ymm5
vbroadcastsd ymm6, qword ptr [r6_avx2_add]
vpxor ymm6, ymm0, ymm6
vbroadcastsd ymm7, qword ptr [r7_avx2_add]
vpxor ymm7, ymm0, ymm7
;# Constants kept in ymm15 and ymm14 for the generated code
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32)
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
randomx_dataset_init_avx2_prologue ENDP
;# generated SuperscalarHash code goes here
randomx_dataset_init_avx2_loop_end PROC
include asm/program_sshash_avx2_loop_end.inc
randomx_dataset_init_avx2_loop_end ENDP
randomx_dataset_init_avx2_epilogue PROC
include asm/program_sshash_avx2_epilogue.inc
randomx_dataset_init_avx2_epilogue ENDP
randomx_dataset_init_avx2_ssh_load PROC
include asm/program_sshash_avx2_ssh_load.inc
randomx_dataset_init_avx2_ssh_load ENDP
randomx_dataset_init_avx2_ssh_prefetch PROC
include asm/program_sshash_avx2_ssh_prefetch.inc
randomx_dataset_init_avx2_ssh_prefetch ENDP
randomx_program_epilogue PROC
include asm/program_epilogue_store.inc
include asm/program_epilogue_win64.inc

View file

@ -44,6 +44,11 @@ extern "C" {
void randomx_program_loop_store();
void randomx_program_loop_end();
void randomx_dataset_init();
void randomx_dataset_init_avx2_prologue();
void randomx_dataset_init_avx2_loop_end();
void randomx_dataset_init_avx2_epilogue();
void randomx_dataset_init_avx2_ssh_load();
void randomx_dataset_init_avx2_ssh_prefetch();
void randomx_program_epilogue();
void randomx_sshash_load();
void randomx_sshash_prefetch();

View file

@ -19,6 +19,7 @@
#include "crypto/rx/RxDataset.h"
#include "backend/cpu/Cpu.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include "base/kernel/Platform.h"
@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache,
{
Platform::setThreadPriority(priority);
randomx_init_dataset(dataset, cache, startItem, itemCount);
// The AVX2 dataset-init code advances 5 items per loop iteration, so on AVX2 CPUs it is
// only handed item counts that are a multiple of 5: first the range rounded down to a
// multiple of 5, then the last 5 items of the range (overlapping the first call) to cover
// the remainder.
// NOTE(review): when itemCount < 5 the expression startItem + itemCount - 5 lands before
// startItem (or wraps if the type is unsigned) - confirm callers never pass such a small count.
if (Cpu::info()->hasAVX2() && (itemCount % 5)) {
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5);
}
else {
randomx_init_dataset(dataset, cache, startItem, itemCount);
}
}