Merge pull request #1986 from SChernykh/dev

Dataset initialization with AVX2 (faster startup)
This commit is contained in:
xmrig 2020-12-20 00:16:20 +07:00 committed by GitHub
commit e79e3370f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 839 additions and 104 deletions

View file

@ -1,3 +1,4 @@
@echo off
:: Run from the folder that contains this script so xmrig.exe is found.
:: /d also switches the drive (needed when the script is started from another drive,
:: e.g. via "Run as administrator"); the quotes protect paths with special characters.
cd /d "%~dp0"
xmrig.exe --bench=10M --submit
pause

View file

@ -1,3 +1,4 @@
@echo off
:: Run from the folder that contains this script so xmrig.exe is found.
:: /d also switches the drive (needed when the script is started from another drive,
:: e.g. via "Run as administrator"); the quotes protect paths with special characters.
cd /d "%~dp0"
xmrig.exe --bench=1M --submit
pause

View file

@ -15,5 +15,6 @@
:: Choose pools outside of the top 5 to help the Monero network be more decentralized!
:: Smaller pools also often have smaller fees/payout limits.
:: Run from the folder that contains this script so xmrig.exe is found.
:: /d also switches the drive when the script is started from another drive;
:: the quotes protect paths with special characters.
cd /d "%~dp0"
xmrig.exe -o pool.hashvault.pro:3333 -u 48edfHu7V9Z84YzzMa6fUueoELZ9ZRXq9VetWzYGzKt52XU5xvqgzYnDK9URnRoJMk1j8nLwEVsaSWJ4fhdUyZijBGUicoD -p x
pause

View file

@ -11,5 +11,6 @@
:: Mining solo is the best way to help the Monero network be more decentralized!
:: But you will only get a payout when you find a block, which can take more than a year for a single low-end PC.
:: Run from the folder that contains this script so xmrig.exe is found.
:: /d also switches the drive when the script is started from another drive;
:: the quotes protect paths with special characters.
cd /d "%~dp0"
xmrig.exe -o node.xmr.to:18081 -a rx/0 -u 48edfHu7V9Z84YzzMa6fUueoELZ9ZRXq9VetWzYGzKt52XU5xvqgzYnDK9URnRoJMk1j8nLwEVsaSWJ4fhdUyZijBGUicoD --daemon
pause

View file

@ -214,13 +214,6 @@ void xmrig::Workers<T>::start(const std::vector<T> &data, bool sleep)
for (auto worker : m_workers) {
worker->start(Workers<T>::onReady);
// This sleep is important for optimal caching!
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
// Sub-optimal caching can result in up to 0.5% hashrate penalty
if (sleep) {
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
}
}

View file

@ -40,6 +40,14 @@ public:
VENDOR_AMD
};
enum Arch : uint32_t {
ARCH_UNKNOWN,
ARCH_ZEN,
ARCH_ZEN_PLUS,
ARCH_ZEN2,
ARCH_ZEN3
};
enum MsrMod : uint32_t {
MSR_MOD_NONE,
MSR_MOD_RYZEN_17H,
@ -53,6 +61,7 @@ public:
enum Flag : uint32_t {
FLAG_AES,
FLAG_AVX,
FLAG_AVX2,
FLAG_AVX512F,
FLAG_BMI2,
@ -80,9 +89,11 @@ public:
virtual Assembly::Id assembly() const = 0;
virtual bool has(Flag feature) const = 0;
virtual bool hasAES() const = 0;
virtual bool hasAVX() const = 0;
virtual bool hasAVX2() const = 0;
virtual bool hasBMI2() const = 0;
virtual bool hasOneGbPages() const = 0;
virtual bool hasXOP() const = 0;
virtual bool hasCatL3() const = 0;
virtual bool isVM() const = 0;
virtual const char *backend() const = 0;
@ -97,6 +108,7 @@ public:
virtual size_t packages() const = 0;
virtual size_t threads() const = 0;
virtual Vendor vendor() const = 0;
virtual Arch arch() const = 0;
virtual bool jccErratum() const = 0;
};

View file

@ -52,8 +52,8 @@
namespace xmrig {
constexpr size_t kCpuFlagsSize = 13;
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
constexpr size_t kCpuFlagsSize = 14;
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
@ -134,11 +134,12 @@ static inline uint64_t xgetbv()
#endif
}
static inline bool has_xcr_avx2() { return (xgetbv() & 0x06) == 0x06; }
// True when both bits of mask 0x06 are set in the value returned by xgetbv().
// NOTE(review): xgetbv() is defined outside this view; assuming it reads XCR0, these are the
// SSE-state and AVX-state bits (per the Intel SDM), i.e. the OS has enabled ymm register state.
static inline bool has_xcr_avx() { return (xgetbv() & 0x06) == 0x06; }
static inline bool has_xcr_avx512() { return (xgetbv() & 0xE6) == 0xE6; }
static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 27); }
static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); }
static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx2(); }
// AVX is reported as usable only when all three hold: the CPU feature bit (bit 28 of the
// PROCESSOR_INFO ECX word) is set, OSXSAVE is set, and has_xcr_avx() confirms the register state is enabled.
static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); }
static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); }
static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); }
static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); }
static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); }
@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
cpu_brand_string(m_brand);
m_flags.set(FLAG_AES, has_aes_ni());
m_flags.set(FLAG_AVX, has_avx());
m_flags.set(FLAG_AVX2, has_avx2());
m_flags.set(FLAG_AVX512F, has_avx512f());
m_flags.set(FLAG_BMI2, has_bmi2());
@ -215,9 +217,27 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
switch (m_family) {
case 0x17:
m_msrMod = MSR_MOD_RYZEN_17H;
switch (m_model) {
case 1:
case 17:
case 32:
m_arch = ARCH_ZEN;
break;
case 8:
case 24:
m_arch = ARCH_ZEN_PLUS;
break;
case 49:
case 96:
case 113:
case 144:
m_arch = ARCH_ZEN2;
break;
}
break;
case 0x19:
m_arch = ARCH_ZEN3;
m_msrMod = MSR_MOD_RYZEN_19H;
break;

View file

@ -48,9 +48,11 @@ protected:
inline Assembly::Id assembly() const override { return m_assembly; }
inline bool has(Flag flag) const override { return m_flags.test(flag); }
inline bool hasAES() const override { return has(FLAG_AES); }
inline bool hasAVX() const override { return has(FLAG_AVX); }
inline bool hasAVX2() const override { return has(FLAG_AVX2); }
inline bool hasBMI2() const override { return has(FLAG_BMI2); }
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
inline bool hasXOP() const override { return has(FLAG_XOP); }
inline bool hasCatL3() const override { return has(FLAG_CAT_L3); }
inline bool isVM() const override { return has(FLAG_VM); }
inline const char *brand() const override { return m_brand; }
@ -62,12 +64,14 @@ protected:
inline size_t packages() const override { return 1; }
inline size_t threads() const override { return m_threads; }
inline Vendor vendor() const override { return m_vendor; }
inline Arch arch() const override { return m_arch; }
inline bool jccErratum() const override { return m_jccErratum; }
protected:
char m_brand[64 + 6]{};
size_t m_threads;
Vendor m_vendor = VENDOR_UNKNOWN;
Arch m_arch = ARCH_UNKNOWN;
bool m_jccErratum = false;
private:

View file

@ -16,6 +16,7 @@
"title": true,
"randomx": {
"init": -1,
"init-avx2": -1,
"mode": "auto",
"1gb-pages": false,
"rdmsr": true,

View file

@ -50,6 +50,7 @@ R"===(
"colors": true,
"randomx": {
"init": -1,
"init-avx2": -1,
"mode": "auto",
"1gb-pages": false,
"rdmsr": true,

View file

@ -0,0 +1,28 @@
;# Constant data for the AVX2 dataset-init code. This is data only, never executed:
;# the AVX2 prologue includes this file right after its unconditional jump to the loop.
;# r0_avx2_increments: four little-endian 64-bit values 2, 3, 4, 5.
r0_avx2_increments:
db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
;# mul_hi_avx2_data: one 64-bit value with only bit 32 set (0x0000000100000000).
mul_hi_avx2_data:
db 0,0,0,0,1,0,0,0
;# The entries below are 64-bit constants stored as little-endian bytes; the ";#/" comment
;# under each label gives the value in decimal (checked for r0_avx2_mul = 0x5851F42D4C957F2D).
;# NOTE(review): presumably the multiplier/addends used to initialize registers r0..r7 — confirm against the prologue.
r0_avx2_mul:
;#/ 6364136223846793005
db 45, 127, 149, 76, 45, 244, 81, 88
r1_avx2_add:
;#/ 9298411001130361340
db 252, 161, 245, 89, 138, 151, 10, 129
r2_avx2_add:
;#/ 12065312585734608966
db 70, 216, 194, 56, 223, 153, 112, 167
r3_avx2_add:
;#/ 9306329213124626780
db 92, 73, 34, 191, 28, 185, 38, 129
r4_avx2_add:
;#/ 5281919268842080866
db 98, 138, 159, 23, 151, 37, 77, 73
r5_avx2_add:
;#/ 10536153434571861004
db 12, 236, 170, 206, 185, 239, 55, 146
r6_avx2_add:
;#/ 3398623926847679864
db 120, 45, 230, 108, 116, 86, 42, 47
r7_avx2_add:
;#/ 9549104520008361294
db 78, 229, 44, 182, 247, 59, 133, 132

View file

@ -0,0 +1,31 @@
;# Epilogue of the AVX2 dataset-init routine: unwinds the stack and restores the saved registers.
;# Release the 32-byte scratch area reserved by the prologue (sub rsp, 32).
add rsp, 32
;# Remove the max. block index pushed by the prologue; the popped value is not used afterwards here.
pop r9
;# Restore xmm0-xmm15 from the 256-byte save area (16 registers x 16 bytes).
movdqu xmm0, xmmword ptr [rsp]
movdqu xmm1, xmmword ptr [rsp + 16]
movdqu xmm2, xmmword ptr [rsp + 32]
movdqu xmm3, xmmword ptr [rsp + 48]
movdqu xmm4, xmmword ptr [rsp + 64]
movdqu xmm5, xmmword ptr [rsp + 80]
movdqu xmm6, xmmword ptr [rsp + 96]
movdqu xmm7, xmmword ptr [rsp + 112]
movdqu xmm8, xmmword ptr [rsp + 128]
movdqu xmm9, xmmword ptr [rsp + 144]
movdqu xmm10, xmmword ptr [rsp + 160]
movdqu xmm11, xmmword ptr [rsp + 176]
movdqu xmm12, xmmword ptr [rsp + 192]
movdqu xmm13, xmmword ptr [rsp + 208]
movdqu xmm14, xmmword ptr [rsp + 224]
movdqu xmm15, xmmword ptr [rsp + 240]
;# Zero the upper 128 bits of all ymm registers before returning to the caller.
vzeroupper
add rsp, 256
;# Restore the general-purpose registers in the reverse of the order they were pushed.
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
ret

View file

@ -0,0 +1,37 @@
;# Start of one AVX2 dataset-init loop iteration, which handles 5 consecutive 64-byte items.
;# Register roles set up by the prologue: rdi = cache memory base, rsi = dataset output pointer,
;# rbp = index of the first item of this iteration.
;# prefetch RandomX dataset lines
prefetchnta byte ptr [rsi]
prefetchnta byte ptr [rsi+64]
prefetchnta byte ptr [rsi+128]
prefetchnta byte ptr [rsi+192]
prefetchnta byte ptr [rsi+256]
;# prefetch RandomX cache lines
;# Item 0: cache line address = rdi + ((rbp & RANDOMX_CACHE_MASK) << 6); it stays in rbx.
mov rbx, rbp
and rbx, RANDOMX_CACHE_MASK
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
;# Items 1-4: same computation for rbp+1 .. rbp+4. The addresses are kept in the
;# 32-byte scratch area at [rsp] .. [rsp+24] for the later cache-line load step.
lea rax, [rbp+1]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp], rax
lea rax, [rbp+2]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+8], rax
lea rax, [rbp+3]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+16], rax
lea rax, [rbp+4]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+24], rax

View file

@ -0,0 +1,38 @@
;# End of one AVX2 dataset-init loop iteration: writes 5 x 64 bytes of results at [rsi] and loops.
;# r8-r15 are stored directly as the first item ([rsi+0] .. [rsi+63]).
;# ymm0-ymm7 each hold one register (r0..r7) for the other four items, one item per 64-bit lane,
;# so they are transposed (vpunpck*qdq + vperm2i128) to make each item's eight values contiguous.
;# The scalar stores are interleaved with the first stage of the vector transpose.
mov qword ptr [rsi+0], r8
vpunpcklqdq ymm8, ymm0, ymm1
mov qword ptr [rsi+8], r9
vpunpcklqdq ymm9, ymm2, ymm3
mov qword ptr [rsi+16], r10
vpunpcklqdq ymm10, ymm4, ymm5
mov qword ptr [rsi+24], r11
vpunpcklqdq ymm11, ymm6, ymm7
mov qword ptr [rsi+32], r12
vpunpckhqdq ymm12, ymm0, ymm1
mov qword ptr [rsi+40], r13
vpunpckhqdq ymm13, ymm2, ymm3
mov qword ptr [rsi+48], r14
vpunpckhqdq ymm14, ymm4, ymm5
mov qword ptr [rsi+56], r15
vpunpckhqdq ymm15, ymm6, ymm7
;# Second item (lane 0 of every register) -> [rsi+64] .. [rsi+127]
vperm2i128 ymm0, ymm8, ymm9, 32
vperm2i128 ymm1, ymm10, ymm11, 32
vmovdqu ymmword ptr [rsi+64], ymm0
vmovdqu ymmword ptr [rsi+96], ymm1
;# Third item (lane 1) -> [rsi+128] .. [rsi+191]
vperm2i128 ymm2, ymm12, ymm13, 32
vperm2i128 ymm3, ymm14, ymm15, 32
vmovdqu ymmword ptr [rsi+128], ymm2
vmovdqu ymmword ptr [rsi+160], ymm3
;# Fourth item (lane 2) -> [rsi+192] .. [rsi+255]
vperm2i128 ymm4, ymm8, ymm9, 49
vperm2i128 ymm5, ymm10, ymm11, 49
vmovdqu ymmword ptr [rsi+192], ymm4
vmovdqu ymmword ptr [rsi+224], ymm5
;# Fifth item (lane 3) -> [rsi+256] .. [rsi+319]
vperm2i128 ymm6, ymm12, ymm13, 49
vperm2i128 ymm7, ymm14, ymm15, 49
vmovdqu ymmword ptr [rsi+256], ymm6
vmovdqu ymmword ptr [rsi+288], ymm7
;# Advance to the next group of 5 items (5 x 64 = 320 bytes).
add rbp, 5
add rsi, 320
;# [rsp+32] is the max. block index pushed by the prologue; it sits just above the 32-byte scratch area.
cmp rbp, qword ptr [rsp+32]
;# The rel32 displacement below is a placeholder: JitCompilerX86::generateSuperscalarHash()
;# overwrites the last 4 bytes so the branch jumps back to the start of the loop.
db 15, 130, 0, 0, 0, 0 ;# jb rel32

View file

@ -0,0 +1,27 @@
;# Entry sequence of the AVX2 dataset-init routine: saves registers so the epilogue can restore them.
;# General-purpose registers are pushed here and popped in reverse order by the epilogue.
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
;# save all XMM registers just to be safe for all calling conventions
;# 256 bytes = 16 registers x 16 bytes; unaligned stores are used, so no stack alignment is required.
sub rsp, 256
movdqu xmmword ptr [rsp], xmm0
movdqu xmmword ptr [rsp + 16], xmm1
movdqu xmmword ptr [rsp + 32], xmm2
movdqu xmmword ptr [rsp + 48], xmm3
movdqu xmmword ptr [rsp + 64], xmm4
movdqu xmmword ptr [rsp + 80], xmm5
movdqu xmmword ptr [rsp + 96], xmm6
movdqu xmmword ptr [rsp + 112], xmm7
movdqu xmmword ptr [rsp + 128], xmm8
movdqu xmmword ptr [rsp + 144], xmm9
movdqu xmmword ptr [rsp + 160], xmm10
movdqu xmmword ptr [rsp + 176], xmm11
movdqu xmmword ptr [rsp + 192], xmm12
movdqu xmmword ptr [rsp + 208], xmm13
movdqu xmmword ptr [rsp + 224], xmm14
movdqu xmmword ptr [rsp + 240], xmm15

View file

@ -0,0 +1,50 @@
;# XORs one 64-byte cache line per item into the AVX2 register state for items 1-4.
;# On entry [rsp] .. [rsp+24] hold the four cache-line addresses, and ymm0-ymm7 each hold
;# one register (r0..r7) for the four items (one item per 64-bit lane).
;# In the comments below rN[k] means register rN of item k.
;# rbx and ymm14 are used as temporaries, so they are saved in a 40-byte frame and restored at the end.
sub rsp, 40
mov [rsp], rbx
vmovdqu ymmword ptr [rsp+8], ymm14
;# The four addresses are now 40 bytes higher because of the frame reserved above.
mov rax, [rsp+40]
mov rbx, [rsp+48]
mov rcx, [rsp+56]
mov rdx, [rsp+64]
;# First half of each cache line (bytes 0..31): values for r0..r3.
vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1]
vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2]
vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3]
vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4]
vpxor ymm0, ymm0, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4]
vpxor ymm2, ymm2, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4]
vpxor ymm1, ymm1, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4]
vpxor ymm3, ymm3, ymm14
;# Second half of each cache line (bytes 32..63): values for r4..r7.
vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1]
vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2]
vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3]
vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4]
vpxor ymm4, ymm4, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4]
vpxor ymm6, ymm6, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4]
vpxor ymm5, ymm5, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4]
vpxor ymm7, ymm7, ymm14
;# Restore the temporaries and release the frame.
mov rbx, [rsp]
vmovdqu ymm14, ymmword ptr [rsp+8]
add rsp, 40

View file

@ -0,0 +1,29 @@
;# Computes and prefetches the next cache line for items 1-4.
;# The four 64-bit lanes of one ymm register are spilled to [rsp] .. [rsp+24]; each value is then
;# replaced in place by rdi + ((value & RANDOMX_CACHE_MASK) << 6) and that address is prefetched.
;# ymm0 in the first instruction is only a template: JitCompilerX86::generateSuperscalarHash() adds
;# (address register << 3) to byte 3 of the emitted copy of this code to select the register to spill.
vmovdqu ymmword ptr [rsp], ymm0
;# Lane 0
mov rax, [rsp]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp], rax
prefetchnta byte ptr [rax]
;# Lane 1
mov rax, [rsp+8]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+8], rax
prefetchnta byte ptr [rax]
;# Lane 2
mov rax, [rsp+16]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+16], rax
prefetchnta byte ptr [rax]
;# Lane 3
mov rax, [rsp+24]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+24], rax
prefetchnta byte ptr [rax]

View file

@ -36,12 +36,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/virtual_memory.hpp"
static bool hugePagesJIT = false;
static int optimizedDatasetInit = -1;
// Stores the global huge-pages-for-JIT switch. JitCompilerA64 combines it with its own
// hugePagesEnable constructor argument (both must be true).
void randomx_set_huge_pages_jit(bool hugePages)
{
hugePagesJIT = hugePages;
}
// Stores the requested optimized dataset-init mode in the file-level optimizedDatasetInit variable.
// NOTE(review): no reader of this variable is visible in this file's hunks; the A64 constructor
// ignores its second (optimized init) argument — confirm whether the value is used on this platform.
void randomx_set_optimized_dataset_init(int value)
{
optimizedDatasetInit = value;
}
namespace ARMV8A {
constexpr uint32_t B = 0x14000000;
@ -98,7 +104,7 @@ static size_t CalcDatasetItemSize()
constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
JitCompilerA64::JitCompilerA64(bool hugePagesEnable) :
JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) :
hugePages(hugePagesJIT && hugePagesEnable),
literalPos(ImulRcpLiteralsEnd)
{

View file

@ -47,7 +47,7 @@ namespace randomx {
class JitCompilerA64 {
public:
explicit JitCompilerA64(bool hugePagesEnable);
explicit JitCompilerA64(bool hugePagesEnable, bool optimizedInitDatasetEnable);
~JitCompilerA64();
void prepare() {}

View file

@ -35,3 +35,6 @@ void randomx_set_huge_pages_jit(bool)
{
}
// Intentionally a no-op in this build variant: the argument is unnamed and ignored.
void randomx_set_optimized_dataset_init(int)
{
}

View file

@ -43,7 +43,7 @@ namespace randomx {
class JitCompilerFallback {
public:
explicit JitCompilerFallback(bool) {
explicit JitCompilerFallback(bool, bool) {
throw std::runtime_error("JIT compilation is not supported on this platform");
}
void prepare() {}

View file

@ -49,17 +49,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif
static bool hugePagesJIT = false;
static int optimizedDatasetInit = -1;
// Stores the global huge-pages-for-JIT switch in the file-level hugePagesJIT variable.
void randomx_set_huge_pages_jit(bool hugePages)
{
hugePagesJIT = hugePages;
}
// Sets the AVX2 dataset-init mode consulted by the JitCompilerX86 constructor:
// negative = auto detect, 0 = always disabled, positive = always enabled.
// Must be called before the compiler object is constructed to have any effect on it.
void randomx_set_optimized_dataset_init(int value)
{
optimizedDatasetInit = value;
}
namespace randomx {
/*
@ -116,6 +120,11 @@ namespace randomx {
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
@ -132,7 +141,12 @@ namespace randomx {
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeEpilogue - codeDatasetInit)
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
#define epilogueSize (codeShhLoad - codeEpilogue)
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
@ -192,17 +206,6 @@ namespace randomx {
xmrig::VirtualMemory::protectRX(p1, p2 - p1);
}
static inline void cpuid(uint32_t level, int32_t output[4])
{
memset(output, 0, sizeof(int32_t) * 4);
# ifdef _MSC_VER
__cpuid(output, static_cast<int>(level));
# else
__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
# endif
}
# ifdef _MSC_VER
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
# else
@ -212,17 +215,59 @@ namespace randomx {
static std::atomic<size_t> codeOffset;
constexpr size_t codeOffsetIncrement = 59 * 64;
JitCompilerX86::JitCompilerX86(bool hugePagesEnable) {
JitCompilerX86::JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable) {
BranchesWithin32B = xmrig::Cpu::info()->jccErratum();
int32_t info[4];
cpuid(1, info);
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
hasAVX = xmrig::Cpu::info()->hasAVX();
hasAVX2 = xmrig::Cpu::info()->hasAVX2();
cpuid(0x80000001, info);
hasXOP = ((info[2] & (1 << 11)) != 0);
// Disable by default
initDatasetAVX2 = false;
allocatedSize = CodeSize * 2;
if (optimizedInitDatasetEnable) {
// Dataset init using AVX2:
// -1 = Auto detect
// 0 = Always disabled
// +1 = Always enabled
if (optimizedDatasetInit > 0) {
initDatasetAVX2 = true;
}
else if (optimizedDatasetInit < 0) {
xmrig::ICpuInfo::Vendor vendor = xmrig::Cpu::info()->vendor();
xmrig::ICpuInfo::Arch arch = xmrig::Cpu::info()->arch();
if (vendor == xmrig::ICpuInfo::VENDOR_INTEL) {
// AVX2 init is faster on Intel CPUs without HT
initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
}
else if (vendor == xmrig::ICpuInfo::VENDOR_AMD) {
switch (arch) {
case xmrig::ICpuInfo::ARCH_ZEN:
case xmrig::ICpuInfo::ARCH_ZEN_PLUS:
// AVX2 init is slower on Zen/Zen+
initDatasetAVX2 = false;
break;
case xmrig::ICpuInfo::ARCH_ZEN2:
// AVX2 init is faster on Zen2 without SMT (mobile CPUs)
initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
break;
case xmrig::ICpuInfo::ARCH_ZEN3:
// AVX2 init is faster on Zen3
initDatasetAVX2 = true;
break;
}
}
}
}
// Sorry, low-end Intel CPUs
if (!hasAVX2) {
initDatasetAVX2 = false;
}
hasXOP = xmrig::Cpu::info()->hasXOP();
allocatedSize = initDatasetAVX2 ? (CodeSize * 4) : (CodeSize * 2);
allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
# ifdef XMRIG_SECURE_JIT
false
@ -304,14 +349,49 @@ namespace randomx {
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
uint8_t* p = code;
if (initDatasetAVX2) {
codePos = 0;
emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<true>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
codePos += 3;
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
uint8_t* p = code + codePos;
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
p[3] += prog.getAddressRegister() << 3;
}
}
emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
constexpr int32_t prologue_size = 320;
*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
return;
}
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
codePos = superScalarHashOffset + codeSshInitSize;
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
generateSuperscalarCode(instr);
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<false>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
@ -326,7 +406,10 @@ namespace randomx {
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);
void JitCompilerX86::generateDatasetInitCode() {
memcpy(code, codeDatasetInit, datasetInitSize);
// AVX2 code is generated in generateSuperscalarHash()
if (!initDatasetAVX2) {
memcpy(code, codeDatasetInit, datasetInitSize);
}
}
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
@ -405,85 +488,243 @@ namespace randomx {
emit32(epilogueOffset - codePos - 4, code, codePos);
}
void JitCompilerX86::generateSuperscalarCode(Instruction& instr) {
static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
static constexpr uint8_t REX_81[] = { 0x49, 0x81 };
static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d };
static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
template<bool AVX2>
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) {
switch ((SuperscalarInstructionType)instr.opcode)
{
case randomx::SuperscalarInstructionType::ISUB_R:
emit(REX_SUB_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_R:
emit(REX_XOR_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IADD_RS:
emit(REX_LEA, code, codePos);
emitByte(0x04 + 8 * instr.dst, code, codePos);
genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
if (AVX2) {
if (instr.getModShift()) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.src;
p[4] = instr.getModShift();
p[8] += instr.dst * 9;
}
else {
emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
}
break;
case randomx::SuperscalarInstructionType::IMUL_R:
emit(REX_IMUL_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x35, 0xF4, 0xD8,
0xC5, 0xBD, 0xF4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.dst;
p[21] += instr.dst * 8 + instr.src;
p[29] -= instr.dst * 8;
p[31] += instr.dst;
p[41] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::IROR_C:
emit(REX_ROT_I8, code, codePos);
emitByte(0xc8 + instr.dst, code, codePos);
emitByte(instr.getImm32() & 63, code, codePos);
{
const uint32_t shift = instr.getImm32() & 63;
emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[4] = shift;
p[8] += instr.dst;
p[9] = 64 - shift;
p[14] += instr.dst * 8;
}
}
break;
case randomx::SuperscalarInstructionType::IADD_C7:
case randomx::SuperscalarInstructionType::IADD_C8:
case randomx::SuperscalarInstructionType::IADD_C9:
emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_C7:
case randomx::SuperscalarInstructionType::IXOR_C8:
case randomx::SuperscalarInstructionType::IXOR_C9:
emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe0 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC5, 0xBD, 0xD4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC5, 0xA5, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[27] += instr.dst * 8;
p[67] += instr.dst * 9;
p[77] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::ISMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe8 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0x41, 0x3D, 0xF4, 0xE9,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC4, 0x41, 0x15, 0xD4, 0xE8,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC4, 0x41, 0x15, 0xD4, 0xC3,
0xC4, 0x41, 0x35, 0xEF, 0xC9,
0xC4, 0x62, 0x35, 0x37, 0xD0,
0xC4, 0x62, 0x35, 0x37, 0xD8,
0xC5, 0x2D, 0xDB, 0xD0,
0xC5, 0x25, 0xDB, 0xD8,
0xC4, 0x41, 0x3D, 0xFB, 0xC2,
0xC4, 0xC1, 0x3D, 0xFB, 0xC3
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[89] += instr.dst;
p[94] += instr.src;
p[98] += instr.src;
p[102] += instr.dst;
p[112] += instr.dst * 8;
}
break;
case randomx::SuperscalarInstructionType::IMUL_RCP:
emit(MOV_RAX_I, code, codePos);
*(uint32_t*)(code + codePos) = 0x0000B848UL;
codePos += 2;
emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
emit(REX_IMUL_RM, code, codePos);
emitByte(0xc0 + 8 * instr.dst, code, codePos);
emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
0xC4, 0x41, 0x7D, 0xF4, 0xD4,
0xC5, 0x35, 0xF4, 0xD8,
0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[12] += instr.dst;
p[22] -= instr.dst * 8;
p[28] += instr.dst;
p[33] += instr.dst * 8;
p[41] -= instr.dst * 8;
p[43] += instr.dst;
p[53] += instr.dst * 9;
}
break;
default:
UNREACHABLE;
}
}
// Explicit instantiations of both variants: <false> emits only the scalar instructions,
// <true> additionally emits the AVX2 instruction sequences after each scalar one.
template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);
template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);
@ -563,10 +804,6 @@ namespace randomx {
codePos = pos;
}
void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) {
emitByte((scale << 6) | (index << 3) | base, code, codePos);
}
void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
uint8_t* const p = code;
uint32_t pos = codePos;

View file

@ -49,7 +49,7 @@ namespace randomx {
class JitCompilerX86 {
public:
explicit JitCompilerX86(bool hugePagesEnable);
explicit JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable);
~JitCompilerX86();
void prepare();
void generateProgram(Program&, ProgramConfiguration&, uint32_t);
@ -96,6 +96,8 @@ namespace randomx {
bool BranchesWithin32B = false;
bool hasAVX;
bool hasAVX2;
bool initDatasetAVX2;
bool hasXOP;
uint8_t* allocatedCode = nullptr;
@ -107,9 +109,10 @@ namespace randomx {
static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos);
static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos);
static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos);
static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos);
static uint32_t genSIB(int scale, int index, int base) { return (scale << 6) | (index << 3) | base; }
void generateSuperscalarCode(Instruction &);
template<bool AVX2>
void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos);
static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) {
code[codePos] = val;

View file

@ -52,6 +52,11 @@
.global DECL(randomx_program_loop_store)
.global DECL(randomx_program_loop_end)
.global DECL(randomx_dataset_init)
.global DECL(randomx_dataset_init_avx2_prologue)
.global DECL(randomx_dataset_init_avx2_loop_end)
.global DECL(randomx_dataset_init_avx2_epilogue)
.global DECL(randomx_dataset_init_avx2_ssh_load)
.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_sshash_load)
.global DECL(randomx_sshash_prefetch)
@ -192,6 +197,98 @@ call_offset:
pop rbx
ret
.balign 64
;# AVX2 dataset initialization: entry point plus the register setup that runs
;# at the top of every loop iteration. The instructions that belong between
;# this block and randomx_dataset_init_avx2_loop_end are not in this file
;# (see the marker comment before that label).
;# Inputs, as read from the first four integer argument registers:
;#   1: pointer whose first qword is cache->memory
;#   2: dataset
;#   3: block index
;#   4: max. block index
DECL(randomx_dataset_init_avx2_prologue):
#include "asm/program_sshash_avx2_save_registers.inc"
#if defined(WINABI)
;# here the four inputs arrive in rcx, rdx, r8, r9
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
#else
;# here the four inputs arrive in rdi, rsi, rdx, rcx
mov rdi, qword ptr [rdi] ;# cache->memory
;# dataset in rsi
mov rbp, rdx ;# block index
push rcx ;# max. block index
#endif
;# reserve 32 bytes of stack below the pushed max. block index
;# NOTE(review): the code that uses this area lives in the included .inc files - confirm there
sub rsp, 32
;# jump over the data assembled inline by the constants include that follows
jmp randomx_dataset_init_avx2_prologue_loop_begin
#include "asm/program_sshash_avx2_constants.inc"
.balign 64
randomx_dataset_init_avx2_prologue_loop_begin:
#include "asm/program_sshash_avx2_loop_begin.inc"
;# init integer registers (lane 0)
;# r8 = (block index + 1) * r0_avx2_mul, then r9..r15 = r8 XOR rN_avx2_add
lea r8, [rbp+1]
imul r8, qword ptr [r0_avx2_mul+rip]
mov r9, qword ptr [r1_avx2_add+rip]
xor r9, r8
mov r10, qword ptr [r2_avx2_add+rip]
xor r10, r8
mov r11, qword ptr [r3_avx2_add+rip]
xor r11, r8
mov r12, qword ptr [r4_avx2_add+rip]
xor r12, r8
mov r13, qword ptr [r5_avx2_add+rip]
xor r13, r8
mov r14, qword ptr [r6_avx2_add+rip]
xor r14, r8
mov r15, qword ptr [r7_avx2_add+rip]
xor r15, r8
;# init AVX registers (lanes 1-4)
;# ymm0 = block index broadcast to all four 64-bit lanes, plus the per-lane
;# values stored at r0_avx2_increments
;# NOTE(review): movq below is the non-VEX encoding placed between VEX
;# instructions; vmovq would keep the sequence VEX-only - confirm whether that matters
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
;# ymm0 *= r0_avx2_mul
;# vpmuludq only multiplies the low 32 bits of each 64-bit lane, so the low
;# 64 bits of the product are assembled from three partial products:
;#   lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32)
vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
vpsrlq ymm8, ymm0, 32
vpsrlq ymm9, ymm1, 32
vpmuludq ymm10, ymm0, ymm1
vpmuludq ymm11, ymm9, ymm0
vpmuludq ymm0, ymm8, ymm1
vpsllq ymm11, ymm11, 32
vpsllq ymm0, ymm0, 32
vpaddq ymm10, ymm10, ymm11
vpaddq ymm0, ymm10, ymm0
;# ymm1..ymm7 = ymm0 XOR broadcast(rN_avx2_add), mirroring r9..r15 above
vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
vpxor ymm1, ymm0, ymm1
vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
vpxor ymm2, ymm0, ymm2
vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
vpxor ymm3, ymm0, ymm3
vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
vpxor ymm4, ymm0, ymm4
vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
vpxor ymm5, ymm0, ymm5
vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
vpxor ymm6, ymm0, ymm6
vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
vpxor ymm7, ymm0, ymm7
;# constants kept in ymm15 / ymm14 for the code that follows
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
;# generated SuperscalarHash code goes here
;# Each label below marks the start of a fragment whose instructions come
;# from the include file named on the next line.
;# NOTE(review): presumably these labels let the C++ side locate each fragment - confirm against the JIT compiler
DECL(randomx_dataset_init_avx2_loop_end):
#include "asm/program_sshash_avx2_loop_end.inc"
DECL(randomx_dataset_init_avx2_epilogue):
#include "asm/program_sshash_avx2_epilogue.inc"
DECL(randomx_dataset_init_avx2_ssh_load):
#include "asm/program_sshash_avx2_ssh_load.inc"
DECL(randomx_dataset_init_avx2_ssh_prefetch):
#include "asm/program_sshash_avx2_ssh_prefetch.inc"
.balign 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_store.inc"

View file

@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
PUBLIC randomx_dataset_init_avx2_prologue
PUBLIC randomx_dataset_init_avx2_loop_end
PUBLIC randomx_dataset_init_avx2_epilogue
PUBLIC randomx_dataset_init_avx2_ssh_load
PUBLIC randomx_dataset_init_avx2_ssh_prefetch
PUBLIC randomx_program_loop_store
PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
@ -183,6 +188,95 @@ init_block_loop:
randomx_dataset_init ENDP
ALIGN 64
;# AVX2 dataset initialization (MASM build): entry point plus the register
;# setup that runs at the top of every loop iteration. The instructions that
;# belong between this PROC and randomx_dataset_init_avx2_loop_end are not in
;# this file (see the marker comment after ENDP).
;# Inputs, as read below: rcx = pointer whose first qword is cache->memory,
;# rdx = dataset, r8 = block index, r9 = max. block index.
randomx_dataset_init_avx2_prologue PROC
include asm/program_sshash_avx2_save_registers.inc
mov rdi, qword ptr [rcx] ;# cache->memory
mov rsi, rdx ;# dataset
mov rbp, r8 ;# block index
push r9 ;# max. block index
;# reserve 32 bytes of stack below the pushed max. block index
;# NOTE(review): the code that uses this area lives in the included .inc files - confirm there
sub rsp, 32
;# jump over the data assembled inline by the constants include that follows
jmp loop_begin
include asm/program_sshash_avx2_constants.inc
ALIGN 64
loop_begin:
include asm/program_sshash_avx2_loop_begin.inc
;# init integer registers (lane 0)
;# r8 = (block index + 1) * r0_avx2_mul, then r9..r15 = r8 XOR rN_avx2_add
lea r8, [rbp+1]
imul r8, qword ptr [r0_avx2_mul]
mov r9, qword ptr [r1_avx2_add]
xor r9, r8
mov r10, qword ptr [r2_avx2_add]
xor r10, r8
mov r11, qword ptr [r3_avx2_add]
xor r11, r8
mov r12, qword ptr [r4_avx2_add]
xor r12, r8
mov r13, qword ptr [r5_avx2_add]
xor r13, r8
mov r14, qword ptr [r6_avx2_add]
xor r14, r8
mov r15, qword ptr [r7_avx2_add]
xor r15, r8
;# init AVX registers (lanes 1-4)
;# ymm0 = block index broadcast to all four 64-bit lanes, plus the per-lane
;# values stored at r0_avx2_increments
;# NOTE(review): movq below is the non-VEX encoding placed between VEX
;# instructions; vmovq would keep the sequence VEX-only - confirm whether that matters
vpxor ymm0, ymm0, ymm0
movq xmm0, rbp
vpbroadcastq ymm0, xmm0
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments]
;# ymm0 *= r0_avx2_mul
;# vpmuludq only multiplies the low 32 bits of each 64-bit lane, so the low
;# 64 bits of the product are assembled from three partial products:
;#   lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32)
vbroadcastsd ymm1, qword ptr [r0_avx2_mul]
vpsrlq ymm8, ymm0, 32
vpsrlq ymm9, ymm1, 32
vpmuludq ymm10, ymm0, ymm1
vpmuludq ymm11, ymm9, ymm0
vpmuludq ymm0, ymm8, ymm1
vpsllq ymm11, ymm11, 32
vpsllq ymm0, ymm0, 32
vpaddq ymm10, ymm10, ymm11
vpaddq ymm0, ymm10, ymm0
;# ymm1..ymm7 = ymm0 XOR broadcast(rN_avx2_add), mirroring r9..r15 above
vbroadcastsd ymm1, qword ptr [r1_avx2_add]
vpxor ymm1, ymm0, ymm1
vbroadcastsd ymm2, qword ptr [r2_avx2_add]
vpxor ymm2, ymm0, ymm2
vbroadcastsd ymm3, qword ptr [r3_avx2_add]
vpxor ymm3, ymm0, ymm3
vbroadcastsd ymm4, qword ptr [r4_avx2_add]
vpxor ymm4, ymm0, ymm4
vbroadcastsd ymm5, qword ptr [r5_avx2_add]
vpxor ymm5, ymm0, ymm5
vbroadcastsd ymm6, qword ptr [r6_avx2_add]
vpxor ymm6, ymm0, ymm6
vbroadcastsd ymm7, qword ptr [r7_avx2_add]
vpxor ymm7, ymm0, ymm7
;# constants kept in ymm15 / ymm14 for the code that follows
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32)
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
randomx_dataset_init_avx2_prologue ENDP
;# generated SuperscalarHash code goes here
;# Each PROC below wraps a fragment whose instructions come from the include
;# file named inside it.
;# NOTE(review): presumably these symbols let the C++ side locate each fragment - confirm against the JIT compiler
randomx_dataset_init_avx2_loop_end PROC
include asm/program_sshash_avx2_loop_end.inc
randomx_dataset_init_avx2_loop_end ENDP
randomx_dataset_init_avx2_epilogue PROC
include asm/program_sshash_avx2_epilogue.inc
randomx_dataset_init_avx2_epilogue ENDP
randomx_dataset_init_avx2_ssh_load PROC
include asm/program_sshash_avx2_ssh_load.inc
randomx_dataset_init_avx2_ssh_load ENDP
randomx_dataset_init_avx2_ssh_prefetch PROC
include asm/program_sshash_avx2_ssh_prefetch.inc
randomx_dataset_init_avx2_ssh_prefetch ENDP
randomx_program_epilogue PROC
include asm/program_epilogue_store.inc
include asm/program_epilogue_win64.inc

View file

@ -44,6 +44,11 @@ extern "C" {
void randomx_program_loop_store();
void randomx_program_loop_end();
void randomx_dataset_init();
void randomx_dataset_init_avx2_prologue();
void randomx_dataset_init_avx2_loop_end();
void randomx_dataset_init_avx2_epilogue();
void randomx_dataset_init_avx2_ssh_load();
void randomx_dataset_init_avx2_ssh_prefetch();
void randomx_program_epilogue();
void randomx_sshash_load();
void randomx_sshash_prefetch();

View file

@ -381,7 +381,7 @@ extern "C" {
break;
case RANDOMX_FLAG_JIT:
cache->jit = new randomx::JitCompiler(false);
cache->jit = new randomx::JitCompiler(false, true);
cache->initialize = &randomx::initCacheCompile;
cache->datasetInit = nullptr;
cache->memory = memory;

View file

@ -170,6 +170,7 @@ void randomx_apply_config(const T& config)
void randomx_set_scratchpad_prefetch_mode(int mode);
void randomx_set_huge_pages_jit(bool hugePages);
void randomx_set_optimized_dataset_init(int value);
#if defined(__cplusplus)
extern "C" {

View file

@ -59,7 +59,7 @@ namespace randomx {
protected:
void execute();
JitCompiler compiler{ true };
JitCompiler compiler{ true, false };
};
using CompiledVmDefault = CompiledVm<1>;

View file

@ -96,6 +96,7 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
randomx_set_scratchpad_prefetch_mode(config.scratchpadPrefetchMode());
randomx_set_huge_pages_jit(cpu.isHugePagesJit());
randomx_set_optimized_dataset_init(config.initDatasetAVX2());
if (!msrInitialized) {
msrEnabled = msrInit(config, cpu.threads().get(seed.algorithm()).data());

View file

@ -47,6 +47,7 @@
namespace xmrig {
const char *RxConfig::kInit = "init";
const char *RxConfig::kInitAVX2 = "init-avx2";
const char *RxConfig::kField = "randomx";
const char *RxConfig::kMode = "mode";
const char *RxConfig::kOneGbPages = "1gb-pages";
@ -86,9 +87,10 @@ static_assert (kMsrArraySize == ICpuInfo::MSR_MOD_MAX, "kMsrArraySize and MSR_MO
bool xmrig::RxConfig::read(const rapidjson::Value &value)
{
if (value.IsObject()) {
m_threads = Json::getInt(value, kInit, m_threads);
m_mode = readMode(Json::getValue(value, kMode));
m_rdmsr = Json::getBool(value, kRdmsr, m_rdmsr);
m_threads = Json::getInt(value, kInit, m_threads);
m_initDatasetAVX2 = Json::getInt(value, kInitAVX2, m_initDatasetAVX2);
m_mode = readMode(Json::getValue(value, kMode));
m_rdmsr = Json::getBool(value, kRdmsr, m_rdmsr);
# ifdef XMRIG_FEATURE_MSR
readMSR(Json::getValue(value, kWrmsr));
@ -141,6 +143,7 @@ rapidjson::Value xmrig::RxConfig::toJSON(rapidjson::Document &doc) const
Value obj(kObjectType);
obj.AddMember(StringRef(kInit), m_threads, allocator);
obj.AddMember(StringRef(kInitAVX2), m_initDatasetAVX2, allocator);
obj.AddMember(StringRef(kMode), StringRef(modeName()), allocator);
obj.AddMember(StringRef(kOneGbPages), m_oneGbPages, allocator);
obj.AddMember(StringRef(kRdmsr), m_rdmsr, allocator);

View file

@ -61,6 +61,7 @@ public:
static const char *kCacheQoS;
static const char *kField;
static const char *kInit;
static const char *kInitAVX2;
static const char *kMode;
static const char *kOneGbPages;
static const char *kRdmsr;
@ -83,6 +84,7 @@ public:
const char *modeName() const;
uint32_t threads(uint32_t limit = 100) const;
inline int initDatasetAVX2() const { return m_initDatasetAVX2; }
inline bool isOneGbPages() const { return m_oneGbPages; }
inline bool rdmsr() const { return m_rdmsr; }
inline bool wrmsr() const { return m_wrmsr; }
@ -111,11 +113,12 @@ private:
Mode readMode(const rapidjson::Value &value) const;
bool m_numa = true;
bool m_oneGbPages = false;
bool m_rdmsr = true;
int m_threads = -1;
Mode m_mode = AutoMode;
bool m_numa = true;
bool m_oneGbPages = false;
bool m_rdmsr = true;
int m_threads = -1;
int m_initDatasetAVX2 = -1;
Mode m_mode = AutoMode;
ScratchpadPrefetchMode m_scratchpadPrefetchMode = ScratchpadPrefetchT0;

View file

@ -19,6 +19,7 @@
#include "crypto/rx/RxDataset.h"
#include "backend/cpu/Cpu.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include "base/kernel/Platform.h"
@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache,
{
Platform::setThreadPriority(priority);
randomx_init_dataset(dataset, cache, startItem, itemCount);
if (Cpu::info()->hasAVX2() && (itemCount % 5)) {
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5);
}
else {
randomx_init_dataset(dataset, cache, startItem, itemCount);
}
}