mirror of
https://github.com/xmrig/xmrig.git
synced 2024-10-30 21:17:52 +00:00
Dataset initialization with AVX2 (WIP)
This commit is contained in:
parent
6b21a51a2f
commit
515a85e66c
17 changed files with 721 additions and 90 deletions
|
@ -214,13 +214,6 @@ void xmrig::Workers<T>::start(const std::vector<T> &data, bool sleep)
|
|||
|
||||
for (auto worker : m_workers) {
|
||||
worker->start(Workers<T>::onReady);
|
||||
|
||||
// This sleep is important for optimal caching!
|
||||
// Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads
|
||||
// Sub-optimal caching can result in up to 0.5% hashrate penalty
|
||||
if (sleep) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(20));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@ public:
|
|||
|
||||
enum Flag : uint32_t {
|
||||
FLAG_AES,
|
||||
FLAG_AVX,
|
||||
FLAG_AVX2,
|
||||
FLAG_AVX512F,
|
||||
FLAG_BMI2,
|
||||
|
@ -80,9 +81,11 @@ public:
|
|||
virtual Assembly::Id assembly() const = 0;
|
||||
virtual bool has(Flag feature) const = 0;
|
||||
virtual bool hasAES() const = 0;
|
||||
virtual bool hasAVX() const = 0;
|
||||
virtual bool hasAVX2() const = 0;
|
||||
virtual bool hasBMI2() const = 0;
|
||||
virtual bool hasOneGbPages() const = 0;
|
||||
virtual bool hasXOP() const = 0;
|
||||
virtual bool hasCatL3() const = 0;
|
||||
virtual bool isVM() const = 0;
|
||||
virtual const char *backend() const = 0;
|
||||
|
|
|
@ -52,8 +52,8 @@
|
|||
namespace xmrig {
|
||||
|
||||
|
||||
constexpr size_t kCpuFlagsSize = 13;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
constexpr size_t kCpuFlagsSize = 14;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
|
||||
|
||||
|
||||
|
@ -134,11 +134,12 @@ static inline uint64_t xgetbv()
|
|||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_xcr_avx2() { return (xgetbv() & 0x06) == 0x06; }
|
||||
// Reports whether every bit of mask 0x06 (bits 1 and 2) is set in the value returned by xgetbv().
// NOTE(review): presumably these are the XCR0 SSE/AVX state-enable bits - confirm against xgetbv().
static inline bool has_xcr_avx()
{
    constexpr uint64_t kRequiredBits = 0x06;

    return (xgetbv() & kRequiredBits) == kRequiredBits;
}
|
||||
// Reports whether every bit of mask 0xE6 (bits 1, 2, 5, 6 and 7) is set in the value returned by xgetbv().
// NOTE(review): presumably the XCR0 SSE/AVX/opmask/ZMM state-enable bits - confirm against xgetbv().
static inline bool has_xcr_avx512()
{
    constexpr uint64_t kRequiredBits = 0xE6;

    return (xgetbv() & kRequiredBits) == kRequiredBits;
}
|
||||
// Queries has_feature() for bit 27 of ECX_Reg in the PROCESSOR_INFO result.
static inline bool has_osxsave()
{
    constexpr int kOsxsaveBit = 27;

    return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << kOsxsaveBit);
}
|
||||
// Queries has_feature() for bit 25 of ECX_Reg in the PROCESSOR_INFO result.
static inline bool has_aes_ni()
{
    constexpr int kAesBit = 25;

    return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << kAesBit);
}
|
||||
static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx2(); }
|
||||
// AVX is reported only when all three checks pass, evaluated in this order:
//   1. bit 28 of ECX_Reg in the PROCESSOR_INFO result,
//   2. has_osxsave(),
//   3. has_xcr_avx().
// Later checks are not evaluated once an earlier one fails.
static inline bool has_avx()
{
    if (!has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx();
}
|
||||
// AVX2 is reported only when all three checks pass, evaluated in this order:
//   1. bit 5 of EBX_Reg in the EXTENDED_FEATURES result,
//   2. has_osxsave(),
//   3. has_xcr_avx() (the same xgetbv() mask as plain AVX).
// Later checks are not evaluated once an earlier one fails.
static inline bool has_avx2()
{
    if (!has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx();
}
|
||||
// AVX-512F is reported only when all three checks pass, evaluated in this order:
//   1. bit 16 of EBX_Reg in the EXTENDED_FEATURES result,
//   2. has_osxsave(),
//   3. has_xcr_avx512().
// Later checks are not evaluated once an earlier one fails.
static inline bool has_avx512f()
{
    if (!has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16)) {
        return false;
    }

    if (!has_osxsave()) {
        return false;
    }

    return has_xcr_avx512();
}
|
||||
// Queries has_feature() for bit 8 of EBX_Reg in the EXTENDED_FEATURES result.
static inline bool has_bmi2()
{
    constexpr int kBmi2Bit = 8;

    return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << kBmi2Bit);
}
|
||||
// Queries has_feature() for bit 26 of EDX_Reg in the PROCESSOR_EXT_INFO result.
static inline bool has_pdpe1gb()
{
    constexpr int kPdpe1gbBit = 26;

    return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << kPdpe1gbBit);
}
|
||||
|
@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
|||
cpu_brand_string(m_brand);
|
||||
|
||||
m_flags.set(FLAG_AES, has_aes_ni());
|
||||
m_flags.set(FLAG_AVX, has_avx());
|
||||
m_flags.set(FLAG_AVX2, has_avx2());
|
||||
m_flags.set(FLAG_AVX512F, has_avx512f());
|
||||
m_flags.set(FLAG_BMI2, has_bmi2());
|
||||
|
|
|
@ -48,9 +48,11 @@ protected:
|
|||
inline Assembly::Id assembly() const override { return m_assembly; }
|
||||
// Returns m_flags.test(flag), i.e. whether the given feature flag is set in m_flags.
inline bool has(Flag flag) const override { return m_flags.test(flag); }
|
||||
// Shorthand for has(FLAG_AES).
inline bool hasAES() const override { return has(FLAG_AES); }
|
||||
// Shorthand for has(FLAG_AVX).
inline bool hasAVX() const override { return has(FLAG_AVX); }
|
||||
// Shorthand for has(FLAG_AVX2).
inline bool hasAVX2() const override { return has(FLAG_AVX2); }
|
||||
// Shorthand for has(FLAG_BMI2).
inline bool hasBMI2() const override { return has(FLAG_BMI2); }
|
||||
// Shorthand for has(FLAG_PDPE1GB); note the flag name differs from the accessor name.
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
|
||||
// Shorthand for has(FLAG_XOP).
inline bool hasXOP() const override { return has(FLAG_XOP); }
|
||||
// Shorthand for has(FLAG_CAT_L3).
inline bool hasCatL3() const override { return has(FLAG_CAT_L3); }
|
||||
// Shorthand for has(FLAG_VM).
inline bool isVM() const override { return has(FLAG_VM); }
|
||||
inline const char *brand() const override { return m_brand; }
|
||||
|
|
28
src/crypto/randomx/asm/program_sshash_avx2_constants.inc
Normal file
28
src/crypto/randomx/asm/program_sshash_avx2_constants.inc
Normal file
|
@ -0,0 +1,28 @@
|
|||
;# Constant data for the AVX2 dataset-init code.
;# Every db row is a raw little-endian byte dump of one or more 64-bit values.
;# r0_avx2_increments: four consecutive qwords 2, 3, 4, 5 (32 bytes in total).
;# NOTE(review): presumably per-lane increments applied to r0 for the four vectorised items - confirm at the load site.
r0_avx2_increments:
|
||||
db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
|
||||
;# mul_hi_avx2_data: one qword, 0x0000000100000000 (1 << 32).
mul_hi_avx2_data:
|
||||
db 0,0,0,0,1,0,0,0
|
||||
;# From here on each label holds a single qword; the ";#/" comment above the
;# db row gives the same value in decimal (e.g. 6364136223846793005 = 0x5851F42D4C957F2D).
r0_avx2_mul:
|
||||
;#/ 6364136223846793005
|
||||
db 45, 127, 149, 76, 45, 244, 81, 88
|
||||
r1_avx2_add:
|
||||
;#/ 9298411001130361340
|
||||
db 252, 161, 245, 89, 138, 151, 10, 129
|
||||
r2_avx2_add:
|
||||
;#/ 12065312585734608966
|
||||
db 70, 216, 194, 56, 223, 153, 112, 167
|
||||
r3_avx2_add:
|
||||
;#/ 9306329213124626780
|
||||
db 92, 73, 34, 191, 28, 185, 38, 129
|
||||
r4_avx2_add:
|
||||
;#/ 5281919268842080866
|
||||
db 98, 138, 159, 23, 151, 37, 77, 73
|
||||
r5_avx2_add:
|
||||
;#/ 10536153434571861004
|
||||
db 12, 236, 170, 206, 185, 239, 55, 146
|
||||
r6_avx2_add:
|
||||
;#/ 3398623926847679864
|
||||
db 120, 45, 230, 108, 116, 86, 42, 47
|
||||
r7_avx2_add:
|
||||
;#/ 9549104520008361294
|
||||
db 78, 229, 44, 182, 247, 59, 133, 132
|
31
src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
Normal file
31
src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
Normal file
|
@ -0,0 +1,31 @@
|
|||
;# Epilogue of the AVX2 dataset-init routine: releases stack space, restores
;# the saved registers and returns to the caller.
;# Drop 32 bytes of stack, then pop one more qword into r9.
;# NOTE(review): presumably the 32 bytes are the four pointer slots written by the
;# loop_begin fragment ([rsp]..[rsp+24]) and the popped qword is the loop limit that
;# loop_end compares against at [rsp+32] - confirm against the prologue that sets them up.
add rsp, 32
|
||||
pop r9
|
||||
|
||||
;# Reload xmm0-xmm15 from the 256-byte save area (16 bytes per register, same
;# offsets as the stores in the register-save fragment).
movdqu xmm0, xmmword ptr [rsp]
|
||||
movdqu xmm1, xmmword ptr [rsp + 16]
|
||||
movdqu xmm2, xmmword ptr [rsp + 32]
|
||||
movdqu xmm3, xmmword ptr [rsp + 48]
|
||||
movdqu xmm4, xmmword ptr [rsp + 64]
|
||||
movdqu xmm5, xmmword ptr [rsp + 80]
|
||||
movdqu xmm6, xmmword ptr [rsp + 96]
|
||||
movdqu xmm7, xmmword ptr [rsp + 112]
|
||||
movdqu xmm8, xmmword ptr [rsp + 128]
|
||||
movdqu xmm9, xmmword ptr [rsp + 144]
|
||||
movdqu xmm10, xmmword ptr [rsp + 160]
|
||||
movdqu xmm11, xmmword ptr [rsp + 176]
|
||||
movdqu xmm12, xmmword ptr [rsp + 192]
|
||||
movdqu xmm13, xmmword ptr [rsp + 208]
|
||||
movdqu xmm14, xmmword ptr [rsp + 224]
|
||||
movdqu xmm15, xmmword ptr [rsp + 240]
|
||||
;# Zero the upper halves of all ymm registers before going back to the caller.
vzeroupper
|
||||
;# Release the 256-byte XMM save area.
add rsp, 256
|
||||
|
||||
;# Restore general-purpose registers in the reverse order of the pushes in the
;# register-save fragment (rbx, rbp, rdi, rsi, r12-r15).
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
37
src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
Normal file
37
src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
Normal file
|
@ -0,0 +1,37 @@
|
|||
;# Head of the main loop. Each iteration covers five consecutive items
;# (rbp .. rbp+4); the loop_end fragment advances rbp by 5 and rsi by 320.
;# prefetch RandomX dataset lines
;# rsi points at the 5 * 64 = 320 output bytes written by loop_end.
prefetchnta byte ptr [rsi]
|
||||
prefetchnta byte ptr [rsi+64]
|
||||
prefetchnta byte ptr [rsi+128]
|
||||
prefetchnta byte ptr [rsi+192]
|
||||
prefetchnta byte ptr [rsi+256]
|
||||
|
||||
;# prefetch RandomX cache lines
;# Item rbp: rbx = rdi + ((rbp & RANDOMX_CACHE_MASK) << 6), i.e. a 64-byte line
;# selected by the masked item number, relative to the base held in rdi.
mov rbx, rbp
|
||||
and rbx, RANDOMX_CACHE_MASK
|
||||
shl rbx, 6
|
||||
add rbx, rdi
|
||||
prefetchnta byte ptr [rbx]
|
||||
;# Item rbp+1: same address computation in rax; the pointer is also saved at [rsp].
lea rax, [rbp+1]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp], rax
|
||||
;# Item rbp+2: pointer saved at [rsp+8].
lea rax, [rbp+2]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+8], rax
|
||||
;# Item rbp+3: pointer saved at [rsp+16].
lea rax, [rbp+3]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+16], rax
|
||||
;# Item rbp+4: pointer saved at [rsp+24].
lea rax, [rbp+4]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
prefetchnta byte ptr [rax]
|
||||
mov [rsp+24], rax
|
38
src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
Normal file
38
src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
Normal file
|
@ -0,0 +1,38 @@
|
|||
;# Tail of the main loop: writes five 64-byte items to [rsi], then advances.
;#   [rsi+0..63]    <- r8..r15 (the scalar item)
;#   [rsi+64..319]  <- ymm0..ymm7 transposed, so that each 64-byte item holds the
;#                     same 64-bit lane of ymm0..ymm7 (lane 0 first, lane 3 last)
;# Scalar stores and the first transpose step are interleaved instruction by instruction.
;# vpunpcklqdq a,b -> { a.q0, b.q0, a.q2, b.q2 }; vpunpckhqdq a,b -> { a.q1, b.q1, a.q3, b.q3 }
mov qword ptr [rsi+0], r8
|
||||
vpunpcklqdq ymm8, ymm0, ymm1
|
||||
mov qword ptr [rsi+8], r9
|
||||
vpunpcklqdq ymm9, ymm2, ymm3
|
||||
mov qword ptr [rsi+16], r10
|
||||
vpunpcklqdq ymm10, ymm4, ymm5
|
||||
mov qword ptr [rsi+24], r11
|
||||
vpunpcklqdq ymm11, ymm6, ymm7
|
||||
mov qword ptr [rsi+32], r12
|
||||
vpunpckhqdq ymm12, ymm0, ymm1
|
||||
mov qword ptr [rsi+40], r13
|
||||
vpunpckhqdq ymm13, ymm2, ymm3
|
||||
mov qword ptr [rsi+48], r14
|
||||
vpunpckhqdq ymm14, ymm4, ymm5
|
||||
mov qword ptr [rsi+56], r15
|
||||
vpunpckhqdq ymm15, ymm6, ymm7
|
||||
|
||||
;# Second transpose step: immediate 32 (0x20) joins the low 128-bit halves of the
;# two sources, immediate 49 (0x31) joins their high halves.
;# Lane 0 of ymm0..ymm7 -> [rsi+64..127]
vperm2i128 ymm0, ymm8, ymm9, 32
|
||||
vperm2i128 ymm1, ymm10, ymm11, 32
|
||||
vmovdqu ymmword ptr [rsi+64], ymm0
|
||||
vmovdqu ymmword ptr [rsi+96], ymm1
|
||||
;# Lane 1 -> [rsi+128..191]
vperm2i128 ymm2, ymm12, ymm13, 32
|
||||
vperm2i128 ymm3, ymm14, ymm15, 32
|
||||
vmovdqu ymmword ptr [rsi+128], ymm2
|
||||
vmovdqu ymmword ptr [rsi+160], ymm3
|
||||
;# Lane 2 -> [rsi+192..255]
vperm2i128 ymm4, ymm8, ymm9, 49
|
||||
vperm2i128 ymm5, ymm10, ymm11, 49
|
||||
vmovdqu ymmword ptr [rsi+192], ymm4
|
||||
vmovdqu ymmword ptr [rsi+224], ymm5
|
||||
;# Lane 3 -> [rsi+256..319]
vperm2i128 ymm6, ymm12, ymm13, 49
|
||||
vperm2i128 ymm7, ymm14, ymm15, 49
|
||||
vmovdqu ymmword ptr [rsi+256], ymm6
|
||||
vmovdqu ymmword ptr [rsi+288], ymm7
|
||||
|
||||
;# Advance by five items / 320 bytes and loop while rbp is below the qword at [rsp+32].
add rbp, 5
|
||||
add rsi, 320
|
||||
cmp rbp, qword ptr [rsp+32]
|
||||
;# The jump is emitted as raw bytes with a zero rel32; the JIT overwrites the last
;# four bytes of this fragment with the real displacement after copying it.
db 15, 130, 0, 0, 0, 0 ;# jb rel32
|
|
@ -0,0 +1,27 @@
|
|||
;# Register-save fragment: pushes eight general-purpose registers, then spills
;# xmm0-xmm15 into a 256-byte stack area. The epilogue fragment undoes both
;# steps in reverse order.
push rbx
|
||||
push rbp
|
||||
push rdi
|
||||
push rsi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
;# save all XMM registers just to be safe for all calling conventions
|
||||
;# 16 registers * 16 bytes = 256 bytes; xmmN is stored at [rsp + 16*N].
sub rsp, 256
|
||||
movdqu xmmword ptr [rsp], xmm0
|
||||
movdqu xmmword ptr [rsp + 16], xmm1
|
||||
movdqu xmmword ptr [rsp + 32], xmm2
|
||||
movdqu xmmword ptr [rsp + 48], xmm3
|
||||
movdqu xmmword ptr [rsp + 64], xmm4
|
||||
movdqu xmmword ptr [rsp + 80], xmm5
|
||||
movdqu xmmword ptr [rsp + 96], xmm6
|
||||
movdqu xmmword ptr [rsp + 112], xmm7
|
||||
movdqu xmmword ptr [rsp + 128], xmm8
|
||||
movdqu xmmword ptr [rsp + 144], xmm9
|
||||
movdqu xmmword ptr [rsp + 160], xmm10
|
||||
movdqu xmmword ptr [rsp + 176], xmm11
|
||||
movdqu xmmword ptr [rsp + 192], xmm12
|
||||
movdqu xmmword ptr [rsp + 208], xmm13
|
||||
movdqu xmmword ptr [rsp + 224], xmm14
|
||||
movdqu xmmword ptr [rsp + 240], xmm15
|
50
src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
Normal file
50
src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
Normal file
|
@ -0,0 +1,50 @@
|
|||
;# Loads four 64-byte lines through the four pointers stored on the stack and
;# XORs them into ymm0..ymm7. Each line holds r0..r7 for one item, while ymmK
;# holds rK for four items, so the loaded data is transposed before the XOR.
;# In the comments below rK[i] means "register K of item i".
;# Open a 40-byte temporary frame to preserve rbx (8 bytes) and ymm14 (32 bytes),
;# both of which are used as scratch here.
sub rsp, 40
|
||||
mov [rsp], rbx
|
||||
vmovdqu ymmword ptr [rsp+8], ymm14
|
||||
|
||||
;# The four pointers sit at [rsp+40]..[rsp+64], i.e. at [rsp]..[rsp+24] as seen
;# before the 40-byte adjustment above.
mov rax, [rsp+40]
|
||||
mov rbx, [rsp+48]
|
||||
mov rcx, [rsp+56]
|
||||
mov rdx, [rsp+64]
|
||||
|
||||
;# First 32 bytes of each line: r0..r3.
vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1]
|
||||
vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2]
|
||||
vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3]
|
||||
vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4]
|
||||
|
||||
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2]
|
||||
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4]
|
||||
vpxor ymm0, ymm0, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4]
|
||||
vpxor ymm2, ymm2, ymm14
|
||||
|
||||
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2]
|
||||
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4]
|
||||
vpxor ymm1, ymm1, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4]
|
||||
vpxor ymm3, ymm3, ymm14
|
||||
|
||||
;# Second 32 bytes of each line: r4..r7, handled exactly like r0..r3 above.
vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1]
|
||||
vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2]
|
||||
vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3]
|
||||
vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4]
|
||||
|
||||
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2]
|
||||
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4]
|
||||
vpxor ymm4, ymm4, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4]
|
||||
vpxor ymm6, ymm6, ymm14
|
||||
|
||||
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2]
|
||||
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4]
|
||||
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4]
|
||||
vpxor ymm5, ymm5, ymm14
|
||||
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4]
|
||||
vpxor ymm7, ymm7, ymm14
|
||||
|
||||
;# Restore the two scratch registers and close the temporary frame.
mov rbx, [rsp]
|
||||
vmovdqu ymm14, ymmword ptr [rsp+8]
|
||||
add rsp, 40
|
29
src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
Normal file
29
src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
Normal file
|
@ -0,0 +1,29 @@
|
|||
;# Turns the four 64-bit lanes of a ymm register into four line pointers,
;# stores them at [rsp]..[rsp+24] and prefetches each line.
;# Pointer = rdi + ((lane & RANDOMX_CACHE_MASK) << 6).
;# Spill the four lanes to the stack so they can be processed with scalar code.
;# NOTE(review): after copying this fragment the JIT adds (address register << 3) to
;# its byte 3, which appears to retarget the source register of this store from ymm0
;# to the program's address register - confirm against the instruction encoding.
vmovdqu ymmword ptr [rsp], ymm0
|
||||
|
||||
;# Lane 0 -> pointer at [rsp]
mov rax, [rsp]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# Lane 1 -> pointer at [rsp+8]
mov rax, [rsp+8]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+8], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# Lane 2 -> pointer at [rsp+16]
mov rax, [rsp+16]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+16], rax
|
||||
prefetchnta byte ptr [rax]
|
||||
|
||||
;# Lane 3 -> pointer at [rsp+24]
mov rax, [rsp+24]
|
||||
and rax, RANDOMX_CACHE_MASK
|
||||
shl rax, 6
|
||||
add rax, rdi
|
||||
mov [rsp+24], rax
|
||||
prefetchnta byte ptr [rax]
|
|
@ -49,8 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#else
|
||||
# include <cpuid.h>
|
||||
#endif
|
||||
|
||||
static bool hugePagesJIT = false;
|
||||
|
@ -116,6 +114,11 @@ namespace randomx {
|
|||
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
|
||||
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
|
||||
#define codeDatasetInit ADDR(randomx_dataset_init)
|
||||
// Start labels of the AVX2 dataset-init assembly fragments, wrapped in ADDR().
// The JIT copies these fragments into its code buffer with emit().
// Note the naming mismatch: codeDatasetInitAVX2_loop_epilogue refers to the
// label randomx_dataset_init_avx2_epilogue (there is no "loop_epilogue" label).
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
|
||||
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
|
||||
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
|
||||
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
|
||||
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
|
||||
#define codeLoopStore ADDR(randomx_program_loop_store)
|
||||
#define codeLoopEnd ADDR(randomx_program_loop_end)
|
||||
#define codeEpilogue ADDR(randomx_program_epilogue)
|
||||
|
@ -132,7 +135,12 @@ namespace randomx {
|
|||
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
|
||||
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
|
||||
#define loopStoreSize (codeLoopEnd - codeLoopStore)
|
||||
#define datasetInitSize (codeEpilogue - codeDatasetInit)
|
||||
// Fragment sizes, each computed as (start of the next fragment - start of this one).
// This only yields the right byte counts if the labels appear in the assembly in
// exactly this order with nothing in between:
//   dataset_init, avx2_prologue, avx2_loop_end, avx2_epilogue, avx2_ssh_load,
//   avx2_ssh_prefetch, program_epilogue.
// The scalar dataset-init code now ends where the AVX2 prologue begins.
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
|
||||
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
|
||||
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
|
||||
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
|
||||
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
|
||||
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
|
||||
#define epilogueSize (codeShhLoad - codeEpilogue)
|
||||
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
|
||||
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
|
||||
|
@ -192,17 +200,6 @@ namespace randomx {
|
|||
xmrig::VirtualMemory::protectRX(p1, p2 - p1);
|
||||
}
|
||||
|
||||
static inline void cpuid(uint32_t level, int32_t output[4])
|
||||
{
|
||||
memset(output, 0, sizeof(int32_t) * 4);
|
||||
|
||||
# ifdef _MSC_VER
|
||||
__cpuid(output, static_cast<int>(level));
|
||||
# else
|
||||
__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
|
||||
# endif
|
||||
}
|
||||
|
||||
# ifdef _MSC_VER
|
||||
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
|
||||
# else
|
||||
|
@ -215,14 +212,11 @@ namespace randomx {
|
|||
JitCompilerX86::JitCompilerX86(bool hugePagesEnable) {
|
||||
BranchesWithin32B = xmrig::Cpu::info()->jccErratum();
|
||||
|
||||
int32_t info[4];
|
||||
cpuid(1, info);
|
||||
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
|
||||
hasAVX = xmrig::Cpu::info()->hasAVX();
|
||||
hasAVX2 = xmrig::Cpu::info()->hasAVX2();
|
||||
hasXOP = xmrig::Cpu::info()->hasXOP();
|
||||
|
||||
cpuid(0x80000001, info);
|
||||
hasXOP = ((info[2] & (1 << 11)) != 0);
|
||||
|
||||
allocatedSize = CodeSize * 2;
|
||||
allocatedSize = hasAVX2 ? (CodeSize * 4) : (CodeSize * 2);
|
||||
allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
|
||||
# ifdef XMRIG_SECURE_JIT
|
||||
false
|
||||
|
@ -304,14 +298,49 @@ namespace randomx {
|
|||
|
||||
template<size_t N>
|
||||
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
|
||||
uint8_t* p = code;
|
||||
if (hasAVX2) {
|
||||
codePos = 0;
|
||||
emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
|
||||
|
||||
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
|
||||
SuperscalarProgram& prog = programs[j];
|
||||
uint32_t pos = codePos;
|
||||
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
|
||||
generateSuperscalarCode<true>(prog(i), p, pos);
|
||||
}
|
||||
codePos = pos;
|
||||
emit(codeShhLoad, codeSshLoadSize, code, codePos);
|
||||
emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
|
||||
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
|
||||
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
|
||||
codePos += 3;
|
||||
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
|
||||
uint8_t* p = code + codePos;
|
||||
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
|
||||
p[3] += prog.getAddressRegister() << 3;
|
||||
}
|
||||
}
|
||||
|
||||
emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
|
||||
|
||||
// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
|
||||
constexpr int32_t prologue_size = 320;
|
||||
*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
|
||||
|
||||
emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
|
||||
return;
|
||||
}
|
||||
|
||||
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
|
||||
codePos = superScalarHashOffset + codeSshInitSize;
|
||||
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
|
||||
SuperscalarProgram& prog = programs[j];
|
||||
for (unsigned i = 0; i < prog.getSize(); ++i) {
|
||||
Instruction& instr = prog(i);
|
||||
generateSuperscalarCode(instr);
|
||||
uint32_t pos = codePos;
|
||||
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
|
||||
generateSuperscalarCode<false>(prog(i), p, pos);
|
||||
}
|
||||
codePos = pos;
|
||||
emit(codeShhLoad, codeSshLoadSize, code, codePos);
|
||||
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
|
||||
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
|
||||
|
@ -326,7 +355,10 @@ namespace randomx {
|
|||
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);
|
||||
|
||||
void JitCompilerX86::generateDatasetInitCode() {
|
||||
memcpy(code, codeDatasetInit, datasetInitSize);
|
||||
// AVX2 code is generated in generateSuperscalarHash()
|
||||
if (!hasAVX2) {
|
||||
memcpy(code, codeDatasetInit, datasetInitSize);
|
||||
}
|
||||
}
|
||||
|
||||
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
|
||||
|
@ -405,85 +437,243 @@ namespace randomx {
|
|||
emit32(epilogueOffset - codePos - 4, code, codePos);
|
||||
}
|
||||
|
||||
void JitCompilerX86::generateSuperscalarCode(Instruction& instr) {
|
||||
static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
|
||||
static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
|
||||
static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
|
||||
static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
|
||||
static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
|
||||
static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
|
||||
static constexpr uint8_t REX_81[] = { 0x49, 0x81 };
|
||||
static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
|
||||
static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d };
|
||||
static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
|
||||
static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
|
||||
static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
|
||||
|
||||
template<bool AVX2>
|
||||
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) {
|
||||
switch ((SuperscalarInstructionType)instr.opcode)
|
||||
{
|
||||
case randomx::SuperscalarInstructionType::ISUB_R:
|
||||
emit(REX_SUB_RR, code, codePos);
|
||||
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
|
||||
*(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
|
||||
codePos += 3;
|
||||
if (AVX2) {
|
||||
emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IXOR_R:
|
||||
emit(REX_XOR_RR, code, codePos);
|
||||
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
|
||||
*(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
|
||||
codePos += 3;
|
||||
if (AVX2) {
|
||||
emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IADD_RS:
|
||||
emit(REX_LEA, code, codePos);
|
||||
emitByte(0x04 + 8 * instr.dst, code, codePos);
|
||||
genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
|
||||
emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
|
||||
if (AVX2) {
|
||||
if (instr.getModShift()) {
|
||||
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[3] += instr.src;
|
||||
p[4] = instr.getModShift();
|
||||
p[8] += instr.dst * 9;
|
||||
}
|
||||
else {
|
||||
emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IMUL_R:
|
||||
emit(REX_IMUL_RR, code, codePos);
|
||||
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
|
||||
emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = {
|
||||
0xC5, 0xBD, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0xB5, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0x7D, 0xF4, 0xD0,
|
||||
0xC5, 0x35, 0xF4, 0xD8,
|
||||
0xC5, 0xBD, 0xF4, 0xC0,
|
||||
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
|
||||
0xC5, 0xFD, 0x73, 0xF0, 0x20,
|
||||
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
|
||||
0xC5, 0xAD, 0xD4, 0xC0
|
||||
};
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[3] += instr.dst;
|
||||
p[8] += instr.src;
|
||||
p[11] -= instr.dst * 8;
|
||||
p[13] += instr.src;
|
||||
p[17] += instr.dst;
|
||||
p[21] += instr.dst * 8 + instr.src;
|
||||
p[29] -= instr.dst * 8;
|
||||
p[31] += instr.dst;
|
||||
p[41] += instr.dst * 9;
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IROR_C:
|
||||
emit(REX_ROT_I8, code, codePos);
|
||||
emitByte(0xc8 + instr.dst, code, codePos);
|
||||
emitByte(instr.getImm32() & 63, code, codePos);
|
||||
{
|
||||
const uint32_t shift = instr.getImm32() & 63;
|
||||
emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[3] += instr.dst;
|
||||
p[4] = shift;
|
||||
p[8] += instr.dst;
|
||||
p[9] = 64 - shift;
|
||||
p[14] += instr.dst * 8;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IADD_C7:
|
||||
case randomx::SuperscalarInstructionType::IADD_C8:
|
||||
case randomx::SuperscalarInstructionType::IADD_C9:
|
||||
emit(REX_81, code, codePos);
|
||||
emitByte(0xc0 + instr.dst, code, codePos);
|
||||
emit32(instr.getImm32(), code, codePos);
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 };
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
|
||||
p[12] += instr.dst * 8;
|
||||
p[24] -= instr.dst * 8;
|
||||
p[26] += instr.dst * 8;
|
||||
}
|
||||
else {
|
||||
*(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
|
||||
codePos += 3;
|
||||
emit32(instr.getImm32(), code, codePos);
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IXOR_C7:
|
||||
case randomx::SuperscalarInstructionType::IXOR_C8:
|
||||
case randomx::SuperscalarInstructionType::IXOR_C9:
|
||||
emit(REX_XOR_RI, code, codePos);
|
||||
emitByte(0xf0 + instr.dst, code, codePos);
|
||||
emit32(instr.getImm32(), code, codePos);
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 };
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
|
||||
p[12] += instr.dst * 8;
|
||||
p[24] -= instr.dst * 8;
|
||||
p[26] += instr.dst * 8;
|
||||
}
|
||||
else {
|
||||
*(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
|
||||
codePos += 3;
|
||||
emit32(instr.getImm32(), code, codePos);
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IMULH_R:
|
||||
emit(REX_MOV_RR64, code, codePos);
|
||||
emitByte(0xc0 + instr.dst, code, codePos);
|
||||
emit(REX_MUL_R, code, codePos);
|
||||
emitByte(0xe0 + instr.src, code, codePos);
|
||||
emit(REX_MOV_R64R, code, codePos);
|
||||
emitByte(0xc2 + 8 * instr.dst, code, codePos);
|
||||
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
|
||||
codePos += 3;
|
||||
*(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16);
|
||||
codePos += 3;
|
||||
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
|
||||
codePos += 3;
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = {
|
||||
0xC5, 0xBD, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0xB5, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0x7D, 0xF4, 0xD0,
|
||||
0xC5, 0x3D, 0xF4, 0xD8,
|
||||
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
|
||||
0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
|
||||
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
|
||||
0xC4, 0x41, 0x25, 0xEF, 0xC6,
|
||||
0xC4, 0x41, 0x25, 0xD4, 0xDC,
|
||||
0xC4, 0x41, 0x25, 0xD4, 0xDA,
|
||||
0xC4, 0x41, 0x25, 0xEF, 0xCE,
|
||||
0xC4, 0x42, 0x3D, 0x37, 0xC1,
|
||||
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
|
||||
0xC5, 0xBD, 0xD4, 0xC0,
|
||||
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
|
||||
0xC5, 0xA5, 0xD4, 0xC0
|
||||
};
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[3] += instr.dst;
|
||||
p[8] += instr.src;
|
||||
p[11] -= instr.dst * 8;
|
||||
p[13] += instr.src;
|
||||
p[17] += instr.src;
|
||||
p[20] -= instr.dst * 8;
|
||||
p[27] += instr.dst * 8;
|
||||
p[67] += instr.dst * 9;
|
||||
p[77] += instr.dst * 9;
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::ISMULH_R:
|
||||
emit(REX_MOV_RR64, code, codePos);
|
||||
emitByte(0xc0 + instr.dst, code, codePos);
|
||||
emit(REX_MUL_R, code, codePos);
|
||||
emitByte(0xe8 + instr.src, code, codePos);
|
||||
emit(REX_MOV_R64R, code, codePos);
|
||||
emitByte(0xc2 + 8 * instr.dst, code, codePos);
|
||||
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
|
||||
codePos += 3;
|
||||
*(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16);
|
||||
codePos += 3;
|
||||
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
|
||||
codePos += 3;
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = {
|
||||
0xC5, 0xBD, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0xB5, 0x73, 0xD0, 0x20,
|
||||
0xC5, 0x7D, 0xF4, 0xD0,
|
||||
0xC5, 0x3D, 0xF4, 0xD8,
|
||||
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
|
||||
0xC4, 0x41, 0x3D, 0xF4, 0xE9,
|
||||
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
|
||||
0xC4, 0x41, 0x25, 0xEF, 0xC6,
|
||||
0xC4, 0x41, 0x25, 0xD4, 0xDC,
|
||||
0xC4, 0x41, 0x25, 0xD4, 0xDA,
|
||||
0xC4, 0x41, 0x25, 0xEF, 0xCE,
|
||||
0xC4, 0x42, 0x3D, 0x37, 0xC1,
|
||||
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
|
||||
0xC4, 0x41, 0x15, 0xD4, 0xE8,
|
||||
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
|
||||
0xC4, 0x41, 0x15, 0xD4, 0xC3,
|
||||
0xC4, 0x41, 0x35, 0xEF, 0xC9,
|
||||
0xC4, 0x62, 0x35, 0x37, 0xD0,
|
||||
0xC4, 0x62, 0x35, 0x37, 0xD8,
|
||||
0xC5, 0x2D, 0xDB, 0xD0,
|
||||
0xC5, 0x25, 0xDB, 0xD8,
|
||||
0xC4, 0x41, 0x3D, 0xFB, 0xC2,
|
||||
0xC4, 0xC1, 0x3D, 0xFB, 0xC3
|
||||
};
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[3] += instr.dst;
|
||||
p[8] += instr.src;
|
||||
p[11] -= instr.dst * 8;
|
||||
p[13] += instr.src;
|
||||
p[17] += instr.src;
|
||||
p[20] -= instr.dst * 8;
|
||||
p[89] += instr.dst;
|
||||
p[94] += instr.src;
|
||||
p[98] += instr.src;
|
||||
p[102] += instr.dst;
|
||||
p[112] += instr.dst * 8;
|
||||
}
|
||||
break;
|
||||
case randomx::SuperscalarInstructionType::IMUL_RCP:
|
||||
emit(MOV_RAX_I, code, codePos);
|
||||
*(uint32_t*)(code + codePos) = 0x0000B848UL;
|
||||
codePos += 2;
|
||||
emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
|
||||
emit(REX_IMUL_RM, code, codePos);
|
||||
emitByte(0xc0 + 8 * instr.dst, code, codePos);
|
||||
emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos);
|
||||
if (AVX2) {
|
||||
static const uint8_t t[] = {
|
||||
0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
|
||||
0xC5, 0xBD, 0x73, 0xD0, 0x20,
|
||||
0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
|
||||
0xC4, 0x41, 0x7D, 0xF4, 0xD4,
|
||||
0xC5, 0x35, 0xF4, 0xD8,
|
||||
0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
|
||||
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
|
||||
0xC5, 0xFD, 0x73, 0xF0, 0x20,
|
||||
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
|
||||
0xC5, 0xAD, 0xD4, 0xC0
|
||||
};
|
||||
uint8_t* p = code + codePos;
|
||||
emit(t, code, codePos);
|
||||
p[12] += instr.dst;
|
||||
p[22] -= instr.dst * 8;
|
||||
p[28] += instr.dst;
|
||||
p[33] += instr.dst * 8;
|
||||
p[41] -= instr.dst * 8;
|
||||
p[43] += instr.dst;
|
||||
p[53] += instr.dst * 9;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE;
|
||||
}
|
||||
}
|
||||
|
||||
template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
|
||||
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);
|
||||
|
||||
template<bool rax>
|
||||
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
|
||||
*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);
|
||||
|
@ -563,10 +753,6 @@ namespace randomx {
|
|||
codePos = pos;
|
||||
}
|
||||
|
||||
// Builds one byte from the three fields (scale in bits 7-6, index in bits 5-3,
// base in the low bits) and appends it to the code buffer via emitByte.
void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) {
	const int scaleField = scale << 6;
	const int indexField = index << 3;
	emitByte(scaleField | indexField | base, code, codePos);
}
|
||||
|
||||
void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
|
||||
uint8_t* const p = code;
|
||||
uint32_t pos = codePos;
|
||||
|
|
|
@ -96,6 +96,7 @@ namespace randomx {
|
|||
|
||||
bool BranchesWithin32B = false;
|
||||
bool hasAVX;
|
||||
bool hasAVX2;
|
||||
bool hasXOP;
|
||||
|
||||
uint8_t* allocatedCode = nullptr;
|
||||
|
@ -107,9 +108,10 @@ namespace randomx {
|
|||
static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos);
|
||||
static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos);
|
||||
static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos);
|
||||
static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos);
|
||||
// Packs the three fields into a single value: scale shifted left by 6,
// index shifted left by 3, base in the low bits. Returns the combined value
// instead of writing it to a code buffer.
static uint32_t genSIB(int scale, int index, int base) {
	const int scaleField = scale << 6;
	const int indexField = index << 3;
	return scaleField | indexField | base;
}
|
||||
|
||||
void generateSuperscalarCode(Instruction &);
|
||||
template<bool AVX2>
|
||||
void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos);
|
||||
|
||||
static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) {
|
||||
code[codePos] = val;
|
||||
|
|
|
@ -52,6 +52,11 @@
|
|||
.global DECL(randomx_program_loop_store)
|
||||
.global DECL(randomx_program_loop_end)
|
||||
.global DECL(randomx_dataset_init)
|
||||
.global DECL(randomx_dataset_init_avx2_prologue)
|
||||
.global DECL(randomx_dataset_init_avx2_loop_end)
|
||||
.global DECL(randomx_dataset_init_avx2_epilogue)
|
||||
.global DECL(randomx_dataset_init_avx2_ssh_load)
|
||||
.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
|
||||
.global DECL(randomx_program_epilogue)
|
||||
.global DECL(randomx_sshash_load)
|
||||
.global DECL(randomx_sshash_prefetch)
|
||||
|
@ -192,6 +197,98 @@ call_offset:
|
|||
pop rbx
|
||||
ret
|
||||
|
||||
.balign 64
|
||||
;# Entry of the AVX2 dataset-init routine. Register saving comes from the
;# included fragment; the code below moves the arguments into rdi/rsi/rbp,
;# pushes the max. block index and jumps to the loop start.
DECL(randomx_dataset_init_avx2_prologue):
|
||||
#include "asm/program_sshash_avx2_save_registers.inc"
|
||||
|
||||
;# Arguments are taken from rcx, rdx, r8, r9 when WINABI is defined and from rdi, rsi, rdx, rcx otherwise
#if defined(WINABI)
|
||||
mov rdi, qword ptr [rcx] ;# cache->memory
|
||||
mov rsi, rdx ;# dataset
|
||||
mov rbp, r8 ;# block index
|
||||
push r9 ;# max. block index
|
||||
#else
|
||||
mov rdi, qword ptr [rdi] ;# cache->memory
|
||||
;# dataset in rsi
|
||||
mov rbp, rdx ;# block index
|
||||
push rcx ;# max. block index
|
||||
#endif
|
||||
;# reserve 32 bytes of stack (NOTE(review): what uses this area is not visible here - confirm in the included fragments)
sub rsp, 32
|
||||
|
||||
;# jump over the constants included right after this instruction
jmp randomx_dataset_init_avx2_prologue_loop_begin
|
||||
#include "asm/program_sshash_avx2_constants.inc"
|
||||
|
||||
.balign 64
|
||||
;# Loop start: initializes the integer registers r8-r15 and the vector
;# registers ymm0-ymm7 from the block index in rbp, then sets up the ymm15/ymm14 masks.
randomx_dataset_init_avx2_prologue_loop_begin:
|
||||
#include "asm/program_sshash_avx2_loop_begin.inc"
|
||||
|
||||
;# init integer registers (lane 0)
|
||||
;# r8 = (rbp + 1) * r0_avx2_mul; r9..r15 = rN_avx2_add xor r8
lea r8, [rbp+1]
|
||||
imul r8, qword ptr [r0_avx2_mul+rip]
|
||||
mov r9, qword ptr [r1_avx2_add+rip]
|
||||
xor r9, r8
|
||||
mov r10, qword ptr [r2_avx2_add+rip]
|
||||
xor r10, r8
|
||||
mov r11, qword ptr [r3_avx2_add+rip]
|
||||
xor r11, r8
|
||||
mov r12, qword ptr [r4_avx2_add+rip]
|
||||
xor r12, r8
|
||||
mov r13, qword ptr [r5_avx2_add+rip]
|
||||
xor r13, r8
|
||||
mov r14, qword ptr [r6_avx2_add+rip]
|
||||
xor r14, r8
|
||||
mov r15, qword ptr [r7_avx2_add+rip]
|
||||
xor r15, r8
|
||||
|
||||
;# init AVX registers (lanes 1-4)
|
||||
;# ymm0 = rbp broadcast to all four 64-bit lanes, plus the per-lane values in r0_avx2_increments
;# NOTE(review): movq below is the non-VEX form among VEX-encoded instructions - confirm whether vmovq was intended
vpxor ymm0, ymm0, ymm0
|
||||
movq xmm0, rbp
|
||||
vpbroadcastq ymm0, xmm0
|
||||
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
|
||||
|
||||
;# ymm0 *= r0_avx2_mul
|
||||
;# low 64 bits of the product a*b per lane (a = ymm0, b = ymm1), built from 32-bit vpmuludq products:
;# lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32)
vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
|
||||
vpsrlq ymm8, ymm0, 32
|
||||
vpsrlq ymm9, ymm1, 32
|
||||
vpmuludq ymm10, ymm0, ymm1
|
||||
vpmuludq ymm11, ymm9, ymm0
|
||||
vpmuludq ymm0, ymm8, ymm1
|
||||
vpsllq ymm11, ymm11, 32
|
||||
vpsllq ymm0, ymm0, 32
|
||||
vpaddq ymm10, ymm10, ymm11
|
||||
vpaddq ymm0, ymm10, ymm0
|
||||
|
||||
;# ymm1..ymm7 = rN_avx2_add (broadcast) xor ymm0, mirroring r9..r15 above
vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
|
||||
vpxor ymm1, ymm0, ymm1
|
||||
vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
|
||||
vpxor ymm2, ymm0, ymm2
|
||||
vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
|
||||
vpxor ymm3, ymm0, ymm3
|
||||
vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
|
||||
vpxor ymm4, ymm0, ymm4
|
||||
vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
|
||||
vpxor ymm5, ymm0, ymm5
|
||||
vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
|
||||
vpxor ymm6, ymm0, ymm6
|
||||
vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
|
||||
vpxor ymm7, ymm0, ymm7
|
||||
|
||||
;# ymm15 is loaded from mul_hi_avx2_data; ymm14 is ymm15 shifted left by 31 bits
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
|
||||
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
|
||||
|
||||
;# generated SuperscalarHash code goes here
|
||||
|
||||
;# Exported labels; the code under each one comes from the included fragment.
;# NOTE(review): presumably the JIT uses these label addresses to place the fragments
;# around the generated SuperscalarHash code - confirm in the JIT compiler.
DECL(randomx_dataset_init_avx2_loop_end):
|
||||
#include "asm/program_sshash_avx2_loop_end.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_epilogue):
|
||||
#include "asm/program_sshash_avx2_epilogue.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_ssh_load):
|
||||
#include "asm/program_sshash_avx2_ssh_load.inc"
|
||||
|
||||
DECL(randomx_dataset_init_avx2_ssh_prefetch):
|
||||
#include "asm/program_sshash_avx2_ssh_prefetch.inc"
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_program_epilogue):
|
||||
#include "asm/program_epilogue_store.inc"
|
||||
|
|
|
@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen
|
|||
PUBLIC randomx_program_read_dataset_sshash_init
|
||||
PUBLIC randomx_program_read_dataset_sshash_fin
|
||||
PUBLIC randomx_dataset_init
|
||||
PUBLIC randomx_dataset_init_avx2_prologue
|
||||
PUBLIC randomx_dataset_init_avx2_loop_end
|
||||
PUBLIC randomx_dataset_init_avx2_epilogue
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_load
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_prefetch
|
||||
PUBLIC randomx_program_loop_store
|
||||
PUBLIC randomx_program_loop_end
|
||||
PUBLIC randomx_program_epilogue
|
||||
|
@ -183,6 +188,95 @@ init_block_loop:
|
|||
randomx_dataset_init ENDP
|
||||
|
||||
ALIGN 64
|
||||
;# Entry of the AVX2 dataset-init routine (MASM variant). Register saving comes from
;# the included fragment; the arguments are taken from rcx, rdx, r8 and r9.
randomx_dataset_init_avx2_prologue PROC
|
||||
include asm/program_sshash_avx2_save_registers.inc
|
||||
|
||||
mov rdi, qword ptr [rcx] ;# cache->memory
|
||||
mov rsi, rdx ;# dataset
|
||||
mov rbp, r8 ;# block index
|
||||
push r9 ;# max. block index
|
||||
;# reserve 32 bytes of stack (NOTE(review): what uses this area is not visible here - confirm in the included fragments)
sub rsp, 32
|
||||
|
||||
;# jump over the constants included right after this instruction
jmp loop_begin
|
||||
include asm/program_sshash_avx2_constants.inc
|
||||
|
||||
ALIGN 64
|
||||
loop_begin:
|
||||
include asm/program_sshash_avx2_loop_begin.inc
|
||||
|
||||
;# init integer registers (lane 0)
|
||||
;# r8 = (rbp + 1) * r0_avx2_mul; r9..r15 = rN_avx2_add xor r8
lea r8, [rbp+1]
|
||||
imul r8, qword ptr [r0_avx2_mul]
|
||||
mov r9, qword ptr [r1_avx2_add]
|
||||
xor r9, r8
|
||||
mov r10, qword ptr [r2_avx2_add]
|
||||
xor r10, r8
|
||||
mov r11, qword ptr [r3_avx2_add]
|
||||
xor r11, r8
|
||||
mov r12, qword ptr [r4_avx2_add]
|
||||
xor r12, r8
|
||||
mov r13, qword ptr [r5_avx2_add]
|
||||
xor r13, r8
|
||||
mov r14, qword ptr [r6_avx2_add]
|
||||
xor r14, r8
|
||||
mov r15, qword ptr [r7_avx2_add]
|
||||
xor r15, r8
|
||||
|
||||
;# init AVX registers (lanes 1-4)
|
||||
;# ymm0 = rbp broadcast to all four 64-bit lanes, plus the per-lane values in r0_avx2_increments
;# NOTE(review): movq below is the non-VEX form among VEX-encoded instructions - confirm whether vmovq was intended
vpxor ymm0, ymm0, ymm0
|
||||
movq xmm0, rbp
|
||||
vpbroadcastq ymm0, xmm0
|
||||
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments]
|
||||
|
||||
;# ymm0 *= r0_avx2_mul
|
||||
;# low 64 bits of the product a*b per lane (a = ymm0, b = ymm1), built from 32-bit vpmuludq products:
;# lo(a)*lo(b) + ((hi(b)*lo(a)) << 32) + ((hi(a)*lo(b)) << 32)
vbroadcastsd ymm1, qword ptr [r0_avx2_mul]
|
||||
vpsrlq ymm8, ymm0, 32
|
||||
vpsrlq ymm9, ymm1, 32
|
||||
vpmuludq ymm10, ymm0, ymm1
|
||||
vpmuludq ymm11, ymm9, ymm0
|
||||
vpmuludq ymm0, ymm8, ymm1
|
||||
vpsllq ymm11, ymm11, 32
|
||||
vpsllq ymm0, ymm0, 32
|
||||
vpaddq ymm10, ymm10, ymm11
|
||||
vpaddq ymm0, ymm10, ymm0
|
||||
|
||||
;# ymm1..ymm7 = rN_avx2_add (broadcast) xor ymm0, mirroring r9..r15 above
vbroadcastsd ymm1, qword ptr [r1_avx2_add]
|
||||
vpxor ymm1, ymm0, ymm1
|
||||
vbroadcastsd ymm2, qword ptr [r2_avx2_add]
|
||||
vpxor ymm2, ymm0, ymm2
|
||||
vbroadcastsd ymm3, qword ptr [r3_avx2_add]
|
||||
vpxor ymm3, ymm0, ymm3
|
||||
vbroadcastsd ymm4, qword ptr [r4_avx2_add]
|
||||
vpxor ymm4, ymm0, ymm4
|
||||
vbroadcastsd ymm5, qword ptr [r5_avx2_add]
|
||||
vpxor ymm5, ymm0, ymm5
|
||||
vbroadcastsd ymm6, qword ptr [r6_avx2_add]
|
||||
vpxor ymm6, ymm0, ymm6
|
||||
vbroadcastsd ymm7, qword ptr [r7_avx2_add]
|
||||
vpxor ymm7, ymm0, ymm7
|
||||
|
||||
;# ymm15 is loaded from mul_hi_avx2_data; ymm14 is ymm15 shifted left by 31 bits
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32)
|
||||
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
|
||||
randomx_dataset_init_avx2_prologue ENDP
|
||||
|
||||
;# generated SuperscalarHash code goes here
|
||||
|
||||
;# Exported procedures whose bodies come entirely from the included fragments.
;# NOTE(review): presumably the JIT uses these symbol addresses to place the fragments
;# around the generated SuperscalarHash code - confirm in the JIT compiler.
randomx_dataset_init_avx2_loop_end PROC
|
||||
include asm/program_sshash_avx2_loop_end.inc
|
||||
randomx_dataset_init_avx2_loop_end ENDP
|
||||
|
||||
randomx_dataset_init_avx2_epilogue PROC
|
||||
include asm/program_sshash_avx2_epilogue.inc
|
||||
randomx_dataset_init_avx2_epilogue ENDP
|
||||
|
||||
randomx_dataset_init_avx2_ssh_load PROC
|
||||
include asm/program_sshash_avx2_ssh_load.inc
|
||||
randomx_dataset_init_avx2_ssh_load ENDP
|
||||
|
||||
randomx_dataset_init_avx2_ssh_prefetch PROC
|
||||
include asm/program_sshash_avx2_ssh_prefetch.inc
|
||||
randomx_dataset_init_avx2_ssh_prefetch ENDP
|
||||
|
||||
randomx_program_epilogue PROC
|
||||
include asm/program_epilogue_store.inc
|
||||
include asm/program_epilogue_win64.inc
|
||||
|
|
|
@ -44,6 +44,11 @@ extern "C" {
|
|||
void randomx_program_loop_store();
|
||||
void randomx_program_loop_end();
|
||||
void randomx_dataset_init();
|
||||
void randomx_dataset_init_avx2_prologue();
|
||||
void randomx_dataset_init_avx2_loop_end();
|
||||
void randomx_dataset_init_avx2_epilogue();
|
||||
void randomx_dataset_init_avx2_ssh_load();
|
||||
void randomx_dataset_init_avx2_ssh_prefetch();
|
||||
void randomx_program_epilogue();
|
||||
void randomx_sshash_load();
|
||||
void randomx_sshash_prefetch();
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
|
||||
#include "crypto/rx/RxDataset.h"
|
||||
#include "backend/cpu/Cpu.h"
|
||||
#include "base/io/log/Log.h"
|
||||
#include "base/io/log/Tags.h"
|
||||
#include "base/kernel/Platform.h"
|
||||
|
@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache,
|
|||
{
|
||||
Platform::setThreadPriority(priority);
|
||||
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount);
|
||||
if (Cpu::info()->hasAVX2() && (itemCount % 5)) {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
|
||||
randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5);
|
||||
}
|
||||
else {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue