RandomX: rewrote dataset read code

Unified code for AMD and Intel
1% faster on Intel
0.15% faster on AMD Ryzen
This commit is contained in:
SChernykh 2021-05-20 12:45:42 +02:00
parent 7f7b1fb073
commit ff82ca57f2
10 changed files with 16 additions and 69 deletions

View file

@ -22,6 +22,7 @@
mov rsi, rdx ;# uint8_t* scratchpad
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8

View file

@ -35,6 +35,7 @@
mov rbx, r9 ;# loop counter
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8

View file

@ -1,17 +1,16 @@
mov ecx, ebp ;# ecx = ma
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
and edx, RANDOMX_DATASET_BASE_MASK
lea rcx, [rdi+rdx] ;# dataset cache line
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

View file

@ -1,17 +0,0 @@
mov rcx, rbp ;# ecx = ma
shr rcx, 32
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

View file

@ -115,6 +115,7 @@ namespace randomx {
#define codeLoopLoad ADDR(randomx_program_loop_load)
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
#define codeProgamStart ADDR(randomx_program_start)
#define codeReadDataset ADDR(randomx_program_read_dataset)
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
@ -135,6 +136,7 @@ namespace randomx {
#define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgamStart - codeLoopLoadXOP)
#define readDatasetSize (codeReadDatasetLightSshInit - codeReadDataset)
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
@ -318,20 +320,7 @@ namespace randomx {
vm_flags = flags;
generateProgramPrologue(prog, pcfg);
uint8_t* p;
uint32_t n;
if (flags & RANDOMX_FLAG_AMD) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
else {
p = RandomX_CurrentConfig.codeReadDatasetTweaked;
n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
}
memcpy(code + codePos, p, n);
codePos += n;
emit(codeReadDataset, readDatasetSize, code, codePos);
generateProgramEpilogue(prog, pcfg);
}

View file

@ -48,7 +48,6 @@
.global DECL(randomx_program_loop_load_xop)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
.global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store)
@ -140,9 +139,6 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"
DECL(randomx_program_read_dataset_ryzen):
#include "asm/program_read_dataset_ryzen.inc"
DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc"

View file

@ -39,7 +39,6 @@ PUBLIC randomx_program_loop_load
PUBLIC randomx_program_loop_load_xop
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
@ -136,10 +135,6 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP
randomx_program_read_dataset_ryzen PROC
include asm/program_read_dataset_ryzen.inc
randomx_program_read_dataset_ryzen ENDP
randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP

View file

@ -40,7 +40,6 @@ extern "C" {
void randomx_program_loop_load_xop();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();
void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store();

View file

@ -165,18 +165,6 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
const uint8_t* b = addr(randomx_sshash_end);
memcpy(codeShhPrefetchTweaked, a, b - a);
}
{
const uint8_t* a = addr(randomx_program_read_dataset);
const uint8_t* b = addr(randomx_program_read_dataset_ryzen);
memcpy(codeReadDatasetTweaked, a, b - a);
codeReadDatasetTweakedSize = b - a;
}
{
const uint8_t* a = addr(randomx_program_read_dataset_ryzen);
const uint8_t* b = addr(randomx_program_read_dataset_sshash_init);
memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a;
}
if (xmrig::Cpu::info()->hasBMI2()) {
const uint8_t* a = addr(randomx_prefetch_scratchpad_bmi2);
const uint8_t* b = addr(randomx_prefetch_scratchpad_end);

View file

@ -125,10 +125,6 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8];
uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64];
uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codePrefetchScratchpadTweaked[28];
uint32_t codePrefetchScratchpadTweakedSize;