Optimized dataset read for Ryzen CPUs

Removed register dependency in dataset read, +0.8% speedup on average.
This commit is contained in:
SChernykh 2019-12-08 16:14:02 +01:00
parent 4dec063472
commit d0df824599
17 changed files with 81 additions and 20 deletions

View file

@ -96,7 +96,7 @@ void xmrig::CpuWorker<N>::allocateRandomX_VM()
} }
if (!m_vm) { if (!m_vm) {
m_vm = new RxVm(dataset, m_memory->scratchpad(), !m_hwAES); m_vm = new RxVm(dataset, m_memory->scratchpad(), !m_hwAES, m_assembly);
} }
} }
#endif #endif

View file

@ -0,0 +1,19 @@
;# Dataset read step, Ryzen-tuned variant.
;# Reads the 64-byte dataset line addressed by "ma" (upper 32 bits of rbp), XORs
;# it into r8-r15, issues a prefetch for the line addressed by the updated "mx"
;# (lower 32 bits of rbp), then swaps the two halves of rbp.
;# Registers as used below: rdi = base address for the dataset accesses,
;# rbp = ma:mx, rax = value XORed into rbp (produced by code outside this file).
;# Overwrites rax, rcx and rdx.
;# NOTE(review): the bytes assembled from this file appear to be copied verbatim
;# into a fixed 72-byte buffer (codeReadDatasetRyzenTweaked) - confirm the encoded
;# size still fits after any change here.
;# NOTE(review): the commit description says this ordering removes a register
;# dependency in the dataset read - keep the instruction order unless re-measured.
mov rcx, rbp ;# ecx = ma
shr rcx, 32
and ecx, RANDOMX_DATASET_BASE_MASK
;# rcx already holds the masked "ma" offset, taken from rbp before this update
xor rbp, rax ;# modify "mx"
;# rax is free again after the xor above: reuse it for the first 8 bytes of the line
mov rax, qword ptr [rdi+rcx]
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
;# prefetch hint for the line at the new "mx" offset
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
add rcx, rdi ;# dataset cache line
;# XOR the 64-byte line into r8-r15, 8 bytes per register; bytes 0-7 come from rax
xor r8, rax
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]

View file

@ -118,7 +118,7 @@ static void clear_code_cache(char* p1, char* p2)
# endif # endif
} }
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config) void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config, uint32_t)
{ {
uint32_t codePos = MainLoopBegin + 4; uint32_t codePos = MainLoopBegin + 4;

View file

@ -49,7 +49,7 @@ namespace randomx {
JitCompilerA64(); JitCompilerA64();
~JitCompilerA64(); ~JitCompilerA64();
void generateProgram(Program&, ProgramConfiguration&); void generateProgram(Program&, ProgramConfiguration&, uint32_t);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
template<size_t N> template<size_t N>

View file

@ -44,7 +44,7 @@ namespace randomx {
JitCompilerFallback() { JitCompilerFallback() {
throw std::runtime_error("JIT compilation is not supported on this platform"); throw std::runtime_error("JIT compilation is not supported on this platform");
} }
void generateProgram(Program&, ProgramConfiguration&) { void generateProgram(Program&, ProgramConfiguration&, uint32_t) {
} }
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {

View file

@ -89,7 +89,6 @@ namespace randomx {
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init;
@ -105,7 +104,6 @@ namespace randomx {
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad; const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset;
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
@ -301,10 +299,22 @@ namespace randomx {
freePagedMemory(allocatedCode, CodeSize); freePagedMemory(allocatedCode, CodeSize);
} }
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
generateProgramPrologue(prog, pcfg); generateProgramPrologue(prog, pcfg);
memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize);
codePos += readDatasetSize; uint8_t* p;
uint32_t n;
if (flags & RANDOMX_FLAG_RYZEN) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
else {
p = RandomX_CurrentConfig.codeReadDatasetTweaked;
n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
}
memcpy(code + codePos, p, n);
codePos += n;
generateProgramEpilogue(prog, pcfg); generateProgramEpilogue(prog, pcfg);
} }

View file

@ -49,7 +49,7 @@ namespace randomx {
public: public:
JitCompilerX86(); JitCompilerX86();
~JitCompilerX86(); ~JitCompilerX86();
void generateProgram(Program&, ProgramConfiguration&); void generateProgram(Program&, ProgramConfiguration&, uint32_t);
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
template<size_t N> template<size_t N>
void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &); void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector<uint64_t> &);

View file

@ -45,6 +45,7 @@
.global DECL(randomx_program_loop_load) .global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start) .global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
.global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_store)
@ -110,6 +111,9 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset): DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc" #include "asm/program_read_dataset.inc"
DECL(randomx_program_read_dataset_ryzen):
#include "asm/program_read_dataset_ryzen.inc"
DECL(randomx_program_read_dataset_sshash_init): DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc" #include "asm/program_read_dataset_sshash_init.inc"

View file

@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init PUBLIC randomx_dataset_init
@ -103,6 +104,10 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP randomx_program_read_dataset ENDP
randomx_program_read_dataset_ryzen PROC
include asm/program_read_dataset_ryzen.inc
randomx_program_read_dataset_ryzen ENDP
randomx_program_read_dataset_sshash_init PROC randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP randomx_program_read_dataset_sshash_init ENDP

View file

@ -37,6 +37,7 @@ extern "C" {
void randomx_program_loop_load(); void randomx_program_loop_load();
void randomx_program_start(); void randomx_program_start();
void randomx_program_read_dataset(); void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();
void randomx_program_read_dataset_sshash_init(); void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin(); void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store(); void randomx_program_loop_store();

View file

@ -157,8 +157,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
} }
{ {
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset; const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen;
memcpy(codeReadDatasetTweaked, a, b - a); memcpy(codeReadDatasetTweaked, a, b - a);
codeReadDatasetTweakedSize = b - a;
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a;
} }
{ {
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
@ -191,10 +198,11 @@ void RandomX_ConfigurationBase::Apply()
#if defined(_M_X64) || defined(__x86_64__) #if defined(_M_X64) || defined(__x86_64__)
*(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1; *(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1;
const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE; // Not needed right now because all variants use default dataset base size
*(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask; //const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE;
*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask; //*(uint32_t*)(codeReadDatasetTweaked + 9) = DatasetBaseMask;
*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask; //*(uint32_t*)(codeReadDatasetTweaked + 24) = DatasetBaseMask;
//*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated; *(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
@ -435,6 +443,7 @@ extern "C" {
} }
vm->setScratchpad(scratchpad); vm->setScratchpad(scratchpad);
vm->setFlags(flags);
} }
catch (std::exception &ex) { catch (std::exception &ex) {
delete vm; delete vm;

View file

@ -49,6 +49,7 @@ enum randomx_flags {
RANDOMX_FLAG_FULL_MEM = 4, RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8, RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_1GB_PAGES = 16, RANDOMX_FLAG_1GB_PAGES = 16,
RANDOMX_FLAG_RYZEN = 64,
}; };
@ -118,7 +119,10 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8]; rx_vec_i128 fillAes4Rx4_Key[8];
uint8_t codeShhPrefetchTweaked[20]; uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64]; uint8_t codeReadDatasetTweaked[72];
uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codeReadDatasetLightSshInitTweaked[68];
uint8_t codePrefetchScratchpadTweaked[32]; uint8_t codePrefetchScratchpadTweaked[32];

View file

@ -46,6 +46,9 @@ public:
virtual void run(void* seed) = 0; virtual void run(void* seed) = 0;
void resetRoundingMode(); void resetRoundingMode();
void setFlags(uint32_t flags) { vm_flags = flags; }
uint32_t getFlags() const { return vm_flags; }
randomx::RegisterFile *getRegisterFile() { randomx::RegisterFile *getRegisterFile() {
return &reg; return &reg;
} }
@ -71,6 +74,7 @@ protected:
randomx_dataset* datasetPtr; randomx_dataset* datasetPtr;
}; };
uint64_t datasetOffset; uint64_t datasetOffset;
uint32_t vm_flags;
}; };
namespace randomx { namespace randomx {

View file

@ -43,7 +43,7 @@ namespace randomx {
void CompiledVm<softAes>::run(void* seed) { void CompiledVm<softAes>::run(void* seed) {
VmBase<softAes>::generateProgram(seed); VmBase<softAes>::generateProgram(seed);
randomx_vm::initialize(); randomx_vm::initialize();
compiler.generateProgram(program, config); compiler.generateProgram(program, config, getFlags());
mem.memory = datasetPtr->memory + datasetOffset; mem.memory = datasetPtr->memory + datasetOffset;
execute(); execute();
} }

View file

@ -31,7 +31,7 @@
#include "crypto/rx/RxVm.h" #include "crypto/rx/RxVm.h"
xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes) xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly)
{ {
if (!softAes) { if (!softAes) {
m_flags |= RANDOMX_FLAG_HARD_AES; m_flags |= RANDOMX_FLAG_HARD_AES;
@ -45,6 +45,10 @@ xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes)
m_flags |= RANDOMX_FLAG_JIT; m_flags |= RANDOMX_FLAG_JIT;
} }
if (assembly == Assembly::RYZEN) {
m_flags |= RANDOMX_FLAG_RYZEN;
}
m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad); m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad);
} }

View file

@ -29,6 +29,7 @@
#include "base/tools/Object.h" #include "base/tools/Object.h"
#include "backend/cpu/Cpu.h"
#include <cstdint> #include <cstdint>
@ -49,7 +50,7 @@ class RxVm
public: public:
XMRIG_DISABLE_COPY_MOVE_DEFAULT(RxVm); XMRIG_DISABLE_COPY_MOVE_DEFAULT(RxVm);
RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes); RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::Assembly assembly);
~RxVm(); ~RxVm();
inline randomx_vm *get() const { return m_vm; } inline randomx_vm *get() const { return m_vm; }

View file

@ -117,7 +117,7 @@ static void getResults(JobBundle &bundle, std::vector<JobResult> &results, uint3
return; return;
} }
auto vm = new RxVm(dataset, memory->scratchpad(), !hwAES); auto vm = new RxVm(dataset, memory->scratchpad(), !hwAES, Assembly::NONE);
for (uint32_t nonce : bundle.nonces) { for (uint32_t nonce : bundle.nonces) {
*bundle.job.nonce() = nonce; *bundle.job.nonce() = nonce;