Merge pull request #1502 from SChernykh/dev

Optimizations for AMD Bulldozer
This commit is contained in:
xmrig 2020-01-15 20:26:06 +07:00 committed by GitHub
commit 8e6f4d4ecb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 69 additions and 10 deletions

View file

@ -0,0 +1,24 @@
;# Loop-load sequence that uses the AMD XOP instruction vpcmov.
;# NOTE(review): this appears to be the body included under the
;# randomx_program_loop_load_xop label - confirm against the including file.

;# rcx = rsi + rax; the address is also saved in the stack slot at [rsp+8].
lea rcx, [rsi+rax]
mov [rsp+8], rcx
;# XOR the eight consecutive qwords at [rcx+0 .. rcx+63] into r8-r15.
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
;# rcx = rsi + rdx; the address is also saved in the stack slot at [rsp+16].
lea rcx, [rsi+rdx]
mov [rsp+16], rcx
;# Each cvtdq2pd reads 8 bytes (two packed signed 32-bit integers) and
;# converts them to two packed doubles; xmm0-xmm7 are filled from the
;# eight consecutive qwords at [rcx+0 .. rcx+63].
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
cvtdq2pd xmm3, qword ptr [rcx+24]
cvtdq2pd xmm4, qword ptr [rcx+32]
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
;# Bitwise select on xmm4-xmm7 with xmm13 as the selector:
;#   dst = (dst AND xmm13) OR (xmm14 AND NOT xmm13)
;# NOTE(review): presumably a single-instruction replacement for an
;# "andps dst, xmm13 / orps dst, xmm14" pair; the two forms only agree when
;# xmm14 has no bits set inside the xmm13 mask - confirm against the mask data.
vpcmov xmm4, xmm4, xmm14, xmm13
vpcmov xmm5, xmm5, xmm14, xmm13
vpcmov xmm6, xmm6, xmm14, xmm13
vpcmov xmm7, xmm7, xmm14, xmm13

View file

@ -1,5 +1,5 @@
mantissaMask:
db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0
db 0, 0, 192, 255, 255, 255, 255, 0, 0, 0, 192, 255, 255, 255, 255, 0
exp240:
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
scaleMask:

View file

@ -89,6 +89,7 @@ namespace randomx {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeLoopLoadXOP = (uint8_t*)&randomx_program_loop_load_xop;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
@ -104,7 +105,8 @@ namespace randomx {
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t loopLoadSize = codeLoopLoadXOP - codeLoopLoad;
const int32_t loopLoadXOPSize = codeProgamStart - codeLoopLoadXOP;
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
@ -184,6 +186,7 @@ namespace randomx {
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
static const uint8_t REX_VPCMOV_XMM12[] = { 0x8F, 0x48, 0x18, 0xA2, 0xE6, 0xD0 };
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
static const uint8_t CALL = 0xe8;
@ -295,12 +298,23 @@ namespace randomx {
cpuid(1, info);
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
cpuid(0x80000001, info);
hasXOP = ((info[2] & (1 << 11)) != 0);
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
memcpy(code, codePrologue, prologueSize);
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
if (hasXOP) {
memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
}
else {
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
}
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);
# ifdef XMRIG_FIX_RYZEN
mainLoopBounds.first = code + prologueSize;
mainLoopBounds.second = code + epilogueOffset;
@ -318,7 +332,7 @@ namespace randomx {
uint8_t* p;
uint32_t n;
if (flags & RANDOMX_FLAG_RYZEN) {
if (flags & RANDOMX_FLAG_AMD) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
@ -395,7 +409,7 @@ namespace randomx {
# endif
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
codePos = prologueSize + loopLoadSize;
codePos = codePosFirst;
//mark all registers as used
uint64_t* r = (uint64_t*)registerUsage;
@ -991,7 +1005,12 @@ namespace randomx {
const uint32_t dst = instr.dst % RegisterCountFlt;
genAddressReg<true>(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ANDPS_XMM12, p, pos);
if (hasXOP) {
emit(REX_VPCMOV_XMM12, p, pos);
}
else {
emit(REX_ANDPS_XMM12, p, pos);
}
emit(REX_DIVPD, p, pos);
emitByte(0xe4 + 8 * dst, p, pos);
@ -1020,7 +1039,7 @@ namespace randomx {
emit(ROL_RAX, p, pos);
emitByte(rotate, p, pos);
}
if (vm_flags & RANDOMX_FLAG_RYZEN) {
if (vm_flags & RANDOMX_FLAG_AMD) {
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
}
else {

View file

@ -73,10 +73,12 @@ namespace randomx {
std::pair<const void*, const void*> mainLoopBounds;
# endif
int32_t codePos;
int32_t codePosFirst;
uint32_t vm_flags;
static bool BranchesWithin32B;
bool hasAVX;
bool hasXOP;
static void applyTweaks();
void generateProgramPrologue(Program&, ProgramConfiguration&);

View file

@ -43,6 +43,7 @@
.global DECL(randomx_program_prologue_first_load)
.global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_loop_load_xop)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
@ -110,6 +111,9 @@ DECL(randomx_program_loop_begin):
DECL(randomx_program_loop_load):
#include "asm/program_loop_load.inc"
DECL(randomx_program_loop_load_xop):
#include "asm/program_loop_load_xop.inc"
DECL(randomx_program_start):
nop

View file

@ -34,6 +34,7 @@ PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load
PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_loop_load_xop
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
@ -101,6 +102,10 @@ randomx_program_loop_load PROC
include asm/program_loop_load.inc
randomx_program_loop_load ENDP
randomx_program_loop_load_xop PROC
include asm/program_loop_load_xop.inc
randomx_program_loop_load_xop ENDP
randomx_program_start PROC
nop
randomx_program_start ENDP

View file

@ -35,6 +35,7 @@ extern "C" {
void randomx_program_prologue_first_load();
void randomx_program_loop_begin();
void randomx_program_loop_load();
void randomx_program_loop_load_xop();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();

View file

@ -49,7 +49,7 @@ enum randomx_flags {
RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_1GB_PAGES = 16,
RANDOMX_FLAG_RYZEN = 64,
RANDOMX_FLAG_AMD = 64,
};

View file

@ -45,8 +45,12 @@ xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::
m_flags |= RANDOMX_FLAG_JIT;
}
if ((assembly == Assembly::RYZEN) || ((assembly == Assembly::AUTO) && (Cpu::info()->assembly() == Assembly::RYZEN))) {
m_flags |= RANDOMX_FLAG_RYZEN;
if (assembly == Assembly::AUTO) {
assembly = Cpu::info()->assembly();
}
if ((assembly == Assembly::RYZEN) || (assembly == Assembly::BULLDOZER)) {
m_flags |= RANDOMX_FLAG_AMD;
}
m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad);