mirror of
https://github.com/xmrig/xmrig.git
synced 2025-01-25 12:06:04 +00:00
Merge pull request #1502 from SChernykh/dev
Optimizations for AMD Bulldozer
This commit is contained in:
commit
8e6f4d4ecb
9 changed files with 69 additions and 10 deletions
24
src/crypto/randomx/asm/program_loop_load_xop.inc
Normal file
24
src/crypto/randomx/asm/program_loop_load_xop.inc
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
lea rcx, [rsi+rax]
|
||||||
|
mov [rsp+8], rcx
|
||||||
|
xor r8, qword ptr [rcx+0]
|
||||||
|
xor r9, qword ptr [rcx+8]
|
||||||
|
xor r10, qword ptr [rcx+16]
|
||||||
|
xor r11, qword ptr [rcx+24]
|
||||||
|
xor r12, qword ptr [rcx+32]
|
||||||
|
xor r13, qword ptr [rcx+40]
|
||||||
|
xor r14, qword ptr [rcx+48]
|
||||||
|
xor r15, qword ptr [rcx+56]
|
||||||
|
lea rcx, [rsi+rdx]
|
||||||
|
mov [rsp+16], rcx
|
||||||
|
cvtdq2pd xmm0, qword ptr [rcx+0]
|
||||||
|
cvtdq2pd xmm1, qword ptr [rcx+8]
|
||||||
|
cvtdq2pd xmm2, qword ptr [rcx+16]
|
||||||
|
cvtdq2pd xmm3, qword ptr [rcx+24]
|
||||||
|
cvtdq2pd xmm4, qword ptr [rcx+32]
|
||||||
|
cvtdq2pd xmm5, qword ptr [rcx+40]
|
||||||
|
cvtdq2pd xmm6, qword ptr [rcx+48]
|
||||||
|
cvtdq2pd xmm7, qword ptr [rcx+56]
|
||||||
|
vpcmov xmm4, xmm4, xmm14, xmm13
|
||||||
|
vpcmov xmm5, xmm5, xmm14, xmm13
|
||||||
|
vpcmov xmm6, xmm6, xmm14, xmm13
|
||||||
|
vpcmov xmm7, xmm7, xmm14, xmm13
|
|
@ -1,5 +1,5 @@
|
||||||
mantissaMask:
|
mantissaMask:
|
||||||
db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0
|
db 0, 0, 192, 255, 255, 255, 255, 0, 0, 0, 192, 255, 255, 255, 255, 0
|
||||||
exp240:
|
exp240:
|
||||||
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||||
scaleMask:
|
scaleMask:
|
||||||
|
|
|
@ -89,6 +89,7 @@ namespace randomx {
|
||||||
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
||||||
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
|
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
|
||||||
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
|
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
|
||||||
|
const uint8_t* codeLoopLoadXOP = (uint8_t*)&randomx_program_loop_load_xop;
|
||||||
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
|
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
|
||||||
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
|
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
|
||||||
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
|
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
|
||||||
|
@ -104,7 +105,8 @@ namespace randomx {
|
||||||
|
|
||||||
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
|
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
|
||||||
const int32_t prologueSize = codeLoopBegin - codePrologue;
|
const int32_t prologueSize = codeLoopBegin - codePrologue;
|
||||||
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
|
const int32_t loopLoadSize = codeLoopLoadXOP - codeLoopLoad;
|
||||||
|
const int32_t loopLoadXOPSize = codeProgamStart - codeLoopLoadXOP;
|
||||||
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
|
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
|
||||||
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
|
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
|
||||||
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
|
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
|
||||||
|
@ -184,6 +186,7 @@ namespace randomx {
|
||||||
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
|
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
|
||||||
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
|
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
|
||||||
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
|
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
|
||||||
|
static const uint8_t REX_VPCMOV_XMM12[] = { 0x8F, 0x48, 0x18, 0xA2, 0xE6, 0xD0 };
|
||||||
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
|
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
|
||||||
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
|
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
|
||||||
static const uint8_t CALL = 0xe8;
|
static const uint8_t CALL = 0xe8;
|
||||||
|
@ -295,12 +298,23 @@ namespace randomx {
|
||||||
cpuid(1, info);
|
cpuid(1, info);
|
||||||
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
|
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
|
||||||
|
|
||||||
|
cpuid(0x80000001, info);
|
||||||
|
hasXOP = ((info[2] & (1 << 11)) != 0);
|
||||||
|
|
||||||
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
|
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
|
||||||
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
|
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
|
||||||
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
|
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
|
||||||
memcpy(code, codePrologue, prologueSize);
|
memcpy(code, codePrologue, prologueSize);
|
||||||
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
|
if (hasXOP) {
|
||||||
|
memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
|
||||||
|
}
|
||||||
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
|
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
|
||||||
|
|
||||||
|
codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);
|
||||||
|
|
||||||
# ifdef XMRIG_FIX_RYZEN
|
# ifdef XMRIG_FIX_RYZEN
|
||||||
mainLoopBounds.first = code + prologueSize;
|
mainLoopBounds.first = code + prologueSize;
|
||||||
mainLoopBounds.second = code + epilogueOffset;
|
mainLoopBounds.second = code + epilogueOffset;
|
||||||
|
@ -318,7 +332,7 @@ namespace randomx {
|
||||||
|
|
||||||
uint8_t* p;
|
uint8_t* p;
|
||||||
uint32_t n;
|
uint32_t n;
|
||||||
if (flags & RANDOMX_FLAG_RYZEN) {
|
if (flags & RANDOMX_FLAG_AMD) {
|
||||||
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
|
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
|
||||||
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
|
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
|
||||||
}
|
}
|
||||||
|
@ -395,7 +409,7 @@ namespace randomx {
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
|
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
|
||||||
codePos = prologueSize + loopLoadSize;
|
codePos = codePosFirst;
|
||||||
|
|
||||||
//mark all registers as used
|
//mark all registers as used
|
||||||
uint64_t* r = (uint64_t*)registerUsage;
|
uint64_t* r = (uint64_t*)registerUsage;
|
||||||
|
@ -991,7 +1005,12 @@ namespace randomx {
|
||||||
const uint32_t dst = instr.dst % RegisterCountFlt;
|
const uint32_t dst = instr.dst % RegisterCountFlt;
|
||||||
genAddressReg<true>(instr, p, pos);
|
genAddressReg<true>(instr, p, pos);
|
||||||
emit(REX_CVTDQ2PD_XMM12, p, pos);
|
emit(REX_CVTDQ2PD_XMM12, p, pos);
|
||||||
emit(REX_ANDPS_XMM12, p, pos);
|
if (hasXOP) {
|
||||||
|
emit(REX_VPCMOV_XMM12, p, pos);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
emit(REX_ANDPS_XMM12, p, pos);
|
||||||
|
}
|
||||||
emit(REX_DIVPD, p, pos);
|
emit(REX_DIVPD, p, pos);
|
||||||
emitByte(0xe4 + 8 * dst, p, pos);
|
emitByte(0xe4 + 8 * dst, p, pos);
|
||||||
|
|
||||||
|
@ -1020,7 +1039,7 @@ namespace randomx {
|
||||||
emit(ROL_RAX, p, pos);
|
emit(ROL_RAX, p, pos);
|
||||||
emitByte(rotate, p, pos);
|
emitByte(rotate, p, pos);
|
||||||
}
|
}
|
||||||
if (vm_flags & RANDOMX_FLAG_RYZEN) {
|
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||||
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
|
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
|
@ -73,10 +73,12 @@ namespace randomx {
|
||||||
std::pair<const void*, const void*> mainLoopBounds;
|
std::pair<const void*, const void*> mainLoopBounds;
|
||||||
# endif
|
# endif
|
||||||
int32_t codePos;
|
int32_t codePos;
|
||||||
|
int32_t codePosFirst;
|
||||||
uint32_t vm_flags;
|
uint32_t vm_flags;
|
||||||
|
|
||||||
static bool BranchesWithin32B;
|
static bool BranchesWithin32B;
|
||||||
bool hasAVX;
|
bool hasAVX;
|
||||||
|
bool hasXOP;
|
||||||
|
|
||||||
static void applyTweaks();
|
static void applyTweaks();
|
||||||
void generateProgramPrologue(Program&, ProgramConfiguration&);
|
void generateProgramPrologue(Program&, ProgramConfiguration&);
|
||||||
|
|
|
@ -43,6 +43,7 @@
|
||||||
.global DECL(randomx_program_prologue_first_load)
|
.global DECL(randomx_program_prologue_first_load)
|
||||||
.global DECL(randomx_program_loop_begin)
|
.global DECL(randomx_program_loop_begin)
|
||||||
.global DECL(randomx_program_loop_load)
|
.global DECL(randomx_program_loop_load)
|
||||||
|
.global DECL(randomx_program_loop_load_xop)
|
||||||
.global DECL(randomx_program_start)
|
.global DECL(randomx_program_start)
|
||||||
.global DECL(randomx_program_read_dataset)
|
.global DECL(randomx_program_read_dataset)
|
||||||
.global DECL(randomx_program_read_dataset_ryzen)
|
.global DECL(randomx_program_read_dataset_ryzen)
|
||||||
|
@ -110,6 +111,9 @@ DECL(randomx_program_loop_begin):
|
||||||
DECL(randomx_program_loop_load):
|
DECL(randomx_program_loop_load):
|
||||||
#include "asm/program_loop_load.inc"
|
#include "asm/program_loop_load.inc"
|
||||||
|
|
||||||
|
DECL(randomx_program_loop_load_xop):
|
||||||
|
#include "asm/program_loop_load_xop.inc"
|
||||||
|
|
||||||
DECL(randomx_program_start):
|
DECL(randomx_program_start):
|
||||||
nop
|
nop
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,7 @@ PUBLIC randomx_program_prologue
|
||||||
PUBLIC randomx_program_prologue_first_load
|
PUBLIC randomx_program_prologue_first_load
|
||||||
PUBLIC randomx_program_loop_begin
|
PUBLIC randomx_program_loop_begin
|
||||||
PUBLIC randomx_program_loop_load
|
PUBLIC randomx_program_loop_load
|
||||||
|
PUBLIC randomx_program_loop_load_xop
|
||||||
PUBLIC randomx_program_start
|
PUBLIC randomx_program_start
|
||||||
PUBLIC randomx_program_read_dataset
|
PUBLIC randomx_program_read_dataset
|
||||||
PUBLIC randomx_program_read_dataset_ryzen
|
PUBLIC randomx_program_read_dataset_ryzen
|
||||||
|
@ -101,6 +102,10 @@ randomx_program_loop_load PROC
|
||||||
include asm/program_loop_load.inc
|
include asm/program_loop_load.inc
|
||||||
randomx_program_loop_load ENDP
|
randomx_program_loop_load ENDP
|
||||||
|
|
||||||
|
randomx_program_loop_load_xop PROC
|
||||||
|
include asm/program_loop_load_xop.inc
|
||||||
|
randomx_program_loop_load_xop ENDP
|
||||||
|
|
||||||
randomx_program_start PROC
|
randomx_program_start PROC
|
||||||
nop
|
nop
|
||||||
randomx_program_start ENDP
|
randomx_program_start ENDP
|
||||||
|
|
|
@ -35,6 +35,7 @@ extern "C" {
|
||||||
void randomx_program_prologue_first_load();
|
void randomx_program_prologue_first_load();
|
||||||
void randomx_program_loop_begin();
|
void randomx_program_loop_begin();
|
||||||
void randomx_program_loop_load();
|
void randomx_program_loop_load();
|
||||||
|
void randomx_program_loop_load_xop();
|
||||||
void randomx_program_start();
|
void randomx_program_start();
|
||||||
void randomx_program_read_dataset();
|
void randomx_program_read_dataset();
|
||||||
void randomx_program_read_dataset_ryzen();
|
void randomx_program_read_dataset_ryzen();
|
||||||
|
|
|
@ -49,7 +49,7 @@ enum randomx_flags {
|
||||||
RANDOMX_FLAG_FULL_MEM = 4,
|
RANDOMX_FLAG_FULL_MEM = 4,
|
||||||
RANDOMX_FLAG_JIT = 8,
|
RANDOMX_FLAG_JIT = 8,
|
||||||
RANDOMX_FLAG_1GB_PAGES = 16,
|
RANDOMX_FLAG_1GB_PAGES = 16,
|
||||||
RANDOMX_FLAG_RYZEN = 64,
|
RANDOMX_FLAG_AMD = 64,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -45,8 +45,12 @@ xmrig::RxVm::RxVm(RxDataset *dataset, uint8_t *scratchpad, bool softAes, xmrig::
|
||||||
m_flags |= RANDOMX_FLAG_JIT;
|
m_flags |= RANDOMX_FLAG_JIT;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((assembly == Assembly::RYZEN) || ((assembly == Assembly::AUTO) && (Cpu::info()->assembly() == Assembly::RYZEN))) {
|
if (assembly == Assembly::AUTO) {
|
||||||
m_flags |= RANDOMX_FLAG_RYZEN;
|
assembly = Cpu::info()->assembly();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((assembly == Assembly::RYZEN) || (assembly == Assembly::BULLDOZER)) {
|
||||||
|
m_flags |= RANDOMX_FLAG_AMD;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad);
|
m_vm = randomx_create_vm(static_cast<randomx_flags>(m_flags), dataset->cache() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad);
|
||||||
|
|
Loading…
Reference in a new issue