RandomX: optimized IMUL_RCP instruction

+0.4% on AMD Zen2
+0.3% on AMD Zen3
+0.1% on Intel SandyBridge
+0.3% on rx/wow on Intel SandyBridge
This commit is contained in:
SChernykh 2021-04-19 17:43:58 +02:00
parent 61d165a314
commit 3477f9fbc1
6 changed files with 55 additions and 9 deletions

View file

@ -0,0 +1,17 @@
;# Storage for up to 16 precomputed IMUL_RCP reciprocals.
;#
;# Each line below is one 11-byte slot of raw machine code:
;#   72, 185 (48 B9)   mov rcx, imm64   - followed by an 8-byte immediate, zero here
;#   81      (51)      push rcx
;# The zero immediates are placeholders: the JIT compiler writes a reciprocal
;# into a slot's immediate (2 bytes into the slot) and then steps 11 bytes to
;# the next slot. Do not change the size or layout of a slot without updating
;# those offsets in the JIT compiler.
;#
;# The n-th push (n = 0..15) stores its value at [initial rsp - 8*(n+1)], which
;# matches the rsp-relative displacement (248 - 8*n, i.e. -8 - 8*n as a signed
;# byte) that the JIT encodes into the generated "imul r, qword ptr [rsp+disp8]".
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
db 72, 185, 0, 0, 0, 0, 0, 0, 0, 0, 81
;# 16 pushes * 8 bytes = 128: put rsp back where it was. The pushed values are
;# left in memory just below rsp, at [rsp-8] .. [rsp-128].
;# NOTE(review): this relies on nothing overwriting the area below rsp while
;# the generated program runs - confirm for every supported platform.
add rsp, 128

View file

@ -428,7 +428,10 @@ namespace randomx {
xmrig::RxFix::setMainLoopBounds(mainLoopBounds);
# endif
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
imul_rcp_storage = code + (ADDR(randomx_program_imul_rcp_store) - codePrologue) + 2;
imul_rcp_storage_used = 0;
memcpy(imul_rcp_storage - 34, &pcfg.eMask, sizeof(pcfg.eMask));
codePos = codePosFirst;
prevCFROUND = 0;
@ -1012,13 +1015,24 @@ namespace randomx {
uint64_t divisor = instr.getImm32();
if (!isZeroOrPowerOf2(divisor)) {
*(uint32_t*)(p + pos) = 0xb848;
pos += 2;
emit64(randomx_reciprocal_fast(divisor), p, pos);
const uint32_t dst = instr.dst % RegistersCount;
emit32(0xc0af0f4c + (dst << 27), p, pos);
const uint64_t reciprocal = randomx_reciprocal_fast(divisor);
if (imul_rcp_storage_used < 16) {
*(uint64_t*)(imul_rcp_storage) = reciprocal;
*(uint64_t*)(p + pos) = 0x2444AF0F4Cull + (dst << 27) + (static_cast<uint64_t>(248 - imul_rcp_storage_used * 8) << 40);
++imul_rcp_storage_used;
imul_rcp_storage += 11;
pos += 6;
}
else {
*(uint32_t*)(p + pos) = 0xb848;
pos += 2;
emit64(reciprocal, p, pos);
emit32(0xc0af0f4c + (dst << 27), p, pos);
}
registerUsage[dst] = pos;
}

View file

@ -104,6 +104,9 @@ namespace randomx {
uint8_t* allocatedCode = nullptr;
size_t allocatedSize = 0;
uint8_t* imul_rcp_storage = nullptr;
uint32_t imul_rcp_storage_used = 0;
void generateProgramPrologue(Program&, ProgramConfiguration&);
void generateProgramEpilogue(Program&, ProgramConfiguration&);
template<bool rax>

View file

@ -41,6 +41,7 @@
.global DECL(randomx_prefetch_scratchpad_end)
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_prologue_first_load)
.global DECL(randomx_program_imul_rcp_store)
.global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_loop_load_xop)
@ -106,11 +107,15 @@ DECL(randomx_program_prologue_first_load):
nop
nop
nop
jmp DECL(randomx_program_loop_begin)
jmp DECL(randomx_program_imul_rcp_store)
.balign 64
#include "asm/program_xmm_constants.inc"
DECL(randomx_program_imul_rcp_store):
#include "asm/program_imul_rcp_store.inc"
jmp DECL(randomx_program_loop_begin)
.balign 64
DECL(randomx_program_loop_begin):
nop

View file

@ -32,6 +32,7 @@ PUBLIC randomx_prefetch_scratchpad
PUBLIC randomx_prefetch_scratchpad_end
PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load
PUBLIC randomx_program_imul_rcp_store
PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_loop_load_xop
@ -94,12 +95,17 @@ randomx_program_prologue_first_load PROC
nop
nop
nop
jmp randomx_program_loop_begin
jmp randomx_program_imul_rcp_store
randomx_program_prologue_first_load ENDP
ALIGN 64
include asm/program_xmm_constants.inc
randomx_program_imul_rcp_store PROC
include asm/program_imul_rcp_store.inc
jmp randomx_program_loop_begin
randomx_program_imul_rcp_store ENDP
ALIGN 64
randomx_program_loop_begin PROC
nop

View file

@ -33,6 +33,7 @@ extern "C" {
void randomx_prefetch_scratchpad_end();
void randomx_program_prologue();
void randomx_program_prologue_first_load();
void randomx_program_imul_rcp_store();
void randomx_program_loop_begin();
void randomx_program_loop_load();
void randomx_program_loop_load_xop();