Added asm-optimized code for AMD Bulldozer

This commit is contained in:
SChernykh 2018-10-21 18:29:03 +02:00
parent afeaabdca4
commit 4b91978af6
10 changed files with 211 additions and 7 deletions

View file

@ -43,7 +43,8 @@ static const char *coloredAsmNames[] = {
"\x1B[1;31mnone\x1B[0m", "\x1B[1;31mnone\x1B[0m",
"auto", "auto",
"\x1B[1;32mintel\x1B[0m", "\x1B[1;32mintel\x1B[0m",
"\x1B[1;32mryzen\x1B[0m" "\x1B[1;32mryzen\x1B[0m",
"\x1B[1;32mbulldozer\x1B[0m"
}; };

View file

@ -99,6 +99,7 @@ enum Assembly {
ASM_AUTO, ASM_AUTO,
ASM_INTEL, ASM_INTEL,
ASM_RYZEN, ASM_RYZEN,
ASM_BULLDOZER,
ASM_MAX ASM_MAX
}; };

View file

@ -84,7 +84,7 @@ Options:\n\
"\ "\
--max-cpu-usage=N maximum CPU usage for automatic threads mode (default 75)\n\ --max-cpu-usage=N maximum CPU usage for automatic threads mode (default 75)\n\
--safe safe adjust threads and av settings for current CPU\n\ --safe safe adjust threads and av settings for current CPU\n\
--asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen.\n\ --asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen, bulldozer.\n\
--print-time=N print hashrate report every N seconds\n\ --print-time=N print hashrate report every N seconds\n\
--api-port=N port for the miner API\n\ --api-port=N port for the miner API\n\
--api-access-token=T access token for API\n\ --api-access-token=T access token for API\n\

View file

@ -76,7 +76,7 @@ xmrig::AdvancedCpuInfo::AdvancedCpuInfo() :
m_aes = true; m_aes = true;
if (data.vendor == VENDOR_AMD) { if (data.vendor == VENDOR_AMD) {
m_assembly = ASM_RYZEN; m_assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
} }
else if (data.vendor == VENDOR_INTEL) { else if (data.vendor == VENDOR_INTEL) {
m_assembly = ASM_INTEL; m_assembly = ASM_INTEL;

View file

@ -40,7 +40,8 @@ static const char *asmNames[] = {
"none", "none",
"auto", "auto",
"intel", "intel",
"ryzen" "ryzen",
"bulldozer"
}; };

View file

@ -564,6 +564,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
#ifndef XMRIG_NO_ASM #ifndef XMRIG_NO_ASM
extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
@ -578,9 +579,12 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
if (ASM == xmrig::ASM_INTEL) { if (ASM == xmrig::ASM_INTEL) {
cnv2_mainloop_ivybridge_asm(ctx[0]); cnv2_mainloop_ivybridge_asm(ctx[0]);
} }
else { else if (ASM == xmrig::ASM_RYZEN) {
cnv2_mainloop_ryzen_asm(ctx[0]); cnv2_mainloop_ryzen_asm(ctx[0]);
} }
else {
cnv2_mainloop_bulldozer_asm(ctx[0]);
}
cn_implode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); cn_implode_scratchpad<ALGO, MEM, false>(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
xmrig::keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24); xmrig::keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);

View file

@ -9,6 +9,7 @@
#endif #endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
ALIGN 16 ALIGN 16
@ -27,6 +28,14 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
add rsp, 48 add rsp, 48
ret 0 ret 0
ALIGN 16
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_bulldozer.inc"
add rsp, 48
ret 0
ALIGN 16 ALIGN 16
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
sub rsp, 48 sub rsp, 48

View file

@ -1,6 +1,7 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_double_mainloop_sandybridge_asm
ALIGN 64 ALIGN 64
@ -15,6 +16,12 @@ cnv2_mainloop_ryzen_asm PROC
ret 0 ret 0
cnv2_mainloop_ryzen_asm ENDP cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cnv2_main_loop_bulldozer.inc
ret 0
cnv2_mainloop_bulldozer_asm ENDP
ALIGN 64 ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc INCLUDE cnv2_double_main_loop_sandybridge.inc

View file

@ -0,0 +1,180 @@
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
mov edi, 1023
shl rdi, 52
movq r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_bulldozer_endp
sqrt_fixup_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -64,7 +64,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
assert(variant >= VARIANT_0 && variant < VARIANT_MAX); assert(variant >= VARIANT_0 && variant < VARIANT_MAX);
# ifndef XMRIG_NO_ASM # ifndef XMRIG_NO_ASM
constexpr const size_t count = VARIANT_MAX * 10 * 3 + 3; constexpr const size_t count = VARIANT_MAX * 10 * 3 + 4;
# else # else
constexpr const size_t count = VARIANT_MAX * 10 * 3; constexpr const size_t count = VARIANT_MAX * 10 * 3;
# endif # endif
@ -249,6 +249,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a
# ifndef XMRIG_NO_ASM # ifndef XMRIG_NO_ASM
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL>, cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_RYZEN>, cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_RYZEN>,
cryptonight_single_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_BULLDOZER>,
cryptonight_double_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL> cryptonight_double_hash_asm<CRYPTONIGHT, VARIANT_2, ASM_INTEL>
# endif # endif
}; };
@ -447,7 +448,7 @@ size_t xmrig::CpuThread::fnIndex(Algo algorithm, AlgoVariant av, Variant variant
} }
if (av == AV_DOUBLE) { if (av == AV_DOUBLE) {
return offset + 2; return offset + 3;
} }
} }
# endif # endif