From 4b91978af63237d16b709583bc5c72bfa494e83c Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 21 Oct 2018 18:29:03 +0200 Subject: [PATCH] Added asm optimized code for AMD Bulldozer --- src/Summary.cpp | 3 +- src/common/xmrig.h | 1 + src/core/ConfigLoader_platform.h | 2 +- src/core/cpu/AdvancedCpuInfo.cpp | 2 +- src/crypto/Asm.cpp | 3 +- src/crypto/CryptoNight_x86.h | 6 +- src/crypto/asm/cnv2_main_loop.S | 9 + src/crypto/asm/cnv2_main_loop.asm | 7 + src/crypto/asm/cnv2_main_loop_bulldozer.inc | 180 ++++++++++++++++++++ src/workers/CpuThread.cpp | 5 +- 10 files changed, 211 insertions(+), 7 deletions(-) create mode 100644 src/crypto/asm/cnv2_main_loop_bulldozer.inc diff --git a/src/Summary.cpp b/src/Summary.cpp index 3c1d06a70..71c456a58 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -43,7 +43,8 @@ static const char *coloredAsmNames[] = { "\x1B[1;31mnone\x1B[0m", "auto", "\x1B[1;32mintel\x1B[0m", - "\x1B[1;32mryzen\x1B[0m" + "\x1B[1;32mryzen\x1B[0m", + "\x1B[1;32mbulldozer\x1B[0m" }; diff --git a/src/common/xmrig.h b/src/common/xmrig.h index 52650f0d2..840a9148e 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -99,6 +99,7 @@ enum Assembly { ASM_AUTO, ASM_INTEL, ASM_RYZEN, + ASM_BULLDOZER, ASM_MAX }; diff --git a/src/core/ConfigLoader_platform.h b/src/core/ConfigLoader_platform.h index 54546211a..807d80e25 100644 --- a/src/core/ConfigLoader_platform.h +++ b/src/core/ConfigLoader_platform.h @@ -84,7 +84,7 @@ Options:\n\ "\ --max-cpu-usage=N maximum CPU usage for automatic threads mode (default 75)\n\ --safe safe adjust threads and av settings for current CPU\n\ - --asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen.\n\ + --asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen, bulldozer.\n\ --print-time=N print hashrate report every N seconds\n\ --api-port=N port for the miner API\n\ --api-access-token=T access token for API\n\ diff --git a/src/core/cpu/AdvancedCpuInfo.cpp b/src/core/cpu/AdvancedCpuInfo.cpp index c1a9f8cd7..06df98ad4 100644 --- a/src/core/cpu/AdvancedCpuInfo.cpp +++ b/src/core/cpu/AdvancedCpuInfo.cpp @@ -76,7 +76,7 @@ xmrig::AdvancedCpuInfo::AdvancedCpuInfo() : m_aes = true; if (data.vendor == VENDOR_AMD) { - m_assembly = ASM_RYZEN; + m_assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER; } else if (data.vendor == VENDOR_INTEL) { m_assembly = ASM_INTEL; diff --git a/src/crypto/Asm.cpp b/src/crypto/Asm.cpp index 48c1beb8e..9ef04bf9e 100644 --- a/src/crypto/Asm.cpp +++ b/src/crypto/Asm.cpp @@ -40,7 +40,8 @@ static const char *asmNames[] = { "none", "auto", "intel", - "ryzen" + "ryzen", + "bulldozer" }; diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 8dcdd4144..dfcd12962 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -564,6 +564,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si #ifndef XMRIG_NO_ASM extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); @@ -578,9 +579,12 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ if (ASM == xmrig::ASM_INTEL) { cnv2_mainloop_ivybridge_asm(ctx[0]); } - else { + else if (ASM == xmrig::ASM_RYZEN) { cnv2_mainloop_ryzen_asm(ctx[0]); } + else { + cnv2_mainloop_bulldozer_asm(ctx[0]); + } cn_implode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); xmrig::keccakf(reinterpret_cast(ctx[0]->state), 24); diff --git a/src/crypto/asm/cnv2_main_loop.S b/src/crypto/asm/cnv2_main_loop.S index 4dbcbbda7..a23f24bf6 100644 --- a/src/crypto/asm/cnv2_main_loop.S +++ b/src/crypto/asm/cnv2_main_loop.S @@ -9,6 +9,7 @@ #endif .global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) ALIGN 16 @@ -27,6 +28,14 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): add rsp, 48 ret 0 +ALIGN 16 +FN_PREFIX(cnv2_mainloop_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): sub rsp, 48 diff --git a/src/crypto/asm/cnv2_main_loop.asm b/src/crypto/asm/cnv2_main_loop.asm index d95222675..557f1ab68 100644 --- a/src/crypto/asm/cnv2_main_loop.asm +++ b/src/crypto/asm/cnv2_main_loop.asm @@ -1,6 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm ALIGN 64 @@ -15,6 +16,12 @@ cnv2_mainloop_ryzen_asm PROC ret 0 cnv2_mainloop_ryzen_asm ENDP +ALIGN 64 +cnv2_mainloop_bulldozer_asm PROC + INCLUDE cnv2_main_loop_bulldozer.inc + ret 0 +cnv2_mainloop_bulldozer_asm ENDP + ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cnv2_double_main_loop_sandybridge.inc diff --git a/src/crypto/asm/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cnv2_main_loop_bulldozer.inc new file mode 100644 index 000000000..478976c03 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_bulldozer + shr rdi, 19 + +sqrt_fixup_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_bulldozer_endp + +sqrt_fixup_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_bulldozer_ret + +cnv2_main_loop_bulldozer_endp: diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 4b5281486..aee26b0da 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -64,7 +64,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a assert(variant >= VARIANT_0 && variant < VARIANT_MAX); # ifndef XMRIG_NO_ASM - constexpr const size_t count = VARIANT_MAX * 10 * 3 + 3; + constexpr const size_t count = VARIANT_MAX * 10 * 3 + 4; # else constexpr const size_t count = VARIANT_MAX * 10 * 3; # endif @@ -249,6 +249,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a # ifndef XMRIG_NO_ASM cryptonight_single_hash_asm, cryptonight_single_hash_asm, + cryptonight_single_hash_asm, cryptonight_double_hash_asm # endif }; @@ -447,7 +448,7 @@ size_t xmrig::CpuThread::fnIndex(Algo algorithm, AlgoVariant av, Variant variant } if (av == AV_DOUBLE) { - return offset + 2; + return offset + 3; } } # endif