From 8b9d5cff91fe8b1e2f169e11a5a1267644ebfb3d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 14 Jan 2019 15:34:55 +0100 Subject: [PATCH 1/2] Added ASM code patching when loading For CNv2 variants with different iterations and memory size. --- src/Mem.h | 3 + src/Mem_unix.cpp | 16 + src/Mem_win.cpp | 12 + src/crypto/CryptoNight_x86.h | 8 +- .../cn_half_double_main_loop_sandybridge.inc | 410 ------------------ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 -------- .../cn_half/cn_half_main_loop_ivybridge.inc | 186 -------- .../asm/cn_half/cn_half_main_loop_ryzen.inc | 179 -------- src/crypto/asm/cn_main_loop.S | 42 +- src/crypto/asm/cn_main_loop.asm | 32 +- .../cn_half_double_main_loop_sandybridge.inc | 410 ------------------ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 -------- .../cn_half/cn_half_main_loop_ivybridge.inc | 186 -------- .../win64/cn_half/cn_half_main_loop_ryzen.inc | 179 -------- src/crypto/asm/win64/cn_main_loop.S | 31 +- src/crypto/asm/win64/cn_main_loop.asm | 32 +- src/workers/CpuThread.cpp | 56 +++ src/workers/CpuThread.h | 6 + src/workers/Workers.cpp | 4 + 19 files changed, 118 insertions(+), 2034 deletions(-) delete mode 100644 src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc diff --git a/src/Mem.h b/src/Mem.h index 6fd18fc1..0aa6eb4d 100644 --- a/src/Mem.h +++ b/src/Mem.h @@ -59,6 +59,9 @@ public: static void init(bool enabled); static void release(cryptonight_ctx **ctx, size_t 
count, MemInfo &info); + static void* allocate_executable_memory(size_t size); + static void FlushInstructionCache(void* p, size_t size); + static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; } private: diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp index c1aa0fb1..af7791bd 100644 --- a/src/Mem_unix.cpp +++ b/src/Mem_unix.cpp @@ -87,3 +87,19 @@ void Mem::release(MemInfo &info) _mm_free(info.memory); } } + + +void* Mem::allocate_executable_memory(size_t size) +{ +# if defined(__APPLE__) + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); +# else + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +# endif +} + + +void Mem::FlushInstructionCache(void* p, size_t size) +{ + __builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size); +} diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp index 2bfcc3b0..2fad191d 100644 --- a/src/Mem_win.cpp +++ b/src/Mem_win.cpp @@ -182,3 +182,15 @@ void Mem::release(MemInfo &info) _mm_free(info.memory); } } + + +void* Mem::allocate_executable_memory(size_t size) +{ + return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +} + + +void Mem::FlushInstructionCache(void* p, size_t size) +{ + ::FlushInstructionCache(GetCurrentProcess(), p, size); +} diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index fef3dc19..0c3fd52a 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -570,10 +570,10 @@ extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); -extern "C" void cn_half_mainloop_ivybridge_asm(cryptonight_ctx *ctx); -extern "C" void cn_half_mainloop_ryzen_asm(cryptonight_ctx *ctx); -extern "C" void cn_half_mainloop_bulldozer_asm(cryptonight_ctx *ctx); -extern "C" void 
cn_half_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm; +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm; +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm; +extern xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm; template diff --git a/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc deleted file mode 100644 index 2497ef95..00000000 --- a/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - 
punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 16 -main_loop_double_half_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD 
PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - 
cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_half_sandybridge -div_fix_1_ret_half_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_half_sandybridge -div_fix_2_ret_half_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_half_sandybridge -sqrt_fix_1_ret_half_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_half_sandybridge -sqrt_fix_2_ret_half_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_half_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD 
PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_half_sandybridge_endp - -div_fix_1_half_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_half_sandybridge - -div_fix_2_half_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_half_sandybridge - -sqrt_fix_1_half_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_half_sandybridge - -sqrt_fix_2_half_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_half_sandybridge - -cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc b/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc deleted file mode 100644 index 460f9b66..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR 
[rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_half_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movq r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movq xmm0, rax - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_bulldozer - shr rdi, 19 - -sqrt_fixup_half_bulldozer_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 
48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_half_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_bulldozer_endp - -sqrt_fixup_half_bulldozer: - movq r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_bulldozer_ret - -cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc deleted file mode 100644 index 51b82bec..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, 
QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - xor eax, eax - mov QWORD PTR [rsp+16], rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 16 -main_loop_half_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - psubq xmm3, XMMWORD PTR [rsp+16] - movq rdx, xmm3 - test edx, 524287 - je sqrt_fixup_half_ivybridge - psrlq xmm3, 19 -sqrt_fixup_half_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - 
add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne main_loop_half_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_half_ivybridge_endp - -sqrt_fixup_half_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp sqrt_fixup_half_ivybridge_ret - -cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc b/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc deleted file mode 100644 index 8da3d8c4..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, 
QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -main_loop_half_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_ryzen - shr rdi, 19 - -sqrt_fixup_half_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 
48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne main_loop_half_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_ryzen_endp - -sqrt_fixup_half_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_ryzen_ret - -cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 95905d34..417fd414 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -12,11 +12,6 @@ .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_half_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_half_mainloop_ryzen_asm) -.global FN_PREFIX(cn_half_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_half_double_mainloop_sandybridge_asm) - ALIGN 16 FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 @@ -24,6 +19,7 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm): #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_mainloop_ryzen_asm): @@ -32,6 +28,7 @@ 
FN_PREFIX(cnv2_mainloop_ryzen_asm): #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_mainloop_bulldozer_asm): @@ -40,6 +37,7 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm): #include "cn2/cnv2_main_loop_bulldozer.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): @@ -49,36 +47,4 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_ivybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_ivybridge.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_ryzen_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_ryzen.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_bulldozer_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_bulldozer.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_double_mainloop_sandybridge_asm): - sub rsp, 48 - mov rcx, rdi - mov rdx, rsi - #include "cn_half/cn_half_double_main_loop_sandybridge.inc" - add rsp, 48 - ret 0 + nop;nop;nop;nop; diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index fefb77a3..9d4cede0 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm -PUBLIC cn_half_mainloop_ivybridge_asm -PUBLIC cn_half_mainloop_ryzen_asm -PUBLIC cn_half_mainloop_bulldozer_asm -PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm 
PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 + nop;nop;nop;nop; cnv2_double_mainloop_sandybridge_asm ENDP -ALIGN 64 -cn_half_mainloop_ivybridge_asm PROC - INCLUDE cn_half/cn_half_main_loop_ivybridge.inc - ret 0 -cn_half_mainloop_ivybridge_asm ENDP - -ALIGN 64 -cn_half_mainloop_ryzen_asm PROC - INCLUDE cn_half/cn_half_main_loop_ryzen.inc - ret 0 -cn_half_mainloop_ryzen_asm ENDP - -ALIGN 64 -cn_half_mainloop_bulldozer_asm PROC - INCLUDE cn_half/cn_half_main_loop_bulldozer.inc - ret 0 -cn_half_mainloop_bulldozer_asm ENDP - -ALIGN 64 -cn_half_double_mainloop_sandybridge_asm PROC - INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc - ret 0 -cn_half_double_mainloop_sandybridge_asm ENDP - _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc deleted file mode 100644 index 0c207f21..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movd xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD 
PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movd xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movd xmm5, QWORD PTR [r8+104] - movd xmm7, rax - - mov eax, 1 - shl rax, 52 - movd xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movd xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movd xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movd xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movd xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movd xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movd xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movd xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movd xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 16 -main_loop_double_half_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movd xmm0, r11 - movd xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movd r11, xmm9 - mov edx, r11d - 
and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movd xmm0, rbp - movd xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movd rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movd rdx, xmm5 - shl rdx, 32 - movd rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movd xmm0, rdx - xor rdx, [r11+r13] - movd xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movd r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movd r11, xmm0 - psrldq xmm1, 8 - movd r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, 
xmm13 - movd rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movd rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movd r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_half_sandybridge -div_fix_1_ret_half_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_half_sandybridge -div_fix_2_ret_half_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movd r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_half_sandybridge -sqrt_fix_1_ret_half_sandybridge: - - movd r9, xmm10 - psrldq xmm1, 8 - movd r8, xmm1 - test r8, 524287 - je sqrt_fix_2_half_sandybridge -sqrt_fix_2_ret_half_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movd xmm0, rax - movd xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - 
movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_half_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_half_sandybridge_endp - -div_fix_1_half_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_half_sandybridge - -div_fix_2_half_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_half_sandybridge - -sqrt_fix_1_half_sandybridge: - movd r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movd xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_half_sandybridge - -sqrt_fix_2_half_sandybridge: - psrldq xmm3, 8 - movd r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movd xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_half_sandybridge - -cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc deleted file mode 100644 index 
6597c791..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_half_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - 
lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_bulldozer - shr rdi, 19 - -sqrt_fixup_half_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_half_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_bulldozer_endp - -sqrt_fixup_half_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_bulldozer_ret - -cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc deleted file mode 100644 index c769f827..00000000 --- 
a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movd xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movd xmm5, rax - - xor eax, eax - mov QWORD PTR [rsp+16], rax - - mov ax, 1023 - shl rax, 52 - movd xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movd xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 16 -main_loop_half_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movd xmm0, r11 - movd xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movd rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movd rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD 
PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movd rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movd xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - psubq xmm3, XMMWORD PTR [rsp+16] - movd rdx, xmm3 - test edx, 524287 - je sqrt_fixup_half_ivybridge - psrlq xmm3, 19 -sqrt_fixup_half_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movd xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movd xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne main_loop_half_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_half_ivybridge_endp - -sqrt_fixup_half_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movd xmm3, rdx - jmp sqrt_fixup_half_ivybridge_ret - -cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc 
deleted file mode 100644 index 0744aaa4..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -main_loop_half_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm0, r11 - movd xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movd r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR 
[r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movd rax, xmm0 - - div r9 - movd xmm0, rax - movd xmm1, rdx - punpckldq xmm0, xmm1 - movd r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_ryzen - shr rdi, 19 - -sqrt_fixup_half_ryzen_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne main_loop_half_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_ryzen_endp - -sqrt_fixup_half_ryzen: - movd r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_ryzen_ret - -cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 691209f8..4caaa1a1 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ 
b/src/crypto/asm/win64/cn_main_loop.S @@ -6,47 +6,26 @@ .global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm -.global cn_half_mainloop_ivybridge_asm -.global cn_half_mainloop_ryzen_asm -.global cn_half_mainloop_bulldozer_asm -.global cn_half_double_mainloop_sandybridge_asm - ALIGN 16 cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_mainloop_ryzen_asm: #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_mainloop_bulldozer_asm: - #include "../cn2/cnv2_main_loop_bulldozer.inc" + #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 - -ALIGN 16 -cn_half_mainloop_ivybridge_asm: - #include "../cn_half/cn_half_main_loop_ivybridge.inc" - ret 0 - -ALIGN 16 -cn_half_mainloop_ryzen_asm: - #include "../cn_half/cn_half_main_loop_ryzen.inc" - ret 0 - -ALIGN 16 -cn_half_mainloop_bulldozer_asm: - #include "../cn_half/cn_half_main_loop_bulldozer.inc" - ret 0 - -ALIGN 16 -cn_half_double_mainloop_sandybridge_asm: - #include "../cn_half/cn_half_double_main_loop_sandybridge.inc" - ret 0 + nop;nop;nop;nop; diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index fefb77a3..9d4cede0 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm -PUBLIC cn_half_mainloop_ivybridge_asm -PUBLIC cn_half_mainloop_ryzen_asm -PUBLIC cn_half_mainloop_bulldozer_asm -PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE 
cn2/cnv2_main_loop_ryzen.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 + nop;nop;nop;nop; cnv2_double_mainloop_sandybridge_asm ENDP -ALIGN 64 -cn_half_mainloop_ivybridge_asm PROC - INCLUDE cn_half/cn_half_main_loop_ivybridge.inc - ret 0 -cn_half_mainloop_ivybridge_asm ENDP - -ALIGN 64 -cn_half_mainloop_ryzen_asm PROC - INCLUDE cn_half/cn_half_main_loop_ryzen.inc - ret 0 -cn_half_mainloop_ryzen_asm ENDP - -ALIGN 64 -cn_half_mainloop_bulldozer_asm PROC - INCLUDE cn_half/cn_half_main_loop_bulldozer.inc - ret 0 -cn_half_mainloop_bulldozer_asm ENDP - -ALIGN 64 -cn_half_double_mainloop_sandybridge_asm PROC - INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc - ret 0 -cn_half_double_mainloop_sandybridge_asm ENDP - _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 9a98b4e3..8f3457dc 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -31,6 +31,7 @@ #include "crypto/Asm.h" #include "rapidjson/document.h" #include "workers/CpuThread.h" +#include "Mem.h" #if defined(XMRIG_ARM) @@ -54,6 +55,61 @@ xmrig::CpuThread::CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiw } +#ifndef XMRIG_NO_ASM +template +static void patchCode(T& dst, U src, const uint32_t iterations, const uint32_t mask) +{ + const uint8_t* p = reinterpret_cast(src); + + size_t size = 0; + while (*(uint32_t*)(p + size) != 0x90909090) { + ++size; + } + + memcpy((void*) dst, (const void*) src, size); + + uint8_t* patched_data = reinterpret_cast(dst); + for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) { + switch (*(uint32_t*)(patched_data + i)) { + case xmrig::CRYPTONIGHT_ITER: + *(uint32_t*)(patched_data + i) = iterations; + break; + case xmrig::CRYPTONIGHT_MASK: + 
*(uint32_t*)(patched_data + i) = mask; + break; + } + } +} + +extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); + +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm = nullptr; +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm = nullptr; +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm = nullptr; +xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm = nullptr; + +void xmrig::CpuThread::patchAsmVariants() +{ + const int allocation_size = 65536; + uint8_t* base = reinterpret_cast(Mem::allocate_executable_memory(allocation_size)); + + cn_half_mainloop_ivybridge_asm = reinterpret_cast (base + 0x0000); + cn_half_mainloop_ryzen_asm = reinterpret_cast (base + 0x1000); + cn_half_mainloop_bulldozer_asm = reinterpret_cast (base + 0x2000); + cn_half_double_mainloop_sandybridge_asm = reinterpret_cast (base + 0x3000); + + patchCode(cn_half_mainloop_ivybridge_asm, cnv2_mainloop_ivybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_mainloop_ryzen_asm, cnv2_mainloop_ryzen_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_mainloop_bulldozer_asm, cnv2_mainloop_bulldozer_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_double_mainloop_sandybridge_asm, cnv2_double_mainloop_sandybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + + Mem::FlushInstructionCache(base, allocation_size); +} +#endif + bool xmrig::CpuThread::isSoftAES(AlgoVariant av) { return av == AV_SINGLE_SOFT || av == AV_DOUBLE_SOFT || av > AV_PENTA; diff --git a/src/workers/CpuThread.h b/src/workers/CpuThread.h index 71c3173d..e9d764da 100644 --- a/src/workers/CpuThread.h +++ 
b/src/workers/CpuThread.h @@ -60,6 +60,12 @@ public: CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiway multiway, int64_t affinity, int priority, bool softAES, bool prefetch, Assembly assembly); typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx); + typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx); + typedef void (*cn_mainloop_double_fun)(cryptonight_ctx *ctx1, cryptonight_ctx *ctx2); + +# ifndef XMRIG_NO_ASM + static void patchAsmVariants(); +# endif static bool isSoftAES(AlgoVariant av); static cn_hash_fun fn(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly); diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp index a5109e9b..e285005e 100644 --- a/src/workers/Workers.cpp +++ b/src/workers/Workers.cpp @@ -168,6 +168,10 @@ void Workers::start(xmrig::Controller *controller) LOG_NOTICE("--------------------------------------------------------------------------"); # endif +# ifndef XMRIG_NO_ASM + xmrig::CpuThread::patchAsmVariants(); +# endif + m_controller = controller; const std::vector &threads = controller->config()->threads(); From 56cacbd5bcfc6e306ebab90190c1fa7407325b00 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 14 Jan 2019 16:38:28 +0100 Subject: [PATCH 2/2] Fixes for Visual Studio --- src/crypto/asm/cn_main_loop.S | 20 ++++++++++++++++---- src/crypto/asm/cn_main_loop.asm | 20 ++++++++++++++++---- src/crypto/asm/win64/cn_main_loop.S | 20 ++++++++++++++++---- src/crypto/asm/win64/cn_main_loop.asm | 20 ++++++++++++++++---- src/workers/CpuThread.cpp | 7 ++++++- 5 files changed, 70 insertions(+), 17 deletions(-) diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 417fd414..e9ac64f5 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -19,7 +19,10 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm): #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 
FN_PREFIX(cnv2_mainloop_ryzen_asm): @@ -28,7 +31,10 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 FN_PREFIX(cnv2_mainloop_bulldozer_asm): @@ -37,7 +43,10 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm): #include "cn2/cnv2_main_loop_bulldozer.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): @@ -47,4 +56,7 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index 9d4cede0..9c8a6ea9 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -8,28 +8,40 @@ ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_double_mainloop_sandybridge_asm ENDP _TEXT_CNV2_MAINLOOP ENDS diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 4caaa1a1..ea5a63b8 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -10,22 +10,34 @@ ALIGN 16 cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_mainloop_ryzen_asm: #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 - 
nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_mainloop_bulldozer_asm: #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index 9d4cede0..9c8a6ea9 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -8,28 +8,40 @@ ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_double_mainloop_sandybridge_asm ENDP _TEXT_CNV2_MAINLOOP ENDS diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 8f3457dc..cf366860 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -57,10 +57,15 @@ xmrig::CpuThread::CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiw #ifndef XMRIG_NO_ASM template -static void patchCode(T& dst, U src, const uint32_t iterations, const uint32_t mask) +static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask) { const uint8_t* p = reinterpret_cast(src); + // Workaround for Visual Studio placing trampoline in debug builds + if (p[0] == 0xE9) { + p += *(int32_t*)(p + 1) + 5; + } + size_t size = 0; while (*(uint32_t*)(p + size) != 0x90909090) { ++size;