mirror of
https://github.com/xmrig/xmrig.git
synced 2024-11-18 18:11:05 +00:00
Added ASM code patching when loading
For CNv2 variants with different iterations and memory size.
This commit is contained in:
parent
492449e9fb
commit
8b9d5cff91
19 changed files with 118 additions and 2034 deletions
|
@ -59,6 +59,9 @@ public:
|
||||||
static void init(bool enabled);
|
static void init(bool enabled);
|
||||||
static void release(cryptonight_ctx **ctx, size_t count, MemInfo &info);
|
static void release(cryptonight_ctx **ctx, size_t count, MemInfo &info);
|
||||||
|
|
||||||
|
// Allocates `size` bytes of memory that is readable, writable and executable
// (implemented with mmap in the POSIX build and VirtualAlloc in the Windows build).
static void* allocate_executable_memory(size_t size);
|
||||||
|
// Flushes the instruction cache for the `size` bytes starting at `p`
// (__builtin___clear_cache in the POSIX build, Win32 ::FlushInstructionCache in the Windows build).
static void FlushInstructionCache(void* p, size_t size);
|
||||||
|
|
||||||
static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; }
|
static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -87,3 +87,19 @@ void Mem::release(MemInfo &info)
|
||||||
_mm_free(info.memory);
|
_mm_free(info.memory);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Maps `size` bytes of private, anonymous memory that is readable, writable
 * and executable.
 *
 * @param size Number of bytes to map.
 * @return Pointer to the start of the mapping, or nullptr when mmap() fails.
 *
 * mmap() signals failure with MAP_FAILED, which is not a null pointer; the
 * result is normalised here so that callers can detect failure with a plain
 * null check.
 */
void* Mem::allocate_executable_memory(size_t size)
{
    // Apple builds use the MAP_ANON flag name; every other build uses MAP_ANONYMOUS.
#   if defined(__APPLE__)
    void* const mem = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
#   else
    void* const mem = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
#   endif

    return mem == MAP_FAILED ? nullptr : mem;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Clears the instruction cache for the byte range [p, p + size) using the
 * compiler builtin __builtin___clear_cache.
 *
 * @param p    Start of the range.
 * @param size Length of the range in bytes.
 */
void Mem::FlushInstructionCache(void* p, size_t size)
{
    char* const rangeBegin = reinterpret_cast<char*>(p);
    char* const rangeEnd   = rangeBegin + size;

    __builtin___clear_cache(rangeBegin, rangeEnd);
}
|
||||||
|
|
|
@ -182,3 +182,15 @@ void Mem::release(MemInfo &info)
|
||||||
_mm_free(info.memory);
|
_mm_free(info.memory);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void* Mem::allocate_executable_memory(size_t size)
|
||||||
|
{
|
||||||
|
return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void Mem::FlushInstructionCache(void* p, size_t size)
|
||||||
|
{
|
||||||
|
::FlushInstructionCache(GetCurrentProcess(), p, size);
|
||||||
|
}
|
||||||
|
|
|
@ -570,10 +570,10 @@ extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx);
|
||||||
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
|
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
|
||||||
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
|
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
|
||||||
|
|
||||||
extern "C" void cn_half_mainloop_ivybridge_asm(cryptonight_ctx *ctx);
|
extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm;
|
||||||
extern "C" void cn_half_mainloop_ryzen_asm(cryptonight_ctx *ctx);
|
extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm;
|
||||||
extern "C" void cn_half_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
|
extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm;
|
||||||
extern "C" void cn_half_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
|
extern xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm;
|
||||||
|
|
||||||
|
|
||||||
template<xmrig::Algo ALGO, xmrig::Variant VARIANT, xmrig::Assembly ASM>
|
template<xmrig::Algo ALGO, xmrig::Variant VARIANT, xmrig::Assembly ASM>
|
||||||
|
|
|
@ -1,410 +0,0 @@
|
||||||
mov rax, rsp
|
|
||||||
push rbx
|
|
||||||
push rbp
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 184
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp+272]
|
|
||||||
mov DWORD PTR [rsp+276], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+276]
|
|
||||||
|
|
||||||
mov r13, QWORD PTR [rcx+224]
|
|
||||||
mov r9, rdx
|
|
||||||
mov r10, QWORD PTR [rcx+32]
|
|
||||||
mov r8, rcx
|
|
||||||
xor r10, QWORD PTR [rcx]
|
|
||||||
mov r14d, 262144
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rsi, QWORD PTR [rdx+224]
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
mov rdi, QWORD PTR [r9+32]
|
|
||||||
xor rdi, QWORD PTR [r9]
|
|
||||||
mov rbp, QWORD PTR [r9+40]
|
|
||||||
xor rbp, QWORD PTR [r9+8]
|
|
||||||
movq xmm0, rdx
|
|
||||||
movaps XMMWORD PTR [rax-88], xmm6
|
|
||||||
movaps XMMWORD PTR [rax-104], xmm7
|
|
||||||
movaps XMMWORD PTR [rax-120], xmm8
|
|
||||||
movaps XMMWORD PTR [rsp+112], xmm9
|
|
||||||
movaps XMMWORD PTR [rsp+96], xmm10
|
|
||||||
movaps XMMWORD PTR [rsp+80], xmm11
|
|
||||||
movaps XMMWORD PTR [rsp+64], xmm12
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm13
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm14
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm15
|
|
||||||
mov rdx, r10
|
|
||||||
movq xmm4, QWORD PTR [r8+96]
|
|
||||||
and edx, 2097136
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
xorps xmm13, xmm13
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r8+72]
|
|
||||||
movq xmm5, QWORD PTR [r8+104]
|
|
||||||
movq xmm7, rax
|
|
||||||
|
|
||||||
mov eax, 1
|
|
||||||
shl rax, 52
|
|
||||||
movq xmm14, rax
|
|
||||||
punpcklqdq xmm14, xmm14
|
|
||||||
|
|
||||||
mov eax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movq xmm12, rax
|
|
||||||
punpcklqdq xmm12, xmm12
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [r8+80]
|
|
||||||
xor rax, QWORD PTR [r8+64]
|
|
||||||
punpcklqdq xmm7, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
mov rcx, QWORD PTR [r9+56]
|
|
||||||
xor rcx, QWORD PTR [r9+24]
|
|
||||||
movq xmm3, rax
|
|
||||||
mov rax, QWORD PTR [r9+48]
|
|
||||||
xor rax, QWORD PTR [r9+16]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
mov QWORD PTR [rsp], r13
|
|
||||||
mov rcx, QWORD PTR [r9+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
movq xmm6, rax
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
punpcklqdq xmm6, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
mov QWORD PTR [rsp+256], r10
|
|
||||||
mov rcx, rdi
|
|
||||||
mov QWORD PTR [rsp+264], r11
|
|
||||||
movq xmm8, rax
|
|
||||||
and ecx, 2097136
|
|
||||||
punpcklqdq xmm8, xmm0
|
|
||||||
movq xmm0, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
movq xmm0, QWORD PTR [r9+104]
|
|
||||||
lea r8, QWORD PTR [rcx+rsi]
|
|
||||||
movdqu xmm11, XMMWORD PTR [r8]
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
lea r9, QWORD PTR [rdx+r13]
|
|
||||||
movdqu xmm15, XMMWORD PTR [r9]
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_double_half_sandybridge:
|
|
||||||
movdqu xmm9, xmm15
|
|
||||||
mov eax, edx
|
|
||||||
mov ebx, edx
|
|
||||||
xor eax, 16
|
|
||||||
xor ebx, 32
|
|
||||||
xor edx, 48
|
|
||||||
|
|
||||||
movq xmm0, r11
|
|
||||||
movq xmm2, r10
|
|
||||||
punpcklqdq xmm2, xmm0
|
|
||||||
aesenc xmm9, xmm2
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
|
||||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
paddq xmm1, xmm2
|
|
||||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
|
||||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
|
||||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
|
||||||
paddq xmm0, xmm3
|
|
||||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
|
||||||
|
|
||||||
movq r11, xmm9
|
|
||||||
mov edx, r11d
|
|
||||||
and edx, 2097136
|
|
||||||
movdqa xmm0, xmm9
|
|
||||||
pxor xmm0, xmm7
|
|
||||||
movdqu XMMWORD PTR [r9], xmm0
|
|
||||||
|
|
||||||
lea rbx, QWORD PTR [rdx+r13]
|
|
||||||
mov r10, QWORD PTR [rdx+r13]
|
|
||||||
|
|
||||||
movdqu xmm10, xmm11
|
|
||||||
movq xmm0, rbp
|
|
||||||
movq xmm11, rdi
|
|
||||||
punpcklqdq xmm11, xmm0
|
|
||||||
aesenc xmm10, xmm11
|
|
||||||
|
|
||||||
mov eax, ecx
|
|
||||||
mov r12d, ecx
|
|
||||||
xor eax, 16
|
|
||||||
xor r12d, 32
|
|
||||||
xor ecx, 48
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
|
||||||
paddq xmm0, xmm6
|
|
||||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
|
||||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
|
||||||
paddq xmm1, xmm11
|
|
||||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
|
||||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
|
||||||
paddq xmm0, xmm8
|
|
||||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
|
||||||
|
|
||||||
movq rcx, xmm10
|
|
||||||
and ecx, 2097136
|
|
||||||
|
|
||||||
movdqa xmm0, xmm10
|
|
||||||
pxor xmm0, xmm6
|
|
||||||
movdqu XMMWORD PTR [r8], xmm0
|
|
||||||
mov r12, QWORD PTR [rcx+rsi]
|
|
||||||
|
|
||||||
mov r9, QWORD PTR [rbx+8]
|
|
||||||
|
|
||||||
xor edx, 16
|
|
||||||
mov r8d, edx
|
|
||||||
mov r15d, edx
|
|
||||||
|
|
||||||
movq rdx, xmm5
|
|
||||||
shl rdx, 32
|
|
||||||
movq rax, xmm4
|
|
||||||
xor rdx, rax
|
|
||||||
xor r10, rdx
|
|
||||||
mov rax, r10
|
|
||||||
mul r11
|
|
||||||
mov r11d, r8d
|
|
||||||
xor r11d, 48
|
|
||||||
movq xmm0, rdx
|
|
||||||
xor rdx, [r11+r13]
|
|
||||||
movq xmm1, rax
|
|
||||||
xor rax, [r11+r13+8]
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
|
||||||
xor r8d, 32
|
|
||||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
paddq xmm1, xmm2
|
|
||||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
|
||||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
|
||||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
|
||||||
paddq xmm0, xmm3
|
|
||||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
|
||||||
|
|
||||||
mov r11, QWORD PTR [rsp+256]
|
|
||||||
add r11, rdx
|
|
||||||
mov rdx, QWORD PTR [rsp+264]
|
|
||||||
add rdx, rax
|
|
||||||
mov QWORD PTR [rbx], r11
|
|
||||||
xor r11, r10
|
|
||||||
mov QWORD PTR [rbx+8], rdx
|
|
||||||
xor rdx, r9
|
|
||||||
mov QWORD PTR [rsp+256], r11
|
|
||||||
and r11d, 2097136
|
|
||||||
mov QWORD PTR [rsp+264], rdx
|
|
||||||
mov QWORD PTR [rsp+8], r11
|
|
||||||
lea r15, QWORD PTR [r11+r13]
|
|
||||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
|
||||||
lea r13, QWORD PTR [rsi+rcx]
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movaps xmm2, xmm13
|
|
||||||
movq r10, xmm0
|
|
||||||
psllq xmm5, 1
|
|
||||||
shl r10, 32
|
|
||||||
movdqa xmm0, xmm9
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movdqa xmm1, xmm10
|
|
||||||
movq r11, xmm0
|
|
||||||
psrldq xmm1, 8
|
|
||||||
movq r8, xmm1
|
|
||||||
psrldq xmm4, 8
|
|
||||||
movaps xmm0, xmm13
|
|
||||||
movq rax, xmm4
|
|
||||||
xor r10, rax
|
|
||||||
movaps xmm1, xmm13
|
|
||||||
xor r10, r12
|
|
||||||
lea rax, QWORD PTR [r11+1]
|
|
||||||
shr rax, 1
|
|
||||||
movdqa xmm3, xmm9
|
|
||||||
punpcklqdq xmm3, xmm10
|
|
||||||
paddq xmm5, xmm3
|
|
||||||
movq rdx, xmm5
|
|
||||||
psrldq xmm5, 8
|
|
||||||
cvtsi2sd xmm2, rax
|
|
||||||
or edx, -2147483647
|
|
||||||
lea rax, QWORD PTR [r8+1]
|
|
||||||
shr rax, 1
|
|
||||||
movq r9, xmm5
|
|
||||||
cvtsi2sd xmm0, rax
|
|
||||||
or r9d, -2147483647
|
|
||||||
cvtsi2sd xmm1, rdx
|
|
||||||
unpcklpd xmm2, xmm0
|
|
||||||
movaps xmm0, xmm13
|
|
||||||
cvtsi2sd xmm0, r9
|
|
||||||
unpcklpd xmm1, xmm0
|
|
||||||
divpd xmm2, xmm1
|
|
||||||
paddq xmm2, xmm14
|
|
||||||
cvttsd2si rax, xmm2
|
|
||||||
psrldq xmm2, 8
|
|
||||||
mov rbx, rax
|
|
||||||
imul rax, rdx
|
|
||||||
sub r11, rax
|
|
||||||
js div_fix_1_half_sandybridge
|
|
||||||
div_fix_1_ret_half_sandybridge:
|
|
||||||
|
|
||||||
cvttsd2si rdx, xmm2
|
|
||||||
mov rax, rdx
|
|
||||||
imul rax, r9
|
|
||||||
movd xmm2, r11d
|
|
||||||
movd xmm4, ebx
|
|
||||||
sub r8, rax
|
|
||||||
js div_fix_2_half_sandybridge
|
|
||||||
div_fix_2_ret_half_sandybridge:
|
|
||||||
|
|
||||||
movd xmm1, r8d
|
|
||||||
movd xmm0, edx
|
|
||||||
punpckldq xmm2, xmm1
|
|
||||||
punpckldq xmm4, xmm0
|
|
||||||
punpckldq xmm4, xmm2
|
|
||||||
paddq xmm3, xmm4
|
|
||||||
movdqa xmm0, xmm3
|
|
||||||
psrlq xmm0, 12
|
|
||||||
paddq xmm0, xmm12
|
|
||||||
sqrtpd xmm1, xmm0
|
|
||||||
movq r9, xmm1
|
|
||||||
movdqa xmm5, xmm1
|
|
||||||
psrlq xmm5, 19
|
|
||||||
test r9, 524287
|
|
||||||
je sqrt_fix_1_half_sandybridge
|
|
||||||
sqrt_fix_1_ret_half_sandybridge:
|
|
||||||
|
|
||||||
movq r9, xmm10
|
|
||||||
psrldq xmm1, 8
|
|
||||||
movq r8, xmm1
|
|
||||||
test r8, 524287
|
|
||||||
je sqrt_fix_2_half_sandybridge
|
|
||||||
sqrt_fix_2_ret_half_sandybridge:
|
|
||||||
|
|
||||||
mov r12d, ecx
|
|
||||||
mov r8d, ecx
|
|
||||||
xor r12d, 16
|
|
||||||
xor r8d, 32
|
|
||||||
xor ecx, 48
|
|
||||||
mov rax, r10
|
|
||||||
mul r9
|
|
||||||
movq xmm0, rax
|
|
||||||
movq xmm3, rdx
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
|
||||||
xor rdx, [r8+rsi]
|
|
||||||
xor rax, [r8+rsi+8]
|
|
||||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
|
||||||
paddq xmm0, xmm6
|
|
||||||
paddq xmm1, xmm11
|
|
||||||
paddq xmm3, xmm8
|
|
||||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
|
||||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
|
||||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
|
||||||
|
|
||||||
add rdi, rdx
|
|
||||||
mov QWORD PTR [r13], rdi
|
|
||||||
xor rdi, r10
|
|
||||||
mov ecx, edi
|
|
||||||
and ecx, 2097136
|
|
||||||
lea r8, QWORD PTR [rcx+rsi]
|
|
||||||
|
|
||||||
mov rdx, QWORD PTR [r13+8]
|
|
||||||
add rbp, rax
|
|
||||||
mov QWORD PTR [r13+8], rbp
|
|
||||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
|
||||||
xor rbp, rdx
|
|
||||||
mov r13, QWORD PTR [rsp]
|
|
||||||
movdqa xmm3, xmm7
|
|
||||||
mov rdx, QWORD PTR [rsp+8]
|
|
||||||
movdqa xmm8, xmm6
|
|
||||||
mov r10, QWORD PTR [rsp+256]
|
|
||||||
movdqa xmm7, xmm9
|
|
||||||
mov r11, QWORD PTR [rsp+264]
|
|
||||||
movdqa xmm6, xmm10
|
|
||||||
mov r9, r15
|
|
||||||
dec r14d
|
|
||||||
jne main_loop_double_half_sandybridge
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp+272]
|
|
||||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+184]
|
|
||||||
movaps xmm6, XMMWORD PTR [r11-24]
|
|
||||||
movaps xmm7, XMMWORD PTR [r11-40]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-56]
|
|
||||||
movaps xmm9, XMMWORD PTR [r11-72]
|
|
||||||
movaps xmm10, XMMWORD PTR [r11-88]
|
|
||||||
movaps xmm11, XMMWORD PTR [r11-104]
|
|
||||||
movaps xmm12, XMMWORD PTR [r11-120]
|
|
||||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
|
||||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbp
|
|
||||||
pop rbx
|
|
||||||
jmp cnv2_double_mainloop_asm_half_sandybridge_endp
|
|
||||||
|
|
||||||
div_fix_1_half_sandybridge:
|
|
||||||
dec rbx
|
|
||||||
add r11, rdx
|
|
||||||
jmp div_fix_1_ret_half_sandybridge
|
|
||||||
|
|
||||||
div_fix_2_half_sandybridge:
|
|
||||||
dec rdx
|
|
||||||
add r8, r9
|
|
||||||
jmp div_fix_2_ret_half_sandybridge
|
|
||||||
|
|
||||||
sqrt_fix_1_half_sandybridge:
|
|
||||||
movq r8, xmm3
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
dec r9
|
|
||||||
mov r11d, -1022
|
|
||||||
shl r11, 32
|
|
||||||
mov rax, r9
|
|
||||||
shr r9, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rdx, r9
|
|
||||||
sub rdx, rax
|
|
||||||
lea rdx, [rdx+r11+1]
|
|
||||||
add rax, r11
|
|
||||||
imul rdx, rax
|
|
||||||
sub rdx, r8
|
|
||||||
adc r9, 0
|
|
||||||
movq xmm5, r9
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
jmp sqrt_fix_1_ret_half_sandybridge
|
|
||||||
|
|
||||||
sqrt_fix_2_half_sandybridge:
|
|
||||||
psrldq xmm3, 8
|
|
||||||
movq r11, xmm3
|
|
||||||
dec r8
|
|
||||||
mov ebx, -1022
|
|
||||||
shl rbx, 32
|
|
||||||
mov rax, r8
|
|
||||||
shr r8, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rdx, r8
|
|
||||||
sub rdx, rax
|
|
||||||
lea rdx, [rdx+rbx+1]
|
|
||||||
add rax, rbx
|
|
||||||
imul rdx, rax
|
|
||||||
sub rdx, r11
|
|
||||||
adc r8, 0
|
|
||||||
movq xmm0, r8
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
jmp sqrt_fix_2_ret_half_sandybridge
|
|
||||||
|
|
||||||
cnv2_double_mainloop_asm_half_sandybridge_endp:
|
|
|
@ -1,180 +0,0 @@
|
||||||
mov QWORD PTR [rsp+16], rbx
|
|
||||||
mov QWORD PTR [rsp+24], rbp
|
|
||||||
mov QWORD PTR [rsp+32], rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 64
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov ebp, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movq xmm3, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movq xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
mov rdi, QWORD PTR [r9+104]
|
|
||||||
and r10d, 2097136
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm6
|
|
||||||
movq xmm4, rax
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm8
|
|
||||||
xorps xmm8, xmm8
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movq xmm7, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
cnv2_main_loop_half_bulldozer:
|
|
||||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
|
||||||
movq xmm6, r8
|
|
||||||
pinsrq xmm6, r11, 1
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
lea r9, QWORD PTR [rdi+rdi]
|
|
||||||
shl rdi, 32
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
aesenc xmm5, xmm6
|
|
||||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
paddq xmm0, xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movaps xmm1, xmm8
|
|
||||||
mov rsi, r15
|
|
||||||
xor rsi, rdi
|
|
||||||
|
|
||||||
mov edi, 1023
|
|
||||||
shl rdi, 52
|
|
||||||
|
|
||||||
movq r14, xmm5
|
|
||||||
pextrq rax, xmm5, 1
|
|
||||||
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
mov r10, r14
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rsi, QWORD PTR [r10+rbx]
|
|
||||||
lea r12, QWORD PTR [r10+rbx]
|
|
||||||
mov r13, QWORD PTR [r10+rbx+8]
|
|
||||||
|
|
||||||
add r9d, r14d
|
|
||||||
or r9d, -2147483647
|
|
||||||
xor edx, edx
|
|
||||||
div r9
|
|
||||||
mov eax, eax
|
|
||||||
shl rdx, 32
|
|
||||||
lea r15, [rax+rdx]
|
|
||||||
lea rax, [r14+r15]
|
|
||||||
shr rax, 12
|
|
||||||
add rax, rdi
|
|
||||||
movq xmm0, rax
|
|
||||||
sqrtsd xmm1, xmm0
|
|
||||||
movq rdi, xmm1
|
|
||||||
test rdi, 524287
|
|
||||||
je sqrt_fixup_half_bulldozer
|
|
||||||
shr rdi, 19
|
|
||||||
|
|
||||||
sqrt_fixup_half_bulldozer_ret:
|
|
||||||
mov rax, rsi
|
|
||||||
mul r14
|
|
||||||
movq xmm1, rax
|
|
||||||
movq xmm0, rdx
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
mov ecx, r10d
|
|
||||||
xor r9d, 16
|
|
||||||
xor ecx, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
pxor xmm2, xmm0
|
|
||||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movdqa xmm4, xmm3
|
|
||||||
add r8, rdx
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r12], r8
|
|
||||||
xor r8, rsi
|
|
||||||
mov QWORD PTR [r12+8], r11
|
|
||||||
mov r10, r8
|
|
||||||
xor r11, r13
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa xmm3, xmm5
|
|
||||||
dec ebp
|
|
||||||
jne cnv2_main_loop_half_bulldozer
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+64]
|
|
||||||
mov rbx, QWORD PTR [r11+56]
|
|
||||||
mov rbp, QWORD PTR [r11+64]
|
|
||||||
mov rsi, QWORD PTR [r11+72]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-48]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
jmp cnv2_main_loop_half_bulldozer_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_bulldozer:
|
|
||||||
movq r9, xmm5
|
|
||||||
add r9, r15
|
|
||||||
dec rdi
|
|
||||||
mov edx, -1022
|
|
||||||
shl rdx, 32
|
|
||||||
mov rax, rdi
|
|
||||||
shr rdi, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdi
|
|
||||||
sub rcx, rax
|
|
||||||
lea rcx, [rcx+rdx+1]
|
|
||||||
add rax, rdx
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdi, 0
|
|
||||||
jmp sqrt_fixup_half_bulldozer_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_bulldozer_endp:
|
|
|
@ -1,186 +0,0 @@
|
||||||
mov QWORD PTR [rsp+24], rbx
|
|
||||||
push rbp
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 80
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov esi, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
mov r13d, -2147483647
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movq xmm4, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movq xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
movq xmm3, QWORD PTR [r9+104]
|
|
||||||
movaps XMMWORD PTR [rsp+64], xmm6
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm8
|
|
||||||
and r10d, 2097136
|
|
||||||
movq xmm5, rax
|
|
||||||
|
|
||||||
xor eax, eax
|
|
||||||
mov QWORD PTR [rsp+16], rax
|
|
||||||
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movq xmm8, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_half_ivybridge:
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
mov rdi, r15
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movq xmm0, r11
|
|
||||||
movq xmm7, r8
|
|
||||||
punpcklqdq xmm7, xmm0
|
|
||||||
aesenc xmm6, xmm7
|
|
||||||
movq rbp, xmm6
|
|
||||||
mov r9, rbp
|
|
||||||
and r9d, 2097136
|
|
||||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm1, xmm7
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
paddq xmm2, xmm4
|
|
||||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
mov r10, r9
|
|
||||||
xor r10d, 32
|
|
||||||
movq rcx, xmm3
|
|
||||||
mov rax, rcx
|
|
||||||
shl rax, 32
|
|
||||||
xor rdi, rax
|
|
||||||
movdqa xmm0, xmm6
|
|
||||||
pxor xmm0, xmm4
|
|
||||||
movdqu XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rdi, QWORD PTR [r9+rbx]
|
|
||||||
lea r14, QWORD PTR [r9+rbx]
|
|
||||||
mov r12, QWORD PTR [r14+8]
|
|
||||||
xor edx, edx
|
|
||||||
lea r9d, DWORD PTR [ecx+ecx]
|
|
||||||
add r9d, ebp
|
|
||||||
movdqa xmm0, xmm6
|
|
||||||
psrldq xmm0, 8
|
|
||||||
or r9d, r13d
|
|
||||||
movq rax, xmm0
|
|
||||||
div r9
|
|
||||||
xorps xmm3, xmm3
|
|
||||||
mov eax, eax
|
|
||||||
shl rdx, 32
|
|
||||||
add rdx, rax
|
|
||||||
lea r9, QWORD PTR [rdx+rbp]
|
|
||||||
mov r15, rdx
|
|
||||||
mov rax, r9
|
|
||||||
shr rax, 12
|
|
||||||
movq xmm0, rax
|
|
||||||
paddq xmm0, xmm8
|
|
||||||
sqrtsd xmm3, xmm0
|
|
||||||
psubq xmm3, XMMWORD PTR [rsp+16]
|
|
||||||
movq rdx, xmm3
|
|
||||||
test edx, 524287
|
|
||||||
je sqrt_fixup_half_ivybridge
|
|
||||||
psrlq xmm3, 19
|
|
||||||
sqrt_fixup_half_ivybridge_ret:
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov rax, rdi
|
|
||||||
mul rbp
|
|
||||||
movq xmm2, rdx
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
add r8, rdx
|
|
||||||
mov QWORD PTR [r14], r8
|
|
||||||
xor r8, rdi
|
|
||||||
mov edi, r8d
|
|
||||||
and edi, 2097136
|
|
||||||
movq xmm0, rax
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r14+8], r11
|
|
||||||
punpcklqdq xmm2, xmm0
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
xor r9d, 48
|
|
||||||
xor r10d, 16
|
|
||||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
paddq xmm2, xmm4
|
|
||||||
paddq xmm1, xmm7
|
|
||||||
movdqa xmm5, xmm4
|
|
||||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
|
||||||
movdqa xmm4, xmm6
|
|
||||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
movdqu xmm6, [rdi+rbx]
|
|
||||||
mov r10d, edi
|
|
||||||
xor r11, r12
|
|
||||||
dec rsi
|
|
||||||
jne main_loop_half_ivybridge
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
mov rbx, QWORD PTR [rsp+160]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
|
||||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
|
||||||
add rsp, 80
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbp
|
|
||||||
jmp cnv2_main_loop_half_ivybridge_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_ivybridge:
|
|
||||||
dec rdx
|
|
||||||
mov r13d, -1022
|
|
||||||
shl r13, 32
|
|
||||||
mov rax, rdx
|
|
||||||
shr rdx, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdx
|
|
||||||
sub rcx, rax
|
|
||||||
add rax, r13
|
|
||||||
not r13
|
|
||||||
sub rcx, r13
|
|
||||||
mov r13d, -2147483647
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdx, 0
|
|
||||||
movq xmm3, rdx
|
|
||||||
jmp sqrt_fixup_half_ivybridge_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_ivybridge_endp:
|
|
|
@ -1,179 +0,0 @@
|
||||||
mov QWORD PTR [rsp+16], rbx
|
|
||||||
mov QWORD PTR [rsp+24], rbp
|
|
||||||
mov QWORD PTR [rsp+32], rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 64
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov ebp, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movq xmm3, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movq xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
mov rdi, QWORD PTR [r9+104]
|
|
||||||
and r10d, 2097136
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm6
|
|
||||||
movq xmm4, rax
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm8
|
|
||||||
xorps xmm8, xmm8
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movq xmm7, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movq xmm0, rcx
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_half_ryzen:
|
|
||||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
|
||||||
movq xmm0, r11
|
|
||||||
movq xmm6, r8
|
|
||||||
punpcklqdq xmm6, xmm0
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
lea r9, QWORD PTR [rdi+rdi]
|
|
||||||
shl rdi, 32
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
aesenc xmm5, xmm6
|
|
||||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
paddq xmm0, xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movaps xmm1, xmm8
|
|
||||||
mov rsi, r15
|
|
||||||
xor rsi, rdi
|
|
||||||
movq r14, xmm5
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
mov r10, r14
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rsi, QWORD PTR [r10+rbx]
|
|
||||||
lea r12, QWORD PTR [r10+rbx]
|
|
||||||
mov r13, QWORD PTR [r10+rbx+8]
|
|
||||||
|
|
||||||
add r9d, r14d
|
|
||||||
or r9d, -2147483647
|
|
||||||
xor edx, edx
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movq rax, xmm0
|
|
||||||
|
|
||||||
div r9
|
|
||||||
movq xmm0, rax
|
|
||||||
movq xmm1, rdx
|
|
||||||
punpckldq xmm0, xmm1
|
|
||||||
movq r15, xmm0
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
movdqa xmm2, xmm0
|
|
||||||
psrlq xmm0, 12
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
sqrtsd xmm1, xmm0
|
|
||||||
movq rdi, xmm1
|
|
||||||
test rdi, 524287
|
|
||||||
je sqrt_fixup_half_ryzen
|
|
||||||
shr rdi, 19
|
|
||||||
|
|
||||||
sqrt_fixup_half_ryzen_ret:
|
|
||||||
mov rax, rsi
|
|
||||||
mul r14
|
|
||||||
movq xmm1, rax
|
|
||||||
movq xmm0, rdx
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
mov ecx, r10d
|
|
||||||
xor r9d, 16
|
|
||||||
xor ecx, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
pxor xmm2, xmm0
|
|
||||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movdqa xmm4, xmm3
|
|
||||||
add r8, rdx
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r12], r8
|
|
||||||
xor r8, rsi
|
|
||||||
mov QWORD PTR [r12+8], r11
|
|
||||||
mov r10, r8
|
|
||||||
xor r11, r13
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa xmm3, xmm5
|
|
||||||
dec ebp
|
|
||||||
jne main_loop_half_ryzen
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+64]
|
|
||||||
mov rbx, QWORD PTR [r11+56]
|
|
||||||
mov rbp, QWORD PTR [r11+64]
|
|
||||||
mov rsi, QWORD PTR [r11+72]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-48]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
jmp cnv2_main_loop_half_ryzen_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_ryzen:
|
|
||||||
movq r9, xmm2
|
|
||||||
dec rdi
|
|
||||||
mov edx, -1022
|
|
||||||
shl rdx, 32
|
|
||||||
mov rax, rdi
|
|
||||||
shr rdi, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdi
|
|
||||||
sub rcx, rax
|
|
||||||
lea rcx, [rcx+rdx+1]
|
|
||||||
add rax, rdx
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdi, 0
|
|
||||||
jmp sqrt_fixup_half_ryzen_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_ryzen_endp:
|
|
|
@ -12,11 +12,6 @@
|
||||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||||
|
|
||||||
.global FN_PREFIX(cn_half_mainloop_ivybridge_asm)
|
|
||||||
.global FN_PREFIX(cn_half_mainloop_ryzen_asm)
|
|
||||||
.global FN_PREFIX(cn_half_mainloop_bulldozer_asm)
|
|
||||||
.global FN_PREFIX(cn_half_double_mainloop_sandybridge_asm)
|
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||||
sub rsp, 48
|
sub rsp, 48
|
||||||
|
@ -24,6 +19,7 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||||
#include "cn2/cnv2_main_loop_ivybridge.inc"
|
#include "cn2/cnv2_main_loop_ivybridge.inc"
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||||
|
@ -32,6 +28,7 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||||
#include "cn2/cnv2_main_loop_ryzen.inc"
|
#include "cn2/cnv2_main_loop_ryzen.inc"
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||||
|
@ -40,6 +37,7 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||||
#include "cn2/cnv2_main_loop_bulldozer.inc"
|
#include "cn2/cnv2_main_loop_bulldozer.inc"
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||||
|
@ -49,36 +47,4 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||||
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
|
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
ALIGN 16
|
|
||||||
FN_PREFIX(cn_half_mainloop_ivybridge_asm):
|
|
||||||
sub rsp, 48
|
|
||||||
mov rcx, rdi
|
|
||||||
#include "cn_half/cn_half_main_loop_ivybridge.inc"
|
|
||||||
add rsp, 48
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
FN_PREFIX(cn_half_mainloop_ryzen_asm):
|
|
||||||
sub rsp, 48
|
|
||||||
mov rcx, rdi
|
|
||||||
#include "cn_half/cn_half_main_loop_ryzen.inc"
|
|
||||||
add rsp, 48
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
FN_PREFIX(cn_half_mainloop_bulldozer_asm):
|
|
||||||
sub rsp, 48
|
|
||||||
mov rcx, rdi
|
|
||||||
#include "cn_half/cn_half_main_loop_bulldozer.inc"
|
|
||||||
add rsp, 48
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
FN_PREFIX(cn_half_double_mainloop_sandybridge_asm):
|
|
||||||
sub rsp, 48
|
|
||||||
mov rcx, rdi
|
|
||||||
mov rdx, rsi
|
|
||||||
#include "cn_half/cn_half_double_main_loop_sandybridge.inc"
|
|
||||||
add rsp, 48
|
|
||||||
ret 0
|
|
||||||
|
|
|
@ -3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm
|
||||||
PUBLIC cnv2_mainloop_ryzen_asm
|
PUBLIC cnv2_mainloop_ryzen_asm
|
||||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||||
PUBLIC cn_half_mainloop_ivybridge_asm
|
|
||||||
PUBLIC cn_half_mainloop_ryzen_asm
|
|
||||||
PUBLIC cn_half_mainloop_bulldozer_asm
|
|
||||||
PUBLIC cn_half_double_mainloop_sandybridge_asm
|
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_ivybridge_asm PROC
|
cnv2_mainloop_ivybridge_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_ivybridge_asm ENDP
|
cnv2_mainloop_ivybridge_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_ryzen_asm PROC
|
cnv2_mainloop_ryzen_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_ryzen_asm ENDP
|
cnv2_mainloop_ryzen_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_bulldozer_asm PROC
|
cnv2_mainloop_bulldozer_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_bulldozer_asm ENDP
|
cnv2_mainloop_bulldozer_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_double_mainloop_sandybridge_asm PROC
|
cnv2_double_mainloop_sandybridge_asm PROC
|
||||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_ivybridge_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_ivybridge.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_ivybridge_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_ryzen_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_ryzen.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_ryzen_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_bulldozer_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_bulldozer.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_bulldozer_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_double_mainloop_sandybridge_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_double_mainloop_sandybridge_asm ENDP
|
|
||||||
|
|
||||||
_TEXT_CNV2_MAINLOOP ENDS
|
_TEXT_CNV2_MAINLOOP ENDS
|
||||||
END
|
END
|
||||||
|
|
|
@ -1,410 +0,0 @@
|
||||||
mov rax, rsp
|
|
||||||
push rbx
|
|
||||||
push rbp
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 184
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp+272]
|
|
||||||
mov DWORD PTR [rsp+276], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+276]
|
|
||||||
|
|
||||||
mov r13, QWORD PTR [rcx+224]
|
|
||||||
mov r9, rdx
|
|
||||||
mov r10, QWORD PTR [rcx+32]
|
|
||||||
mov r8, rcx
|
|
||||||
xor r10, QWORD PTR [rcx]
|
|
||||||
mov r14d, 262144
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rsi, QWORD PTR [rdx+224]
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
mov rdi, QWORD PTR [r9+32]
|
|
||||||
xor rdi, QWORD PTR [r9]
|
|
||||||
mov rbp, QWORD PTR [r9+40]
|
|
||||||
xor rbp, QWORD PTR [r9+8]
|
|
||||||
movd xmm0, rdx
|
|
||||||
movaps XMMWORD PTR [rax-88], xmm6
|
|
||||||
movaps XMMWORD PTR [rax-104], xmm7
|
|
||||||
movaps XMMWORD PTR [rax-120], xmm8
|
|
||||||
movaps XMMWORD PTR [rsp+112], xmm9
|
|
||||||
movaps XMMWORD PTR [rsp+96], xmm10
|
|
||||||
movaps XMMWORD PTR [rsp+80], xmm11
|
|
||||||
movaps XMMWORD PTR [rsp+64], xmm12
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm13
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm14
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm15
|
|
||||||
mov rdx, r10
|
|
||||||
movd xmm4, QWORD PTR [r8+96]
|
|
||||||
and edx, 2097136
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
xorps xmm13, xmm13
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r8+72]
|
|
||||||
movd xmm5, QWORD PTR [r8+104]
|
|
||||||
movd xmm7, rax
|
|
||||||
|
|
||||||
mov eax, 1
|
|
||||||
shl rax, 52
|
|
||||||
movd xmm14, rax
|
|
||||||
punpcklqdq xmm14, xmm14
|
|
||||||
|
|
||||||
mov eax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movd xmm12, rax
|
|
||||||
punpcklqdq xmm12, xmm12
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [r8+80]
|
|
||||||
xor rax, QWORD PTR [r8+64]
|
|
||||||
punpcklqdq xmm7, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
mov rcx, QWORD PTR [r9+56]
|
|
||||||
xor rcx, QWORD PTR [r9+24]
|
|
||||||
movd xmm3, rax
|
|
||||||
mov rax, QWORD PTR [r9+48]
|
|
||||||
xor rax, QWORD PTR [r9+16]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
mov QWORD PTR [rsp], r13
|
|
||||||
mov rcx, QWORD PTR [r9+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
movd xmm6, rax
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
punpcklqdq xmm6, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
mov QWORD PTR [rsp+256], r10
|
|
||||||
mov rcx, rdi
|
|
||||||
mov QWORD PTR [rsp+264], r11
|
|
||||||
movd xmm8, rax
|
|
||||||
and ecx, 2097136
|
|
||||||
punpcklqdq xmm8, xmm0
|
|
||||||
movd xmm0, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
movd xmm0, QWORD PTR [r9+104]
|
|
||||||
lea r8, QWORD PTR [rcx+rsi]
|
|
||||||
movdqu xmm11, XMMWORD PTR [r8]
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
lea r9, QWORD PTR [rdx+r13]
|
|
||||||
movdqu xmm15, XMMWORD PTR [r9]
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_double_half_sandybridge:
|
|
||||||
movdqu xmm9, xmm15
|
|
||||||
mov eax, edx
|
|
||||||
mov ebx, edx
|
|
||||||
xor eax, 16
|
|
||||||
xor ebx, 32
|
|
||||||
xor edx, 48
|
|
||||||
|
|
||||||
movd xmm0, r11
|
|
||||||
movd xmm2, r10
|
|
||||||
punpcklqdq xmm2, xmm0
|
|
||||||
aesenc xmm9, xmm2
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
|
||||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
paddq xmm1, xmm2
|
|
||||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
|
||||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
|
||||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
|
||||||
paddq xmm0, xmm3
|
|
||||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
|
||||||
|
|
||||||
movd r11, xmm9
|
|
||||||
mov edx, r11d
|
|
||||||
and edx, 2097136
|
|
||||||
movdqa xmm0, xmm9
|
|
||||||
pxor xmm0, xmm7
|
|
||||||
movdqu XMMWORD PTR [r9], xmm0
|
|
||||||
|
|
||||||
lea rbx, QWORD PTR [rdx+r13]
|
|
||||||
mov r10, QWORD PTR [rdx+r13]
|
|
||||||
|
|
||||||
movdqu xmm10, xmm11
|
|
||||||
movd xmm0, rbp
|
|
||||||
movd xmm11, rdi
|
|
||||||
punpcklqdq xmm11, xmm0
|
|
||||||
aesenc xmm10, xmm11
|
|
||||||
|
|
||||||
mov eax, ecx
|
|
||||||
mov r12d, ecx
|
|
||||||
xor eax, 16
|
|
||||||
xor r12d, 32
|
|
||||||
xor ecx, 48
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
|
||||||
paddq xmm0, xmm6
|
|
||||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
|
||||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
|
||||||
paddq xmm1, xmm11
|
|
||||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
|
||||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
|
||||||
paddq xmm0, xmm8
|
|
||||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
|
||||||
|
|
||||||
movd rcx, xmm10
|
|
||||||
and ecx, 2097136
|
|
||||||
|
|
||||||
movdqa xmm0, xmm10
|
|
||||||
pxor xmm0, xmm6
|
|
||||||
movdqu XMMWORD PTR [r8], xmm0
|
|
||||||
mov r12, QWORD PTR [rcx+rsi]
|
|
||||||
|
|
||||||
mov r9, QWORD PTR [rbx+8]
|
|
||||||
|
|
||||||
xor edx, 16
|
|
||||||
mov r8d, edx
|
|
||||||
mov r15d, edx
|
|
||||||
|
|
||||||
movd rdx, xmm5
|
|
||||||
shl rdx, 32
|
|
||||||
movd rax, xmm4
|
|
||||||
xor rdx, rax
|
|
||||||
xor r10, rdx
|
|
||||||
mov rax, r10
|
|
||||||
mul r11
|
|
||||||
mov r11d, r8d
|
|
||||||
xor r11d, 48
|
|
||||||
movd xmm0, rdx
|
|
||||||
xor rdx, [r11+r13]
|
|
||||||
movd xmm1, rax
|
|
||||||
xor rax, [r11+r13+8]
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
|
||||||
xor r8d, 32
|
|
||||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
paddq xmm1, xmm2
|
|
||||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
|
||||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
|
||||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
|
||||||
paddq xmm0, xmm3
|
|
||||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
|
||||||
|
|
||||||
mov r11, QWORD PTR [rsp+256]
|
|
||||||
add r11, rdx
|
|
||||||
mov rdx, QWORD PTR [rsp+264]
|
|
||||||
add rdx, rax
|
|
||||||
mov QWORD PTR [rbx], r11
|
|
||||||
xor r11, r10
|
|
||||||
mov QWORD PTR [rbx+8], rdx
|
|
||||||
xor rdx, r9
|
|
||||||
mov QWORD PTR [rsp+256], r11
|
|
||||||
and r11d, 2097136
|
|
||||||
mov QWORD PTR [rsp+264], rdx
|
|
||||||
mov QWORD PTR [rsp+8], r11
|
|
||||||
lea r15, QWORD PTR [r11+r13]
|
|
||||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
|
||||||
lea r13, QWORD PTR [rsi+rcx]
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movaps xmm2, xmm13
|
|
||||||
movd r10, xmm0
|
|
||||||
psllq xmm5, 1
|
|
||||||
shl r10, 32
|
|
||||||
movdqa xmm0, xmm9
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movdqa xmm1, xmm10
|
|
||||||
movd r11, xmm0
|
|
||||||
psrldq xmm1, 8
|
|
||||||
movd r8, xmm1
|
|
||||||
psrldq xmm4, 8
|
|
||||||
movaps xmm0, xmm13
|
|
||||||
movd rax, xmm4
|
|
||||||
xor r10, rax
|
|
||||||
movaps xmm1, xmm13
|
|
||||||
xor r10, r12
|
|
||||||
lea rax, QWORD PTR [r11+1]
|
|
||||||
shr rax, 1
|
|
||||||
movdqa xmm3, xmm9
|
|
||||||
punpcklqdq xmm3, xmm10
|
|
||||||
paddq xmm5, xmm3
|
|
||||||
movd rdx, xmm5
|
|
||||||
psrldq xmm5, 8
|
|
||||||
cvtsi2sd xmm2, rax
|
|
||||||
or edx, -2147483647
|
|
||||||
lea rax, QWORD PTR [r8+1]
|
|
||||||
shr rax, 1
|
|
||||||
movd r9, xmm5
|
|
||||||
cvtsi2sd xmm0, rax
|
|
||||||
or r9d, -2147483647
|
|
||||||
cvtsi2sd xmm1, rdx
|
|
||||||
unpcklpd xmm2, xmm0
|
|
||||||
movaps xmm0, xmm13
|
|
||||||
cvtsi2sd xmm0, r9
|
|
||||||
unpcklpd xmm1, xmm0
|
|
||||||
divpd xmm2, xmm1
|
|
||||||
paddq xmm2, xmm14
|
|
||||||
cvttsd2si rax, xmm2
|
|
||||||
psrldq xmm2, 8
|
|
||||||
mov rbx, rax
|
|
||||||
imul rax, rdx
|
|
||||||
sub r11, rax
|
|
||||||
js div_fix_1_half_sandybridge
|
|
||||||
div_fix_1_ret_half_sandybridge:
|
|
||||||
|
|
||||||
cvttsd2si rdx, xmm2
|
|
||||||
mov rax, rdx
|
|
||||||
imul rax, r9
|
|
||||||
movd xmm2, r11d
|
|
||||||
movd xmm4, ebx
|
|
||||||
sub r8, rax
|
|
||||||
js div_fix_2_half_sandybridge
|
|
||||||
div_fix_2_ret_half_sandybridge:
|
|
||||||
|
|
||||||
movd xmm1, r8d
|
|
||||||
movd xmm0, edx
|
|
||||||
punpckldq xmm2, xmm1
|
|
||||||
punpckldq xmm4, xmm0
|
|
||||||
punpckldq xmm4, xmm2
|
|
||||||
paddq xmm3, xmm4
|
|
||||||
movdqa xmm0, xmm3
|
|
||||||
psrlq xmm0, 12
|
|
||||||
paddq xmm0, xmm12
|
|
||||||
sqrtpd xmm1, xmm0
|
|
||||||
movd r9, xmm1
|
|
||||||
movdqa xmm5, xmm1
|
|
||||||
psrlq xmm5, 19
|
|
||||||
test r9, 524287
|
|
||||||
je sqrt_fix_1_half_sandybridge
|
|
||||||
sqrt_fix_1_ret_half_sandybridge:
|
|
||||||
|
|
||||||
movd r9, xmm10
|
|
||||||
psrldq xmm1, 8
|
|
||||||
movd r8, xmm1
|
|
||||||
test r8, 524287
|
|
||||||
je sqrt_fix_2_half_sandybridge
|
|
||||||
sqrt_fix_2_ret_half_sandybridge:
|
|
||||||
|
|
||||||
mov r12d, ecx
|
|
||||||
mov r8d, ecx
|
|
||||||
xor r12d, 16
|
|
||||||
xor r8d, 32
|
|
||||||
xor ecx, 48
|
|
||||||
mov rax, r10
|
|
||||||
mul r9
|
|
||||||
movd xmm0, rax
|
|
||||||
movd xmm3, rdx
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
|
|
||||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
|
||||||
xor rdx, [r8+rsi]
|
|
||||||
xor rax, [r8+rsi+8]
|
|
||||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
|
||||||
paddq xmm0, xmm6
|
|
||||||
paddq xmm1, xmm11
|
|
||||||
paddq xmm3, xmm8
|
|
||||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
|
||||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
|
||||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
|
||||||
|
|
||||||
add rdi, rdx
|
|
||||||
mov QWORD PTR [r13], rdi
|
|
||||||
xor rdi, r10
|
|
||||||
mov ecx, edi
|
|
||||||
and ecx, 2097136
|
|
||||||
lea r8, QWORD PTR [rcx+rsi]
|
|
||||||
|
|
||||||
mov rdx, QWORD PTR [r13+8]
|
|
||||||
add rbp, rax
|
|
||||||
mov QWORD PTR [r13+8], rbp
|
|
||||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
|
||||||
xor rbp, rdx
|
|
||||||
mov r13, QWORD PTR [rsp]
|
|
||||||
movdqa xmm3, xmm7
|
|
||||||
mov rdx, QWORD PTR [rsp+8]
|
|
||||||
movdqa xmm8, xmm6
|
|
||||||
mov r10, QWORD PTR [rsp+256]
|
|
||||||
movdqa xmm7, xmm9
|
|
||||||
mov r11, QWORD PTR [rsp+264]
|
|
||||||
movdqa xmm6, xmm10
|
|
||||||
mov r9, r15
|
|
||||||
dec r14d
|
|
||||||
jne main_loop_double_half_sandybridge
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp+272]
|
|
||||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+184]
|
|
||||||
movaps xmm6, XMMWORD PTR [r11-24]
|
|
||||||
movaps xmm7, XMMWORD PTR [r11-40]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-56]
|
|
||||||
movaps xmm9, XMMWORD PTR [r11-72]
|
|
||||||
movaps xmm10, XMMWORD PTR [r11-88]
|
|
||||||
movaps xmm11, XMMWORD PTR [r11-104]
|
|
||||||
movaps xmm12, XMMWORD PTR [r11-120]
|
|
||||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
|
||||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbp
|
|
||||||
pop rbx
|
|
||||||
jmp cnv2_double_mainloop_asm_half_sandybridge_endp
|
|
||||||
|
|
||||||
div_fix_1_half_sandybridge:
|
|
||||||
dec rbx
|
|
||||||
add r11, rdx
|
|
||||||
jmp div_fix_1_ret_half_sandybridge
|
|
||||||
|
|
||||||
div_fix_2_half_sandybridge:
|
|
||||||
dec rdx
|
|
||||||
add r8, r9
|
|
||||||
jmp div_fix_2_ret_half_sandybridge
|
|
||||||
|
|
||||||
sqrt_fix_1_half_sandybridge:
|
|
||||||
movd r8, xmm3
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
dec r9
|
|
||||||
mov r11d, -1022
|
|
||||||
shl r11, 32
|
|
||||||
mov rax, r9
|
|
||||||
shr r9, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rdx, r9
|
|
||||||
sub rdx, rax
|
|
||||||
lea rdx, [rdx+r11+1]
|
|
||||||
add rax, r11
|
|
||||||
imul rdx, rax
|
|
||||||
sub rdx, r8
|
|
||||||
adc r9, 0
|
|
||||||
movd xmm5, r9
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
jmp sqrt_fix_1_ret_half_sandybridge
|
|
||||||
|
|
||||||
sqrt_fix_2_half_sandybridge:
|
|
||||||
psrldq xmm3, 8
|
|
||||||
movd r11, xmm3
|
|
||||||
dec r8
|
|
||||||
mov ebx, -1022
|
|
||||||
shl rbx, 32
|
|
||||||
mov rax, r8
|
|
||||||
shr r8, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rdx, r8
|
|
||||||
sub rdx, rax
|
|
||||||
lea rdx, [rdx+rbx+1]
|
|
||||||
add rax, rbx
|
|
||||||
imul rdx, rax
|
|
||||||
sub rdx, r11
|
|
||||||
adc r8, 0
|
|
||||||
movd xmm0, r8
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
jmp sqrt_fix_2_ret_half_sandybridge
|
|
||||||
|
|
||||||
cnv2_double_mainloop_asm_half_sandybridge_endp:
|
|
|
@ -1,180 +0,0 @@
|
||||||
mov QWORD PTR [rsp+16], rbx
|
|
||||||
mov QWORD PTR [rsp+24], rbp
|
|
||||||
mov QWORD PTR [rsp+32], rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 64
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov ebp, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movd xmm3, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movd xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
mov rdi, QWORD PTR [r9+104]
|
|
||||||
and r10d, 2097136
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm6
|
|
||||||
movd xmm4, rax
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm8
|
|
||||||
xorps xmm8, xmm8
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movd xmm7, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
cnv2_main_loop_half_bulldozer:
|
|
||||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
|
||||||
movd xmm6, r8
|
|
||||||
pinsrq xmm6, r11, 1
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
lea r9, QWORD PTR [rdi+rdi]
|
|
||||||
shl rdi, 32
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
aesenc xmm5, xmm6
|
|
||||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
paddq xmm0, xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movaps xmm1, xmm8
|
|
||||||
mov rsi, r15
|
|
||||||
xor rsi, rdi
|
|
||||||
|
|
||||||
mov edi, 1023
|
|
||||||
shl rdi, 52
|
|
||||||
|
|
||||||
movd r14, xmm5
|
|
||||||
pextrq rax, xmm5, 1
|
|
||||||
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
mov r10, r14
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rsi, QWORD PTR [r10+rbx]
|
|
||||||
lea r12, QWORD PTR [r10+rbx]
|
|
||||||
mov r13, QWORD PTR [r10+rbx+8]
|
|
||||||
|
|
||||||
add r9d, r14d
|
|
||||||
or r9d, -2147483647
|
|
||||||
xor edx, edx
|
|
||||||
div r9
|
|
||||||
mov eax, eax
|
|
||||||
shl rdx, 32
|
|
||||||
lea r15, [rax+rdx]
|
|
||||||
lea rax, [r14+r15]
|
|
||||||
shr rax, 12
|
|
||||||
add rax, rdi
|
|
||||||
movd xmm0, rax
|
|
||||||
sqrtsd xmm1, xmm0
|
|
||||||
movd rdi, xmm1
|
|
||||||
test rdi, 524287
|
|
||||||
je sqrt_fixup_half_bulldozer
|
|
||||||
shr rdi, 19
|
|
||||||
|
|
||||||
sqrt_fixup_half_bulldozer_ret:
|
|
||||||
mov rax, rsi
|
|
||||||
mul r14
|
|
||||||
movd xmm1, rax
|
|
||||||
movd xmm0, rdx
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
mov ecx, r10d
|
|
||||||
xor r9d, 16
|
|
||||||
xor ecx, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
pxor xmm2, xmm0
|
|
||||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movdqa xmm4, xmm3
|
|
||||||
add r8, rdx
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r12], r8
|
|
||||||
xor r8, rsi
|
|
||||||
mov QWORD PTR [r12+8], r11
|
|
||||||
mov r10, r8
|
|
||||||
xor r11, r13
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa xmm3, xmm5
|
|
||||||
dec ebp
|
|
||||||
jne cnv2_main_loop_half_bulldozer
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+64]
|
|
||||||
mov rbx, QWORD PTR [r11+56]
|
|
||||||
mov rbp, QWORD PTR [r11+64]
|
|
||||||
mov rsi, QWORD PTR [r11+72]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-48]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
jmp cnv2_main_loop_half_bulldozer_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_bulldozer:
|
|
||||||
movd r9, xmm5
|
|
||||||
add r9, r15
|
|
||||||
dec rdi
|
|
||||||
mov edx, -1022
|
|
||||||
shl rdx, 32
|
|
||||||
mov rax, rdi
|
|
||||||
shr rdi, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdi
|
|
||||||
sub rcx, rax
|
|
||||||
lea rcx, [rcx+rdx+1]
|
|
||||||
add rax, rdx
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdi, 0
|
|
||||||
jmp sqrt_fixup_half_bulldozer_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_bulldozer_endp:
|
|
|
@ -1,186 +0,0 @@
|
||||||
mov QWORD PTR [rsp+24], rbx
|
|
||||||
push rbp
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 80
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov esi, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
mov r13d, -2147483647
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movd xmm4, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movd xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
movd xmm3, QWORD PTR [r9+104]
|
|
||||||
movaps XMMWORD PTR [rsp+64], xmm6
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm8
|
|
||||||
and r10d, 2097136
|
|
||||||
movd xmm5, rax
|
|
||||||
|
|
||||||
xor eax, eax
|
|
||||||
mov QWORD PTR [rsp+16], rax
|
|
||||||
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movd xmm8, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
punpcklqdq xmm5, xmm0
|
|
||||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_half_ivybridge:
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
mov rdi, r15
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movd xmm0, r11
|
|
||||||
movd xmm7, r8
|
|
||||||
punpcklqdq xmm7, xmm0
|
|
||||||
aesenc xmm6, xmm7
|
|
||||||
movd rbp, xmm6
|
|
||||||
mov r9, rbp
|
|
||||||
and r9d, 2097136
|
|
||||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm1, xmm7
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
paddq xmm2, xmm4
|
|
||||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
mov r10, r9
|
|
||||||
xor r10d, 32
|
|
||||||
movd rcx, xmm3
|
|
||||||
mov rax, rcx
|
|
||||||
shl rax, 32
|
|
||||||
xor rdi, rax
|
|
||||||
movdqa xmm0, xmm6
|
|
||||||
pxor xmm0, xmm4
|
|
||||||
movdqu XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rdi, QWORD PTR [r9+rbx]
|
|
||||||
lea r14, QWORD PTR [r9+rbx]
|
|
||||||
mov r12, QWORD PTR [r14+8]
|
|
||||||
xor edx, edx
|
|
||||||
lea r9d, DWORD PTR [ecx+ecx]
|
|
||||||
add r9d, ebp
|
|
||||||
movdqa xmm0, xmm6
|
|
||||||
psrldq xmm0, 8
|
|
||||||
or r9d, r13d
|
|
||||||
movd rax, xmm0
|
|
||||||
div r9
|
|
||||||
xorps xmm3, xmm3
|
|
||||||
mov eax, eax
|
|
||||||
shl rdx, 32
|
|
||||||
add rdx, rax
|
|
||||||
lea r9, QWORD PTR [rdx+rbp]
|
|
||||||
mov r15, rdx
|
|
||||||
mov rax, r9
|
|
||||||
shr rax, 12
|
|
||||||
movd xmm0, rax
|
|
||||||
paddq xmm0, xmm8
|
|
||||||
sqrtsd xmm3, xmm0
|
|
||||||
psubq xmm3, XMMWORD PTR [rsp+16]
|
|
||||||
movd rdx, xmm3
|
|
||||||
test edx, 524287
|
|
||||||
je sqrt_fixup_half_ivybridge
|
|
||||||
psrlq xmm3, 19
|
|
||||||
sqrt_fixup_half_ivybridge_ret:
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov rax, rdi
|
|
||||||
mul rbp
|
|
||||||
movd xmm2, rdx
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
add r8, rdx
|
|
||||||
mov QWORD PTR [r14], r8
|
|
||||||
xor r8, rdi
|
|
||||||
mov edi, r8d
|
|
||||||
and edi, 2097136
|
|
||||||
movd xmm0, rax
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r14+8], r11
|
|
||||||
punpcklqdq xmm2, xmm0
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
xor r9d, 48
|
|
||||||
xor r10d, 16
|
|
||||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
paddq xmm2, xmm4
|
|
||||||
paddq xmm1, xmm7
|
|
||||||
movdqa xmm5, xmm4
|
|
||||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
|
||||||
movdqa xmm4, xmm6
|
|
||||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
movdqu xmm6, [rdi+rbx]
|
|
||||||
mov r10d, edi
|
|
||||||
xor r11, r12
|
|
||||||
dec rsi
|
|
||||||
jne main_loop_half_ivybridge
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
mov rbx, QWORD PTR [rsp+160]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
|
||||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
|
||||||
add rsp, 80
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbp
|
|
||||||
jmp cnv2_main_loop_half_ivybridge_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_ivybridge:
|
|
||||||
dec rdx
|
|
||||||
mov r13d, -1022
|
|
||||||
shl r13, 32
|
|
||||||
mov rax, rdx
|
|
||||||
shr rdx, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdx
|
|
||||||
sub rcx, rax
|
|
||||||
add rax, r13
|
|
||||||
not r13
|
|
||||||
sub rcx, r13
|
|
||||||
mov r13d, -2147483647
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdx, 0
|
|
||||||
movd xmm3, rdx
|
|
||||||
jmp sqrt_fixup_half_ivybridge_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_ivybridge_endp:
|
|
|
@ -1,179 +0,0 @@
|
||||||
mov QWORD PTR [rsp+16], rbx
|
|
||||||
mov QWORD PTR [rsp+24], rbp
|
|
||||||
mov QWORD PTR [rsp+32], rsi
|
|
||||||
push rdi
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
sub rsp, 64
|
|
||||||
|
|
||||||
stmxcsr DWORD PTR [rsp]
|
|
||||||
mov DWORD PTR [rsp+4], 24448
|
|
||||||
ldmxcsr DWORD PTR [rsp+4]
|
|
||||||
|
|
||||||
mov rax, QWORD PTR [rcx+48]
|
|
||||||
mov r9, rcx
|
|
||||||
xor rax, QWORD PTR [rcx+16]
|
|
||||||
mov ebp, 262144
|
|
||||||
mov r8, QWORD PTR [rcx+32]
|
|
||||||
xor r8, QWORD PTR [rcx]
|
|
||||||
mov r11, QWORD PTR [rcx+40]
|
|
||||||
mov r10, r8
|
|
||||||
mov rdx, QWORD PTR [rcx+56]
|
|
||||||
movd xmm3, rax
|
|
||||||
xor rdx, QWORD PTR [rcx+24]
|
|
||||||
xor r11, QWORD PTR [rcx+8]
|
|
||||||
mov rbx, QWORD PTR [rcx+224]
|
|
||||||
mov rax, QWORD PTR [r9+80]
|
|
||||||
xor rax, QWORD PTR [r9+64]
|
|
||||||
movd xmm0, rdx
|
|
||||||
mov rcx, QWORD PTR [rcx+88]
|
|
||||||
xor rcx, QWORD PTR [r9+72]
|
|
||||||
mov rdi, QWORD PTR [r9+104]
|
|
||||||
and r10d, 2097136
|
|
||||||
movaps XMMWORD PTR [rsp+48], xmm6
|
|
||||||
movd xmm4, rax
|
|
||||||
movaps XMMWORD PTR [rsp+32], xmm7
|
|
||||||
movaps XMMWORD PTR [rsp+16], xmm8
|
|
||||||
xorps xmm8, xmm8
|
|
||||||
mov ax, 1023
|
|
||||||
shl rax, 52
|
|
||||||
movd xmm7, rax
|
|
||||||
mov r15, QWORD PTR [r9+96]
|
|
||||||
punpcklqdq xmm3, xmm0
|
|
||||||
movd xmm0, rcx
|
|
||||||
punpcklqdq xmm4, xmm0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
main_loop_half_ryzen:
|
|
||||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
|
||||||
movd xmm0, r11
|
|
||||||
movd xmm6, r8
|
|
||||||
punpcklqdq xmm6, xmm0
|
|
||||||
lea rdx, QWORD PTR [r10+rbx]
|
|
||||||
lea r9, QWORD PTR [rdi+rdi]
|
|
||||||
shl rdi, 32
|
|
||||||
|
|
||||||
mov ecx, r10d
|
|
||||||
mov eax, r10d
|
|
||||||
xor ecx, 16
|
|
||||||
xor eax, 32
|
|
||||||
xor r10d, 48
|
|
||||||
aesenc xmm5, xmm6
|
|
||||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
|
||||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
|
||||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
paddq xmm0, xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
|
||||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movaps xmm1, xmm8
|
|
||||||
mov rsi, r15
|
|
||||||
xor rsi, rdi
|
|
||||||
movd r14, xmm5
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
pxor xmm0, xmm3
|
|
||||||
mov r10, r14
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa XMMWORD PTR [rdx], xmm0
|
|
||||||
xor rsi, QWORD PTR [r10+rbx]
|
|
||||||
lea r12, QWORD PTR [r10+rbx]
|
|
||||||
mov r13, QWORD PTR [r10+rbx+8]
|
|
||||||
|
|
||||||
add r9d, r14d
|
|
||||||
or r9d, -2147483647
|
|
||||||
xor edx, edx
|
|
||||||
movdqa xmm0, xmm5
|
|
||||||
psrldq xmm0, 8
|
|
||||||
movd rax, xmm0
|
|
||||||
|
|
||||||
div r9
|
|
||||||
movd xmm0, rax
|
|
||||||
movd xmm1, rdx
|
|
||||||
punpckldq xmm0, xmm1
|
|
||||||
movd r15, xmm0
|
|
||||||
paddq xmm0, xmm5
|
|
||||||
movdqa xmm2, xmm0
|
|
||||||
psrlq xmm0, 12
|
|
||||||
paddq xmm0, xmm7
|
|
||||||
sqrtsd xmm1, xmm0
|
|
||||||
movd rdi, xmm1
|
|
||||||
test rdi, 524287
|
|
||||||
je sqrt_fixup_half_ryzen
|
|
||||||
shr rdi, 19
|
|
||||||
|
|
||||||
sqrt_fixup_half_ryzen_ret:
|
|
||||||
mov rax, rsi
|
|
||||||
mul r14
|
|
||||||
movd xmm1, rax
|
|
||||||
movd xmm0, rdx
|
|
||||||
punpcklqdq xmm0, xmm1
|
|
||||||
|
|
||||||
mov r9d, r10d
|
|
||||||
mov ecx, r10d
|
|
||||||
xor r9d, 16
|
|
||||||
xor ecx, 32
|
|
||||||
xor r10d, 48
|
|
||||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
|
||||||
xor rdx, [rcx+rbx]
|
|
||||||
xor rax, [rcx+rbx+8]
|
|
||||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
|
||||||
pxor xmm2, xmm0
|
|
||||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
|
||||||
paddq xmm2, xmm3
|
|
||||||
paddq xmm1, xmm6
|
|
||||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
|
||||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
|
||||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
|
||||||
|
|
||||||
movdqa xmm4, xmm3
|
|
||||||
add r8, rdx
|
|
||||||
add r11, rax
|
|
||||||
mov QWORD PTR [r12], r8
|
|
||||||
xor r8, rsi
|
|
||||||
mov QWORD PTR [r12+8], r11
|
|
||||||
mov r10, r8
|
|
||||||
xor r11, r13
|
|
||||||
and r10d, 2097136
|
|
||||||
movdqa xmm3, xmm5
|
|
||||||
dec ebp
|
|
||||||
jne main_loop_half_ryzen
|
|
||||||
|
|
||||||
ldmxcsr DWORD PTR [rsp]
|
|
||||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
|
||||||
lea r11, QWORD PTR [rsp+64]
|
|
||||||
mov rbx, QWORD PTR [r11+56]
|
|
||||||
mov rbp, QWORD PTR [r11+64]
|
|
||||||
mov rsi, QWORD PTR [r11+72]
|
|
||||||
movaps xmm8, XMMWORD PTR [r11-48]
|
|
||||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
|
||||||
mov rsp, r11
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
pop rdi
|
|
||||||
jmp cnv2_main_loop_half_ryzen_endp
|
|
||||||
|
|
||||||
sqrt_fixup_half_ryzen:
|
|
||||||
movd r9, xmm2
|
|
||||||
dec rdi
|
|
||||||
mov edx, -1022
|
|
||||||
shl rdx, 32
|
|
||||||
mov rax, rdi
|
|
||||||
shr rdi, 19
|
|
||||||
shr rax, 20
|
|
||||||
mov rcx, rdi
|
|
||||||
sub rcx, rax
|
|
||||||
lea rcx, [rcx+rdx+1]
|
|
||||||
add rax, rdx
|
|
||||||
imul rcx, rax
|
|
||||||
sub rcx, r9
|
|
||||||
adc rdi, 0
|
|
||||||
jmp sqrt_fixup_half_ryzen_ret
|
|
||||||
|
|
||||||
cnv2_main_loop_half_ryzen_endp:
|
|
|
@ -6,47 +6,26 @@
|
||||||
.global cnv2_mainloop_bulldozer_asm
|
.global cnv2_mainloop_bulldozer_asm
|
||||||
.global cnv2_double_mainloop_sandybridge_asm
|
.global cnv2_double_mainloop_sandybridge_asm
|
||||||
|
|
||||||
.global cn_half_mainloop_ivybridge_asm
|
|
||||||
.global cn_half_mainloop_ryzen_asm
|
|
||||||
.global cn_half_mainloop_bulldozer_asm
|
|
||||||
.global cn_half_double_mainloop_sandybridge_asm
|
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_mainloop_ivybridge_asm:
|
cnv2_mainloop_ivybridge_asm:
|
||||||
#include "../cn2/cnv2_main_loop_ivybridge.inc"
|
#include "../cn2/cnv2_main_loop_ivybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_mainloop_ryzen_asm:
|
cnv2_mainloop_ryzen_asm:
|
||||||
#include "../cn2/cnv2_main_loop_ryzen.inc"
|
#include "../cn2/cnv2_main_loop_ryzen.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_mainloop_bulldozer_asm:
|
cnv2_mainloop_bulldozer_asm:
|
||||||
#include "../cn2/cnv2_main_loop_bulldozer.inc"
|
#include "../cn2/cnv2_main_loop_bulldozer.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
|
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
cnv2_double_mainloop_sandybridge_asm:
|
cnv2_double_mainloop_sandybridge_asm:
|
||||||
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
|
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
ALIGN 16
|
|
||||||
cn_half_mainloop_ivybridge_asm:
|
|
||||||
#include "../cn_half/cn_half_main_loop_ivybridge.inc"
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
cn_half_mainloop_ryzen_asm:
|
|
||||||
#include "../cn_half/cn_half_main_loop_ryzen.inc"
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
cn_half_mainloop_bulldozer_asm:
|
|
||||||
#include "../cn_half/cn_half_main_loop_bulldozer.inc"
|
|
||||||
ret 0
|
|
||||||
|
|
||||||
ALIGN 16
|
|
||||||
cn_half_double_mainloop_sandybridge_asm:
|
|
||||||
#include "../cn_half/cn_half_double_main_loop_sandybridge.inc"
|
|
||||||
ret 0
|
|
||||||
|
|
|
@ -3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm
|
||||||
PUBLIC cnv2_mainloop_ryzen_asm
|
PUBLIC cnv2_mainloop_ryzen_asm
|
||||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||||
PUBLIC cn_half_mainloop_ivybridge_asm
|
|
||||||
PUBLIC cn_half_mainloop_ryzen_asm
|
|
||||||
PUBLIC cn_half_mainloop_bulldozer_asm
|
|
||||||
PUBLIC cn_half_double_mainloop_sandybridge_asm
|
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_ivybridge_asm PROC
|
cnv2_mainloop_ivybridge_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_ivybridge_asm ENDP
|
cnv2_mainloop_ivybridge_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_ryzen_asm PROC
|
cnv2_mainloop_ryzen_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_ryzen_asm ENDP
|
cnv2_mainloop_ryzen_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_mainloop_bulldozer_asm PROC
|
cnv2_mainloop_bulldozer_asm PROC
|
||||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_mainloop_bulldozer_asm ENDP
|
cnv2_mainloop_bulldozer_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv2_double_mainloop_sandybridge_asm PROC
|
cnv2_double_mainloop_sandybridge_asm PROC
|
||||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||||
ret 0
|
ret 0
|
||||||
|
nop;nop;nop;nop;
|
||||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_ivybridge_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_ivybridge.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_ivybridge_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_ryzen_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_ryzen.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_ryzen_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_mainloop_bulldozer_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_main_loop_bulldozer.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_mainloop_bulldozer_asm ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
cn_half_double_mainloop_sandybridge_asm PROC
|
|
||||||
INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc
|
|
||||||
ret 0
|
|
||||||
cn_half_double_mainloop_sandybridge_asm ENDP
|
|
||||||
|
|
||||||
_TEXT_CNV2_MAINLOOP ENDS
|
_TEXT_CNV2_MAINLOOP ENDS
|
||||||
END
|
END
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
#include "crypto/Asm.h"
|
#include "crypto/Asm.h"
|
||||||
#include "rapidjson/document.h"
|
#include "rapidjson/document.h"
|
||||||
#include "workers/CpuThread.h"
|
#include "workers/CpuThread.h"
|
||||||
|
#include "Mem.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(XMRIG_ARM)
|
#if defined(XMRIG_ARM)
|
||||||
|
@ -54,6 +55,61 @@ xmrig::CpuThread::CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiw
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef XMRIG_NO_ASM
|
||||||
|
template<typename T, typename U>
|
||||||
|
static void patchCode(T& dst, U src, const uint32_t iterations, const uint32_t mask)
|
||||||
|
{
|
||||||
|
const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
|
||||||
|
|
||||||
|
size_t size = 0;
|
||||||
|
while (*(uint32_t*)(p + size) != 0x90909090) {
|
||||||
|
++size;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy((void*) dst, (const void*) src, size);
|
||||||
|
|
||||||
|
uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
|
||||||
|
for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) {
|
||||||
|
switch (*(uint32_t*)(patched_data + i)) {
|
||||||
|
case xmrig::CRYPTONIGHT_ITER:
|
||||||
|
*(uint32_t*)(patched_data + i) = iterations;
|
||||||
|
break;
|
||||||
|
case xmrig::CRYPTONIGHT_MASK:
|
||||||
|
*(uint32_t*)(patched_data + i) = mask;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx);
|
||||||
|
extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx);
|
||||||
|
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx);
|
||||||
|
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
|
||||||
|
|
||||||
|
xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm = nullptr;
|
||||||
|
xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm = nullptr;
|
||||||
|
xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm = nullptr;
|
||||||
|
xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm = nullptr;
|
||||||
|
|
||||||
|
void xmrig::CpuThread::patchAsmVariants()
|
||||||
|
{
|
||||||
|
const int allocation_size = 65536;
|
||||||
|
uint8_t* base = reinterpret_cast<uint8_t*>(Mem::allocate_executable_memory(allocation_size));
|
||||||
|
|
||||||
|
cn_half_mainloop_ivybridge_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x0000);
|
||||||
|
cn_half_mainloop_ryzen_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1000);
|
||||||
|
cn_half_mainloop_bulldozer_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x2000);
|
||||||
|
cn_half_double_mainloop_sandybridge_asm = reinterpret_cast<cn_mainloop_double_fun> (base + 0x3000);
|
||||||
|
|
||||||
|
patchCode(cn_half_mainloop_ivybridge_asm, cnv2_mainloop_ivybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK);
|
||||||
|
patchCode(cn_half_mainloop_ryzen_asm, cnv2_mainloop_ryzen_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK);
|
||||||
|
patchCode(cn_half_mainloop_bulldozer_asm, cnv2_mainloop_bulldozer_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK);
|
||||||
|
patchCode(cn_half_double_mainloop_sandybridge_asm, cnv2_double_mainloop_sandybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK);
|
||||||
|
|
||||||
|
Mem::FlushInstructionCache(base, allocation_size);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
bool xmrig::CpuThread::isSoftAES(AlgoVariant av)
|
bool xmrig::CpuThread::isSoftAES(AlgoVariant av)
|
||||||
{
|
{
|
||||||
return av == AV_SINGLE_SOFT || av == AV_DOUBLE_SOFT || av > AV_PENTA;
|
return av == AV_SINGLE_SOFT || av == AV_DOUBLE_SOFT || av > AV_PENTA;
|
||||||
|
|
|
@ -60,6 +60,12 @@ public:
|
||||||
CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiway multiway, int64_t affinity, int priority, bool softAES, bool prefetch, Assembly assembly);
|
CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiway multiway, int64_t affinity, int priority, bool softAES, bool prefetch, Assembly assembly);
|
||||||
|
|
||||||
typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx);
|
typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx);
|
||||||
|
typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx);
|
||||||
|
typedef void (*cn_mainloop_double_fun)(cryptonight_ctx *ctx1, cryptonight_ctx *ctx2);
|
||||||
|
|
||||||
|
# ifndef XMRIG_NO_ASM
|
||||||
|
static void patchAsmVariants();
|
||||||
|
# endif
|
||||||
|
|
||||||
static bool isSoftAES(AlgoVariant av);
|
static bool isSoftAES(AlgoVariant av);
|
||||||
static cn_hash_fun fn(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly);
|
static cn_hash_fun fn(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly);
|
||||||
|
|
|
@ -168,6 +168,10 @@ void Workers::start(xmrig::Controller *controller)
|
||||||
LOG_NOTICE("--------------------------------------------------------------------------");
|
LOG_NOTICE("--------------------------------------------------------------------------");
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
|
# ifndef XMRIG_NO_ASM
|
||||||
|
xmrig::CpuThread::patchAsmVariants();
|
||||||
|
# endif
|
||||||
|
|
||||||
m_controller = controller;
|
m_controller = controller;
|
||||||
|
|
||||||
const std::vector<xmrig::IThread *> &threads = controller->config()->threads();
|
const std::vector<xmrig::IThread *> &threads = controller->config()->threads();
|
||||||
|
|
Loading…
Reference in a new issue